Commit 86c8b86e authored by Oscar Roisin's avatar Oscar Roisin
Browse files

clean code + add map to kmeans

parent 24cbaffa
...@@ -14,145 +14,145 @@ MIN_DATE = loading.MIN_DATE ...@@ -14,145 +14,145 @@ MIN_DATE = loading.MIN_DATE
MAX_DATE = loading.MAX_DATE MAX_DATE = loading.MAX_DATE
def getHistory(station, indicator):
    """Fetch the dated values of one indicator for one station.

    Parameters:
        station: station identifier compared with the ``station`` column.
        indicator: name of the column selected next to year, month and day.

    Returns:
        Whatever ``session.execute`` returns; each row is
        (year, month, day, value of the indicator).
    """
    # Column and table names cannot be bound as parameters, so they stay
    # interpolated. The station value is bound by the driver instead of being
    # pasted into the query text, so a quote in it cannot alter the statement.
    query = f"SELECT year, month, day, {indicator} FROM {table_name_space} where station = %s"
    return session.execute(query, (station,))
def getMeanByDay(table, dateMin, dateMax):
    """Average the measures of each day inside a range of years.

    Parameters:
        table: iterable of rows whose first three items are year, month and
            day, and whose last item is the measure (or None when missing).
        dateMin: first year kept (inclusive).
        dateMax: first year rejected (exclusive).

    Returns:
        dict mapping "yyyy-mm-dd" strings to the mean of that day's measures,
        in order of first appearance. Rows without a value are ignored.
    """
    # Running (sum of measures, number of measures) for each day
    totals = {}
    for row in table:
        year, month, day = row[0], row[1], row[2]
        value = row[-1]
        # Keep only measures that have a value and fall in the chosen period
        if value is None or not dateMin <= year < dateMax:
            continue
        # Month and day are padded to two digits to get a date-like string
        date = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
        total, count = totals.get(date, (0, 0))
        totals[date] = (total + value, count + 1)
    # Turn the accumulated sums into means
    return {date: total / count for date, (total, count) in totals.items()}
def getMeanByMonth(table):
    """Average the measures of each calendar month over all years.

    Parameters:
        table: iterable of rows whose second item is the month and whose last
            item is the measure (or None when missing).

    Returns:
        dict mapping each month found in ``table`` to the mean of its
        measures, in order of first appearance. Rows without a value are
        ignored.
    """
    # Running (sum of measures, number of measures) for each month
    totals = {}
    for row in table:
        value = row[-1]
        # A measure needs a value to be counted
        if value is None:
            continue
        month = row[1]
        total, count = totals.get(month, (0, 0))
        totals[month] = (total + value, count + 1)
    # Turn the accumulated sums into means
    return {month: total / count for month, (total, count) in totals.items()}
def verifyYearValidity(dateMin, dateMax):
    """Check that two years are distinct and inside the supported period.

    Both years must lie between MIN_DATE and MAX_DATE + 1 (inclusive). A
    message is printed when a check fails.

    Parameters:
        dateMin: first year of the period.
        dateMax: last year of the period.

    Returns:
        True when both checks pass, False otherwise.
    """
    # The two years must differ
    if dateMin == dateMax:
        print("Les dates ne doivent pas être égales")
        return False
    # Both years must be in the right period
    upper_bound = MAX_DATE + 1
    if not (MIN_DATE <= dateMin <= upper_bound and MIN_DATE <= dateMax <= upper_bound):
        print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}")
        return False
    return True
def plotHistory(station, indicator, dateMin, dateMax):
    """Plot the history of an indicator for a station and save it as a PNG.

    The figure shows the daily means over the requested years and, on a
    second x axis, the per-month means repeated for every year of the period.
    The file is written to the current directory under a name built from the
    current date and time, the station and the indicator.

    Parameters:
        station: station identifier.
        indicator: column to plot; must be listed in ``numeric_columns``.
        dateMin: first year of the period (inclusive).
        dateMax: last year of the period (inclusive).

    Returns:
        None. A message is printed when the request is rejected, when there
        is no data, or when the figure has been saved.
    """
    # Accept only indicator with numeric values (not factors)
    if indicator not in numeric_columns:
        print("Les données pour cet indicateur ne sont pas numériques, impossible de tracer un graphique")
        return

    # Exclusive upper bound used for filtering and ranges; dateMax itself is
    # kept untouched so that messages show the period the caller asked for
    endYear = dateMax + 1
    if not verifyYearValidity(dateMin, endYear):
        return

    # Get datas from cassandra table
    table = list(getHistory(station, indicator))
    # If no data for the period selected
    if not table:
        print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}")
        return

    # Treat datas
    table_mean = getMeanByDay(table, dateMin, endYear)
    table_mean_by_month = getMeanByMonth(table)
    if not table_mean or not table_mean_by_month:
        print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}")
        return

    # Repeat the monthly means for each year of the period, labelled
    # "yyyy-month". The year comes from the outer loop, so the labels stay
    # right even when some months have no data.
    seasonal = [
        (f"{year}-{month}", mean)
        for year in range(dateMin, endYear)
        for month, mean in table_mean_by_month.items()
    ]

    # Name for file
    currentDateTime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = f"{currentDateTime}_{station}_{indicator}.png"

    # Configure graduation of plot
    # Need to find a nice step for graduation (no more than 6 values to be understandable)
    span = endYear - dateMin
    step_graduation = max(math.ceil(span / 5), 1)
    # Convert the graduation to the date format (yyyy-01-01)
    graduation = [f"{year}-01-01" for year in range(dateMin, endYear, step_graduation)]
    # Only add the end date when it will not overlap another tick label
    if span % 2 == 0 or step_graduation == 1:
        # Add the last value of graduation for the last day of measures
        graduation.append(f"{dateMax}-12-31")

    # Plot, with both measures and season mean
    fig, ax1 = plt.subplots()
    # Measures on axis 1
    ax1.plot_date(table_mean.keys(), table_mean.values(), '-', xdate = True)
    ax1.xaxis.set_ticks(graduation)
    ax2 = ax1.twiny()
    # Seasonal mean
    ax2.plot([label for label, _ in seasonal], [mean for _, mean in seasonal], '-', color = "r")
    # Do not show graduation on the top of the plot
    ax2.xaxis.set_ticks([])
    # Set title and labels
    plt.title(f"Evolution de {indicator} pour la station {station}")
    plt.xlabel('Date')
    plt.ylabel(indicator)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=True
    )
    # Save figure
    plt.savefig(file_name)
    # pyplot keeps every figure alive until it is closed explicitly
    plt.close(fig)
    print(f"Le graphique a été enregistré à {file_name}")
if __name__ == '__main__':
    cluster = Cluster()
    try:
        # getHistory reads the module-level name `session`, so it is bound here
        session = cluster.connect()
        session.set_keyspace("bazinsim_roisinos_metar")
        print()
        plotHistory("EFKI", "tmpf", 2001, 2004)
        print()
    finally:
        # Release the driver's connections even when plotting fails
        cluster.shutdown()
\ No newline at end of file \ No newline at end of file
...@@ -3,17 +3,19 @@ from datetime import datetime ...@@ -3,17 +3,19 @@ from datetime import datetime
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
import numpy as np import numpy as np
import folium
import loading as l import loading as l
import history as h import history as h
colours = ['blue', 'red', 'green', 'orange', 'pink', 'white', 'purple', 'gray']
def getDatasForPeriod(startPeriod, endPeriod, indicators):
    """Fetch the rows of every year touched by a period.

    Only the year part of the two dates is used here, so the result can hold
    days outside the exact period.

    Parameters:
        startPeriod: start date as a string beginning with a 4-digit year.
        endPeriod: end date as a string beginning with a 4-digit year.
        indicators: comma-separated column names appended to the selection
            (may be empty).

    Returns:
        list of rows (year, month, day, station, lat, lon, then one item per
        indicator), one query per year from the start year to the end year
        included.
    """
    columns = "year, month, day, station, lat, lon"
    # Avoid a dangling comma in the statement when no indicator is requested
    if indicators:
        columns += f", {indicators}"
    # Column and table names cannot be bound; the year is bound by the driver
    query = f"SELECT {columns} FROM {l.table_name_date} where year = %s"
    datas = []
    for year in range(int(startPeriod[0:4]), int(endPeriod[0:4]) + 1):
        datas.extend(session.execute(query, (year,)))
    return datas
def verifyDateInPeriod(startPeriod, endPeriod, year, month, day): def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
isDate = year.isdigit() and month.isdigit() and day.isdigit() isDate = year.isdigit() and month.isdigit() and day.isdigit()
...@@ -26,119 +28,131 @@ def verifyDateInPeriod(startPeriod, endPeriod, year, month, day): ...@@ -26,119 +28,131 @@ def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
return False return False
def getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list):
    """Compute, for each station, the deciles of each indicator over a period.

    Parameters:
        startPeriod: start of the period, passed to ``verifyDateInPeriod``.
        endPeriod: end of the period, passed to ``verifyDateInPeriod``.
        table: rows shaped (year, month, day, station, lat, lon, then one
            value per indicator); a value may be None.
        nb_indicators: number of indicator values carried by each row.
        indicators_list: indicator names, in the same order as the values.

    Returns:
        dict mapping each station to a list holding one single-key dict per
        indicator, whose value is the list of 11 deciles (minimum and maximum
        included). Example for 2 stations with 2 indicators:
        {'EFKI': [{'tmpf': [-23.8, ..., 91.4]}, {'dwpf': [-31.0, ..., 69.8]}],
         'EFHA': [{'tmpf': [-23.8, ..., 91.4]}, {'dwpf': [-31.0, ..., 69.8]}]}
        A station with no value at all for one of the indicators over the
        period is left out, since its deciles cannot be computed.
    """
    # Index of the first indicator value in a row (after year, month, day,
    # station, lat, lon)
    first_value_column = 6

    # station -> one list of values per indicator, at the indicator's index.
    # Every slot is created up front so that a missing value in the first row
    # of a station cannot shift the following indicators.
    values_by_station = {}
    for row in table:
        if not verifyDateInPeriod(startPeriod, endPeriod, str(row[0]), str(row[1]), str(row[2])):
            continue
        station = row[3]
        if station not in values_by_station:
            values_by_station[station] = [[] for _ in range(nb_indicators)]
        for i in range(nb_indicators):
            value = row[first_value_column + i]
            if value is not None:
                values_by_station[station][i].append(float(value))

    deciles = {}
    for station, series in values_by_station.items():
        # An empty series has no deciles: skip the station entirely
        if not all(series):
            continue
        station_deciles = []
        for name, values in zip(indicators_list, series):
            values.sort()
            size = len(values)
            # Deciles 0 to 9 are read every size // 10 values; decile 10 is
            # the maximum
            points = [values[size // 10 * d] for d in range(10)]
            points.append(values[-1])
            station_deciles.append({name: points})
        deciles[station] = station_deciles
    return deciles
def applyKmeans(deciles, nb_indicators, indicators_list, startPeriod, endPeriod, nb_clusters = 4):
    """Cluster the stations from the deciles of their indicators.

    Parameters:
        deciles: dict mapping each station to a list of single-key dicts
            {indicator: list of deciles}, one per indicator.
        nb_indicators: number of indicators stored for each station.
        indicators_list: indicator names, in the order used in ``deciles``.
        startPeriod: start of the period, only used in the printed message.
        endPeriod: end of the period, only used in the printed message.
        nb_clusters: number of clusters to build (4 by default). The module's
            ``colours`` list has 8 entries, so a larger value cannot be drawn
            by ``kmeans``.

    Returns:
        dict mapping each station to its cluster number as a plain int, or
        None (after printing a message) when there are fewer stations than
        clusters.
    """
    # Stations name, in the order used for the rows of the table
    stations_name = list(deciles)
    if len(stations_name) < nb_clusters:
        print(f"Le nombre de villes ayant des données est trop inférieur ({len(stations_name)}) pour appliquer les kmeans pour la période du {startPeriod} au {endPeriod}")
        return None

    # Flatten the maps: one row per station, the deciles of every indicator
    # placed one after the other
    table = [
        [value for i in range(nb_indicators) for value in deciles[station][i][indicators_list[i]]]
        for station in stations_name
    ]
    model = KMeans(n_clusters = nb_clusters, max_iter = 100).fit(table)
    # labels_ follows the order of the rows, hence of stations_name
    return {station: int(label) for station, label in zip(stations_name, model.labels_)}
def kmeans(startPeriod, endPeriod, indicators_list):
    """Cluster the stations over a period and save the result as an HTML map.

    Each station gets a marker whose colour is taken from ``colours`` at the
    index of its cluster. The map is written to the current directory as
    "<startPeriod> to <endPeriod>.html".

    Parameters:
        startPeriod: start date, formatted "yyyy-mm-dd".
        endPeriod: end date, formatted "yyyy-mm-dd".
        indicators_list: indicator names; those not listed in
            ``l.numeric_columns`` are ignored.

    Returns:
        None. A message is printed when the request is rejected, when no
        clustering could be computed, or when the map has been saved.

    Raises:
        ValueError: from ``datetime.strptime`` when a date is not formatted
            "yyyy-mm-dd".
    """
    date_format = "%Y-%m-%d"
    startDate = datetime.strptime(startPeriod, date_format)
    endDate = datetime.strptime(endPeriod, date_format)
    firstDate = datetime.strptime(l.FIRST_DAY, date_format)
    lastDate = datetime.strptime(l.LAST_DAY, date_format)
    if not (firstDate <= startDate <= lastDate and firstDate <= endDate <= lastDate):
        print(f"Les dates doivent être comprises entre {l.FIRST_DAY} et {l.LAST_DAY}")
        return
    # NOTE(review): only the years are compared here, so a period starting
    # and ending in the same year is presumably rejected - confirm intent
    if not h.verifyYearValidity(int(startPeriod[0:4]), int(endPeriod[0:4])):
        return

    # Keep the numeric indicators only
    indicators_list_numeric = [ind for ind in indicators_list if ind in l.numeric_columns]
    nb_indicators = len(indicators_list_numeric)
    if nb_indicators == 0:
        # Nothing to cluster on, and the query would have no indicator column
        print("Aucun indicateur numérique fourni, impossible d'appliquer les kmeans")
        return
    # Create a string with indicators concatenated
    indicators = ",".join(indicators_list_numeric)

    table = list(getDatasForPeriod(startPeriod, endPeriod, indicators))

    # Get coordinates: station -> (lat, lon), the last row of a station wins
    coord = {row[3]: (row[4], row[5]) for row in table}

    # Get the map with all deciles for all stations and indicators
    table_deciles = getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list_numeric)

    station_with_center = applyKmeans(table_deciles, nb_indicators, indicators_list_numeric, startPeriod, endPeriod)
    if station_with_center is None:
        print("Aucune clusterisation déterminée")
        return

    file_name = f"{startPeriod} to {endPeriod}.html"
    # Create map, with a fixed initial centre
    m = folium.Map(location=[64.2815, 27.6753])
    # Add Marker for each station, coloured by cluster
    for station, cluster_id in station_with_center.items():
        lat, lon = coord[station]
        folium.Marker([lat, lon], popup=f"<b>{station}</b>", icon=folium.Icon(color=colours[cluster_id])).add_to(m)
    # Save map
    m.save(file_name)
    print(f"La carte a été enregistrée à {file_name}")
if __name__ == '__main__':
    cluster = Cluster()
    try:
        # getDatasForPeriod reads the module-level name `session`, so it is
        # bound here
        session = cluster.connect()
        session.set_keyspace("bazinsim_roisinos_metar")
        print()
        # Other example: kmeans("2001-01-01", "2010-12-31", ["tmpf", "skyc1"])
        kmeans("2001-01-01", "2010-12-31", ["tmpf", "dwpf", "skyc1"])
        print()
    finally:
        # Release the driver's connections even when the clustering fails
        cluster.shutdown()
...@@ -51,345 +51,345 @@ LAST_DAY = "2010-12-31" ...@@ -51,345 +51,345 @@ LAST_DAY = "2010-12-31"
# metar: unprocessed reported observation in METAR format # metar: unprocessed reported observation in METAR format
def loadata(filename): def loadata(filename):
dateparser = re.compile( dateparser = re.compile(
"(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+) (?P<hour>\d+):(?P<minute>\d+)" "(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+) (?P<hour>\d+):(?P<minute>\d+)"
) )
with open(filename) as f: with open(filename) as f:
for r in csv.DictReader(f): for r in csv.DictReader(f):
data = {} data = {}
data["station"] = r["station"] data["station"] = r["station"]
valid = dateparser.match(r["valid"]).groupdict() valid = dateparser.match(r["valid"]).groupdict()
data["year"] = int(valid["year"]) data["year"] = int(valid["year"])
data["month"] = int(valid["month"]) data["month"] = int(valid["month"])
data["day"] = int(valid["day"]) data["day"] = int(valid["day"])
data["hour"] = int(valid["hour"]) data["hour"] = int(valid["hour"])
data["minute"] = int(valid["minute"]) data["minute"] = int(valid["minute"])
data["lon"] = float(r["lon"]) data["lon"] = float(r["lon"])
data["lat"] = float(r["lat"]) data["lat"] = float(r["lat"])
if r["tmpf"] == 'null': if r["tmpf"] == 'null':
data["tmpf"] = 'null' data["tmpf"] = 'null'
else: else:
data["tmpf"] = float(r["tmpf"]) data["tmpf"] = float(r["tmpf"])
if r["dwpf"] == 'null': if r["dwpf"] == 'null':
data["dwpf"] = 'null' data["dwpf"] = 'null'
else: else:
data["dwpf"] = float(r["dwpf"]) data["dwpf"] = float(r["dwpf"])
if r["relh"] == 'null': if r["relh"] == 'null':
data["relh"] = 'null' data["relh"] = 'null'
else: else:
data["relh"] = float(r["relh"]) data["relh"] = float(r["relh"])
if r["drct"] == 'null': if r["drct"] == 'null':