Commit 86c8b86e authored by Oscar Roisin's avatar Oscar Roisin
Browse files

clean code + add map to kmeans

parent 24cbaffa
......@@ -14,145 +14,145 @@ MIN_DATE = loading.MIN_DATE
MAX_DATE = loading.MAX_DATE
def getHistory(station, indicator):
    """Fetch every stored measurement of one indicator for one station.

    Args:
        station: Station identifier compared with the ``station`` column.
        indicator: Name of the column selected next to year, month and day.

    Returns:
        The result of ``session.execute``; callers iterate over it as rows
        laid out as (year, month, day, <indicator>).
    """
    # Column and table names cannot be bound as query parameters, so they
    # stay interpolated. The station value is bound by the driver instead of
    # being pasted between quotes.
    # NOTE(review): assumes `session` is a cassandra-driver Session, whose
    # execute() accepts %s placeholders with a parameter sequence.
    query = f"SELECT year, month, day, {indicator} FROM {table_name_space} where station = %s"
    return session.execute(query, (station,))
def getMeanByDay(table, dateMin, dateMax):
    """Average the measurements of each day inside a range of years.

    Args:
        table: Iterable of rows laid out as (year, month, day, ..., value);
            the measurement is read from the last position of each row.
        dateMin: First year kept (inclusive).
        dateMax: Year at which the range stops (exclusive).

    Returns:
        A dict mapping "YYYY-MM-DD" strings to the mean of the measurements
        recorded on that day, in order of first appearance. Rows whose
        measurement is None or whose year is outside the range are ignored.
    """
    # date -> (sum of measurements, number of measurements)
    totals = {}
    for row in table:
        year, month, day, value = row[0], row[1], row[2], row[-1]
        if not dateMin <= year < dateMax or value is None:
            continue
        # Month and day are zero-padded to two characters
        date = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
        total, count = totals.get(date, (0, 0))
        totals[date] = (total + value, count + 1)

    return {date: total / count for date, (total, count) in totals.items()}
def getMeanByMonth(table):
    """Average the measurements per calendar month, all years together.

    Args:
        table: Iterable of rows laid out as (year, month, ..., value); the
            month is read from position 1 and the measurement from the last
            position of each row.

    Returns:
        A dict mapping each month found in the rows to the mean of its
        measurements, in order of first appearance. Rows whose measurement
        is None are ignored.
    """
    # month -> (sum of measurements, number of measurements)
    totals = {}
    for row in table:
        value = row[-1]
        if value is None:
            continue
        month = row[1]
        total, count = totals.get(month, (0, 0))
        totals[month] = (total + value, count + 1)

    return {month: total / count for month, (total, count) in totals.items()}
def verifyYearValidity(dateMin, dateMax):
    """Check that two years describe a usable period.

    A message is printed and False is returned when the two years are equal
    or when either one lies outside MIN_DATE .. MAX_DATE + 1. The extra year
    on the upper bound lets plotHistory pass an end year it has already
    incremented by one.

    Args:
        dateMin: Year starting the period.
        dateMax: Year ending the period.

    Returns:
        True when both checks pass, False otherwise.
    """
    # The two years must differ
    if dateMin == dateMax:
        print("Les dates ne doivent pas être égales")
        return False

    # Both years must fall inside the period covered by the data
    lowest, highest = MIN_DATE, MAX_DATE + 1
    if not (lowest <= dateMin <= highest and lowest <= dateMax <= highest):
        print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}")
        return False

    return True
def plotHistory(station, indicator, dateMin, dateMax):
    """Plot the daily mean of an indicator for a station and save it as PNG.

    The figure holds two curves: the daily means over the requested years,
    and the per-month means (computed over every row returned for the
    station) repeated for each requested year. The file is written with a
    relative name built from the current time, the station and the indicator.

    Args:
        station: Station identifier passed to getHistory.
        indicator: Column to plot; must belong to ``numeric_columns``.
        dateMin: First year of the period (inclusive).
        dateMax: Last year of the period (inclusive).

    Returns:
        None. Problems are reported with a printed message.
    """
    # Accept only indicators with numeric values (not factors)
    if indicator not in numeric_columns:
        print("Les données pour cet indicateur ne sont pas numériques, impossible de tracer un graphique")
        return

    # Everything below works with an exclusive end year; dateMax itself is
    # kept untouched so messages show the period the caller asked for.
    yearEnd = dateMax + 1
    if not verifyYearValidity(dateMin, yearEnd):
        return

    no_data = f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}"

    # Get data from the cassandra table
    table = list(getHistory(station, indicator))
    if not table:
        print(no_data)
        return

    # Aggregate the data
    table_mean = getMeanByDay(table, dateMin, yearEnd)
    table_mean_by_month = getMeanByMonth(table)
    if not table_mean or not table_mean_by_month:
        print(no_data)
        return

    # Repeat the monthly means for every requested year, labelled
    # "<year>-<month>". The year comes from the outer loop, so the label is
    # right even when some months have no data, and months are sorted so the
    # curve runs in calendar order.
    liste = [
        [f"{year}-{month}", value]
        for year in range(dateMin, yearEnd)
        for month, value in sorted(table_mean_by_month.items())
    ]

    # Name for file
    currentDateTime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = str(currentDateTime) + "_" + station + "_" + indicator + ".png"

    # Configure the ticks of the plot: pick a step that yields no more than
    # about 6 labels so the axis stays readable
    step_graduation = max(math.ceil((yearEnd - dateMin) / 5), 1)
    # One tick on January 1st of every step_graduation-th year
    graduation = [f"{year}-01-01" for year in range(dateMin, yearEnd, step_graduation)]
    # Only add the closing tick when it will not overlap another label
    if (yearEnd - dateMin) % 2 == 0 or step_graduation == 1:
        # Last tick: the last day of the last requested year
        graduation.append(f"{yearEnd - 1}-12-31")

    # Plot, with both measures and seasonal mean
    _, ax1 = plt.subplots()
    # Measures on axis 1
    ax1.plot_date(list(table_mean.keys()), list(table_mean.values()), '-', xdate = True)
    ax1.xaxis.set_ticks(graduation)
    ax2 = ax1.twiny()
    # Seasonal mean
    ax2.plot([elt[0] for elt in liste], [elt[1] for elt in liste], '-', color = "r")
    # Do not show ticks on the top of the plot
    ax2.xaxis.set_ticks([])

    # Set title and labels
    plt.title(f"Evolution de {indicator} pour la station {station}")
    plt.xlabel('Date')
    plt.ylabel(indicator)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=True
    )

    # Save figure
    plt.savefig(file_name)
    print(f"Le graphique a été enregistré à {file_name}")
if __name__ == '__main__':
    # Connect to the cluster; `session` is read as a module global by
    # getHistory.
    cluster = Cluster()
    session = cluster.connect()
    session.set_keyspace("bazinsim_roisinos_metar")

    try:
        print()
        plotHistory("EFKI", "tmpf", 2001, 2004)
        print()
    finally:
        # Release the driver's connections even if plotting fails
        cluster.shutdown()
......@@ -3,17 +3,19 @@ from datetime import datetime
from sklearn.cluster import KMeans
import numpy as np
import folium
import loading as l
import history as h
colours = ['blue', 'red', 'green', 'orange', 'pink', 'white', 'purple', 'gray']
def getDatasForPeriod(startPeriod, endPeriod, indicators):
    """Load the rows of every year touched by a period.

    One query is sent per year, from the year of ``startPeriod`` to the year
    of ``endPeriod`` included; filtering on the exact days is left to the
    caller.

    Args:
        startPeriod: Start date as a string whose first four characters are
            the year.
        endPeriod: End date as a string whose first four characters are the
            year.
        indicators: Comma-separated column names appended to the SELECT list.

    Returns:
        A list of rows laid out as
        (year, month, day, station, lat, lon, <indicators...>).
    """
    first_year = int(startPeriod[0:4])
    last_year = int(endPeriod[0:4])

    datas = []
    for year in range(first_year, last_year + 1):
        datas.extend(session.execute(f"SELECT year, month, day, station, lat, lon, {indicators} FROM {l.table_name_date} where year = {year}"))
    return datas
def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
isDate = year.isdigit() and month.isdigit() and day.isdigit()
......@@ -26,119 +28,131 @@ def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
return False
def getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list):
    """Compute the deciles of every indicator for every station.

    Args:
        startPeriod: Start of the period, forwarded to verifyDateInPeriod.
        endPeriod: End of the period, forwarded to verifyDateInPeriod.
        table: Rows laid out as
            (year, month, day, station, lat, lon, <indicators...>).
        nb_indicators: Number of indicator columns at the end of each row.
        indicators_list: Indicator names, in the same order as the columns.

    Returns:
        A dict mapping each station to a list holding one single-key dict per
        indicator: {indicator: [11 values]}, i.e. deciles 0 to 10 (minimum
        and maximum included). Example for one station and two indicators:
        {'EFKI': [{'tmpf': [-23.8, ..., 91.4]}, {'dwpf': [-31.0, ..., 69.8]}]}
        A station that has no value at all for one of the indicators is left
        out, because no decile can be computed for it.
    """
    # Position of the first indicator, right after year, month, day,
    # station, lat and lon
    first_indicator = 6

    # station -> one list of values per indicator. Every list is created as
    # soon as the station is seen, so position i always belongs to
    # indicators_list[i] even when the first row of a station has a missing
    # value.
    values_by_station = {}
    for row in table:
        if not verifyDateInPeriod(startPeriod, endPeriod, str(row[0]), str(row[1]), str(row[2])):
            continue
        station = row[3]
        if station not in values_by_station:
            values_by_station[station] = [[] for _ in range(nb_indicators)]
        for i in range(nb_indicators):
            value = row[first_indicator + i]
            if value is not None:
                values_by_station[station][i].append(float(value))

    deciles = {}
    for station, station_values in values_by_station.items():
        # Skip stations lacking data for at least one indicator
        if not all(station_values):
            continue
        station_deciles = []
        for name, values in zip(indicators_list, station_values):
            values.sort()
            step = len(values) // 10
            # Deciles 0 to 9 are read every `step` values, decile 10 is the
            # maximum
            points = [values[step * d] for d in range(10)]
            points.append(values[-1])
            station_deciles.append({name: points})
        deciles[station] = station_deciles
    return deciles
def applyKmeans(deciles, nb_indicators, indicators_list, startPeriod, endPeriod):
    """Cluster the stations from their deciles with k-means.

    Args:
        deciles: Mapping station -> list of {indicator: [deciles]} dicts, as
            built by getDecileForAllStations.
        nb_indicators: Number of indicators stored for each station.
        indicators_list: Indicator names, in the order used in ``deciles``.
        startPeriod: Start of the period, only used in the printed message.
        endPeriod: End of the period, only used in the printed message.

    Returns:
        A dict mapping each station to the label of its cluster, or None when
        there are fewer stations than clusters.
    """
    nb_clusters = 4

    # One flat feature vector per station: the deciles of every indicator,
    # concatenated in the order of indicators_list
    stations_name = []
    table = []
    for station, station_deciles in deciles.items():
        features = []
        for i in range(nb_indicators):
            features += station_deciles[i][indicators_list[i]]
        stations_name.append(station)
        table.append(features)

    if len(stations_name) < nb_clusters:
        print(f"Le nombre de villes ayant des données est trop inférieur ({len(stations_name)}) pour appliquer les kmeans pour la période du {startPeriod} au {endPeriod}")
        return None

    model = KMeans(n_clusters = nb_clusters, max_iter = 100).fit(table)

    # labels_ follows the order of the fitted rows, i.e. of stations_name
    return dict(zip(stations_name, model.labels_))
def kmeans(startPeriod, endPeriod, indicators_list):
    """Cluster the stations over a period and save the result as a map.

    Indicators that are not listed in ``l.numeric_columns`` are ignored. When
    a clustering is found, an HTML map with one coloured marker per station
    is written to "<startPeriod> to <endPeriod>.html".

    Args:
        startPeriod: Start date formatted as "YYYY-MM-DD".
        endPeriod: End date formatted as "YYYY-MM-DD".
        indicators_list: Names of the indicators to cluster on.

    Returns:
        None. Problems are reported with a printed message.
    """
    date_format = "%Y-%m-%d"
    startDate = datetime.strptime(startPeriod, date_format)
    endDate = datetime.strptime(endPeriod, date_format)
    firstDate = datetime.strptime(l.FIRST_DAY, date_format)
    lastDate = datetime.strptime(l.LAST_DAY, date_format)

    if not (firstDate <= startDate <= lastDate and firstDate <= endDate <= lastDate):
        print(f"Les dates doivent être comprises entre {l.FIRST_DAY} et {l.LAST_DAY}")
        return

    # NOTE(review): this rejects a period that starts and ends in the same
    # year, since the helper refuses equal years - confirm this is intended.
    if not h.verifyYearValidity(int(startPeriod[0:4]), int(endPeriod[0:4])):
        return

    # Keep the numeric indicators only
    indicators_list_numeric = [ind for ind in indicators_list if ind in l.numeric_columns]
    nb_indicators = len(indicators_list_numeric)
    if nb_indicators == 0:
        # Without any column to select, the query built below would be invalid
        print("Aucun indicateur numérique fourni, impossible d'appliquer les kmeans")
        return
    indicators = ",".join(indicators_list_numeric)

    table = list(getDatasForPeriod(startPeriod, endPeriod, indicators))

    # Coordinates of each station: rows hold lat and lon right after the
    # station; the last row seen for a station wins
    coord = {row[3]: (row[4], row[5]) for row in table}

    # Get the map with all deciles for all stations and indicators
    table_deciles = getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list_numeric)

    station_with_center = applyKmeans(table_deciles, nb_indicators, indicators_list_numeric, startPeriod, endPeriod)
    if station_with_center is None:
        print("Aucune clusterisation déterminée")
        return

    file_name = f"{startPeriod} to {endPeriod}.html"

    # Create map
    m = folium.Map(location=[64.2815, 27.6753])

    # Add a marker for each station, coloured by cluster
    for station, cluster_label in station_with_center.items():
        lat, lon = coord[station]
        folium.Marker([lat, lon], popup=f"<b>{station}</b>", icon=folium.Icon(color=colours[cluster_label])).add_to(m)

    # Save map
    m.save(file_name)
    print(f"La carte a été enregistrée à {file_name}")
if __name__ == '__main__':
    # Connect to the cluster; `session` is read as a module global by
    # getDatasForPeriod.
    cluster = Cluster()
    session = cluster.connect()
    session.set_keyspace("bazinsim_roisinos_metar")

    try:
        print()
        kmeans("2001-01-01", "2010-12-31", ["tmpf", "dwpf", "skyc1"])
        print()
    finally:
        # Release the driver's connections even if the clustering fails
        cluster.shutdown()
......@@ -51,345 +51,345 @@ LAST_DAY = "2010-12-31"
# metar: unprocessed reported observation in METAR format
def loadata(filename):
dateparser = re.compile(
"(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+) (?P<hour>\d+):(?P<minute>\d+)"
)
with open(filename) as f:
for r in csv.DictReader(f):
data = {}
data["station"] = r["station"]
valid = dateparser.match(r["valid"]).groupdict()
data["year"] = int(valid["year"])
data["month"] = int(valid["month"])
data["day"] = int(valid["day"])
data["hour"] = int(valid["hour"])
data["minute"] = int(valid["minute"])
data["lon"] = float(r["lon"])
data["lat"] = float(r["lat"])
if r["tmpf"] == 'null':
data["tmpf"] = 'null'
else:
data["tmpf"] = float(r["tmpf"])
if r["dwpf"] == 'null':
data["dwpf"] = 'null'
else:
data["dwpf"] = float(r["dwpf"])
if r["relh"] == 'null':
data["relh"] = 'null'
else:
data["relh"] = float(r["relh"])
if r["drct"] == 'null':
data["drct"] = 'null'
else:
data["drct"] = float(r["drct"])
if r["sknt"] == 'null':
data["sknt"] = 'null'
else:
data["sknt"] = float(r["sknt"])
if r["p01i"] == 'null':
data["p01i"] = 'null'
else:
data["p01i"] = float(r["p01i"])