Commit 67ccb854 authored by sim-baz's avatar sim-baz

Starting kmeans function

Getting deciles for stations and indicators
parent 563b636a
...@@ -24,7 +24,7 @@ def getMeanByDay(table, dateMin, dateMax): ...@@ -24,7 +24,7 @@ def getMeanByDay(table, dateMin, dateMax):
# Verify the measures is for the period chosen by user and have a value # Verify the measures is for the period chosen by user and have a value
if year >= dateMin and year < dateMax and r[len(r) - 1] != None: if year >= dateMin and year < dateMax and r[len(r) - 1] != None:
# convert attributes to date format as string # convert attributes to date format as string
date = str(r[0]) + "-" + str(r[1]) + "-" + str(r[2]) date = str(r[0]) + "-" + "0" * (2 - len(str(r[1]))) + str(r[1]) + "-" + "0" * (2 - len(str(r[2]))) + str(r[2])
if date not in table_date.keys(): if date not in table_date.keys():
table_date[date] = 0,0 table_date[date] = 0,0
table_date[date] = (table_date[date][0] + r[len(r) - 1], table_date[date][1] + 1) table_date[date] = (table_date[date][0] + r[len(r) - 1], table_date[date][1] + 1)
...@@ -52,17 +52,23 @@ def getMeanByMonth(table): ...@@ -52,17 +52,23 @@ def getMeanByMonth(table):
return table_month return table_month
# Verify the validity of the years given
def verifyYearValidity(dateMin, dateMax):
    """Check that two years describe an acceptable period.

    A message (in French) is printed and False is returned when the two
    years are equal, or when either of them is below MIN_DATE or above
    MAX_DATE + 1. Otherwise True is returned.
    """
    # Equal bounds are refused before any range check is attempted
    if dateMin == dateMax:
        print(f"Les dates ne doivent pas être égales")
        return False

    # Both bounds must sit between MIN_DATE and MAX_DATE + 1 (inclusive)
    highest_allowed = MAX_DATE + 1
    for year in (dateMin, dateMax):
        if year < MIN_DATE or year > highest_allowed:
            print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}")
            return False

    return True
def plotHistory(station, indicator, dateMin, dateMax): def plotHistory(station, indicator, dateMin, dateMax):
# Accept only indicator with numeric values (not factors) # Accept only indicator with numeric values (not factors)
if indicator in numeric_columns: if indicator in numeric_columns:
# Verification to ensure the validity of parameters, dates not equal if not verifyYearValidity(dateMin, dateMax):
if dateMin == dateMax:
print(f"Les dates ne doivent pas être égales")
return
# Verification to ensure the validity of parameters, dates in the right period
if dateMin < MIN_DATE or dateMin > (MAX_DATE + 1) or dateMax < MIN_DATE or dateMax > (MAX_DATE + 1):
print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}")
return return
# Get datas from cassandra table # Get datas from cassandra table
...@@ -71,12 +77,15 @@ def plotHistory(station, indicator, dateMin, dateMax): ...@@ -71,12 +77,15 @@ def plotHistory(station, indicator, dateMin, dateMax):
# If no data for the period selected # If no data for the period selected
if not table: if not table:
print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator}") print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}")
return return
# Treat datas # Treat datas
table_mean = getMeanByDay(table, dateMin, dateMax) table_mean = getMeanByDay(table, dateMin, dateMax)
table_mean_by_month = getMeanByMonth(table) table_mean_by_month = getMeanByMonth(table)
if not table_mean or not table_mean_by_month:
print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}")
return
# Duplicate list for each year in the period required # Duplicate list for each year in the period required
liste = [] liste = []
...@@ -105,7 +114,7 @@ def plotHistory(station, indicator, dateMin, dateMax): ...@@ -105,7 +114,7 @@ def plotHistory(station, indicator, dateMin, dateMax):
graduation = ["20" + "0" * (2 - len(str(i))) + str(i) + "-01-01" for i in range(int(str(dateMin)[2:4]), int(str(dateMax)[2:4]), step_graduation)] graduation = ["20" + "0" * (2 - len(str(i))) + str(i) + "-01-01" for i in range(int(str(dateMin)[2:4]), int(str(dateMax)[2:4]), step_graduation)]
# Add the last value of graduation for the last day of measures # Add the last value of graduation for the last day of measures
graduation.append("20" + "0" * (2 - len(str(dateMax - 1)[2:4])) + str(dateMax - 1)[2:4] + "-12-31") graduation.append("20" + "0" * (2 - len(str(dateMax - 1)[2:4])) + str(dateMax - 1)[2:4] + "-12-31")
# Plot, with both measures and season mean # Plot, with both measures and season mean
fig, ax1 = plt.subplots() fig, ax1 = plt.subplots()
# Measures on axis 1 # Measures on axis 1
...@@ -140,5 +149,5 @@ if __name__ == '__main__': ...@@ -140,5 +149,5 @@ if __name__ == '__main__':
session.set_keyspace("bazinsim_roisinos_metar") session.set_keyspace("bazinsim_roisinos_metar")
print() print()
plotHistory("EFKI", "tmpf", 2001, 2005) plotHistory("EFKI", "tmpf", 2001, 2004)
print() print()
\ No newline at end of file
from cassandra.cluster import Cluster
from datetime import datetime
import loading as l
import history as h
def getDatasForPeriod(startPeriod, endPeriod, indicators):
    """Fetch the rows of every year touched by the period.

    startPeriod and endPeriod are strings whose first four characters are
    the year; indicators is the comma-separated column list to select.
    One query is run per year (both end years included) through the
    module-level `session`, and all returned rows are gathered in one list.
    """
    first_year = int(startPeriod[0:4])
    last_year = int(endPeriod[0:4])

    rows = []
    for year in range(first_year, last_year + 1):
        query = f"SELECT year, month, day, station, {indicators} FROM {l.table_name_date} where year = {year}"
        rows.extend(session.execute(query))
    return rows
def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
    """Tell whether a given day falls inside a period, bounds included.

    startPeriod and endPeriod are "YYYY-MM-DD" strings; year, month and
    day are strings that are joined with dashes and parsed the same way.
    Returns True when start <= date <= end, False otherwise.
    """
    candidate = datetime.strptime("-".join((year, month, day)), "%Y-%m-%d")
    lower = datetime.strptime(startPeriod, "%Y-%m-%d")
    upper = datetime.strptime(endPeriod, "%Y-%m-%d")
    return not (candidate < lower or candidate > upper)
def getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list):
    """Compute, per station and per indicator, the deciles of the measures.

    Parameters:
        startPeriod, endPeriod: "YYYY-MM-DD" strings, both bounds included.
        table: iterable of rows indexed as
            (year, month, day, station, indicator_0, indicator_1, ...).
        nb_indicators: number of indicator columns following the station.
        indicators_list: indicator names, in the same order as the columns.

    Returns:
        A dict mapping each station to a list with one single-key dict per
        indicator: {indicator_name: [d0, d1, ..., d10]}, where d0 is the
        minimum and d10 the maximum of the values kept for that station.
        An indicator without any non-None value gets an empty list.
        Example shape for one station and two indicators:
        {'EFKI': [{'tmpf': [...11 values...]}, {'dwpf': [...11 values...]}]}

    Raises:
        ValueError: if a period bound or a row date cannot be parsed.
    """
    # The period bounds do not change from one row to the next: parse once
    period_start = datetime.strptime(startPeriod, "%Y-%m-%d")
    period_end = datetime.strptime(endPeriod, "%Y-%m-%d")

    # station -> one list of values per indicator (position i <-> indicators_list[i])
    values_by_station = {}
    for row in table:
        row_date = datetime.strptime(f"{row[0]}-{row[1]}-{row[2]}", "%Y-%m-%d")
        if row_date < period_start or row_date > period_end:
            continue
        station = row[3]
        if station not in values_by_station:
            # Always create every indicator slot, even when the first row of
            # the station has no value for it, so positions stay aligned
            values_by_station[station] = [[] for _ in range(nb_indicators)]
        station_values = values_by_station[station]
        for i in range(nb_indicators):
            value = row[4 + i]
            if value is not None:
                station_values[i].append(float(value))

    deciles = {}
    for station, station_values in values_by_station.items():
        deciles[station] = []
        for i in range(nb_indicators):
            ordered = sorted(station_values[i])
            last = len(ordered) - 1
            # Deciles from 0 to 10 (= includes min and max); the index is
            # scaled on the last valid position so that d = 10 never
            # overruns the list and always lands on the maximum
            station_deciles = [ordered[last * d // 10] for d in range(11)] if ordered else []
            deciles[station].append({indicators_list[i]: station_deciles})
    return deciles
def kmeans(startPeriod, endPeriod, indicators_list):
    """Prepare the per-station decile table for a period and print it.

    At this stage no clustering is performed: the function validates the
    period, keeps the numeric indicators, loads the rows and prints the
    deciles computed by getDecileForAllStations.

    Parameters:
        startPeriod, endPeriod: "YYYY-MM-DD" strings.
        indicators_list: indicator names; those not listed in
            l.numeric_columns are ignored.

    Returns:
        None. Validation failures print a message (in French) and return.

    Raises:
        ValueError: if a period bound is not a valid "YYYY-MM-DD" date.
    """
    startDate = datetime.strptime(startPeriod, "%Y-%m-%d")
    endDate = datetime.strptime(endPeriod, "%Y-%m-%d")
    firstDate = datetime.strptime(l.FIRST_DAY, "%Y-%m-%d")
    lastDate = datetime.strptime(l.LAST_DAY, "%Y-%m-%d")

    # Both bounds must lie inside the days covered by the dataset
    if startDate < firstDate or startDate > lastDate or endDate < firstDate or endDate > lastDate:
        print(f"Les dates doivent être comprises entre {l.FIRST_DAY} et {l.LAST_DAY}")
        return

    # NOTE(review): if h.verifyYearValidity refuses equal years, a period
    # contained in a single calendar year is rejected here - confirm intended
    if not h.verifyYearValidity(int(startPeriod[0:4]), int(endPeriod[0:4])):
        return

    # Keep only numeric indicators, preserving the order given by the caller
    indicators_list_numeric = [ind for ind in indicators_list if ind in l.numeric_columns]
    nb_indicators = len(indicators_list_numeric)
    if nb_indicators == 0:
        # Without any column the SELECT would be malformed ("station,  FROM")
        print("Aucun indicateur numérique parmi les indicateurs demandés")
        return

    # Comma-separated column list used in the SELECT clause
    indicators = ",".join(indicators_list_numeric)

    table = list(getDatasForPeriod(startPeriod, endPeriod, indicators))

    # Get the map with all deciles for all stations and indicators
    table_decile = getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list_numeric)
    print(table_decile)
if __name__ == "__main__":
    # `session` has to remain a module-level name: getDatasForPeriod reads
    # it as a global when it runs its queries
    cluster = Cluster()
    session = cluster.connect()
    session.set_keyspace("bazinsim_roisinos_metar")

    # Sample run, framed by blank lines on the console
    print()
    kmeans("2001-01-01", "2010-12-31", ["tmpf", "dwpf", "skyc1"])
    print()
...@@ -11,6 +11,9 @@ numeric_columns = ["lon","lat","tmpf","dwpf","relh","drct","sknt","p01i","alti", ...@@ -11,6 +11,9 @@ numeric_columns = ["lon","lat","tmpf","dwpf","relh","drct","sknt","p01i","alti",
MIN_DATE = 2001 MIN_DATE = 2001
MAX_DATE = 2010 MAX_DATE = 2010
# First and last day of the period, written as "YYYY-MM-DD" strings
FIRST_DAY = "2001-01-01"
LAST_DAY = "2010-12-31"
# Country: Finland # Country: Finland
# Dates : 2001 to 2010 # Dates : 2001 to 2010
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment