Commit 67ccb854 by sim-baz

Starting kmeans function

`Getting deciles for stations and indicators`
parent 563b636a
 ... @@ -24,7 +24,7 @@ def getMeanByDay(table, dateMin, dateMax): ... @@ -24,7 +24,7 @@ def getMeanByDay(table, dateMin, dateMax): # Verify the measures is for the period chosen by user and have a value # Verify the measures is for the period chosen by user and have a value if year >= dateMin and year < dateMax and r[len(r) - 1] != None: if year >= dateMin and year < dateMax and r[len(r) - 1] != None: # convert attributes to date format as string # convert attributes to date format as string date = str(r[0]) + "-" + str(r[1]) + "-" + str(r[2]) date = str(r[0]) + "-" + "0" * (2 - len(str(r[1]))) + str(r[1]) + "-" + "0" * (2 - len(str(r[2]))) + str(r[2]) if date not in table_date.keys(): if date not in table_date.keys(): table_date[date] = 0,0 table_date[date] = 0,0 table_date[date] = (table_date[date][0] + r[len(r) - 1], table_date[date][1] + 1) table_date[date] = (table_date[date][0] + r[len(r) - 1], table_date[date][1] + 1) ... @@ -52,17 +52,23 @@ def getMeanByMonth(table): ... @@ -52,17 +52,23 @@ def getMeanByMonth(table): return table_month return table_month # Verify the validity of the years given def verifyYearValidity(dateMin, dateMax): # Verification to ensure the validity of parameters, dates not equal if dateMin == dateMax: print(f"Les dates ne doivent pas être égales") return False # Verification to ensure the validity of parameters, dates in the right period if dateMin < MIN_DATE or dateMin > (MAX_DATE + 1) or dateMax < MIN_DATE or dateMax > (MAX_DATE + 1): print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}") return False return True def plotHistory(station, indicator, dateMin, dateMax): def plotHistory(station, indicator, dateMin, dateMax): # Accept only indicator with numeric values (not factors) # Accept only indicator with numeric values (not factors) if indicator in numeric_columns: if indicator in numeric_columns: # Verification to ensure the validity of parameters, dates not equal if not verifyYearValidity(dateMin, dateMax): if dateMin == dateMax: print(f"Les dates ne doivent pas être égales") return # Verification to ensure the validity of parameters, dates in the right period if dateMin < MIN_DATE or dateMin > (MAX_DATE + 1) or dateMax < MIN_DATE or dateMax > (MAX_DATE + 1): print(f"Les dates doivent être comprises entre {MIN_DATE} et {MAX_DATE}") return return # Get datas from cassandra table # Get datas from cassandra table ... @@ -71,12 +77,15 @@ def plotHistory(station, indicator, dateMin, dateMax): ... @@ -71,12 +77,15 @@ def plotHistory(station, indicator, dateMin, dateMax): # If no data for the period selected # If no data for the period selected if not table: if not table: print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator}") print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}") return return # Treat datas # Treat datas table_mean = getMeanByDay(table, dateMin, dateMax) table_mean = getMeanByDay(table, dateMin, dateMax) table_mean_by_month = getMeanByMonth(table) table_mean_by_month = getMeanByMonth(table) if not table_mean or not table_mean_by_month: print(f"Aucune donnée pour la station {station} et pour l'indicateur {indicator} et pour la période {dateMin} - {dateMax}") return # Duplicate list for each year in the period required # Duplicate list for each year in the period required liste = [] liste = [] ... @@ -105,7 +114,7 @@ def plotHistory(station, indicator, dateMin, dateMax): ... @@ -105,7 +114,7 @@ def plotHistory(station, indicator, dateMin, dateMax): graduation = ["20" + "0" * (2 - len(str(i))) + str(i) + "-01-01" for i in range(int(str(dateMin)[2:4]), int(str(dateMax)[2:4]), step_graduation)] graduation = ["20" + "0" * (2 - len(str(i))) + str(i) + "-01-01" for i in range(int(str(dateMin)[2:4]), int(str(dateMax)[2:4]), step_graduation)] # Add the last value of graduation for the last day of measures # Add the last value of graduation for the last day of measures graduation.append("20" + "0" * (2 - len(str(dateMax - 1)[2:4])) + str(dateMax - 1)[2:4] + "-12-31") graduation.append("20" + "0" * (2 - len(str(dateMax - 1)[2:4])) + str(dateMax - 1)[2:4] + "-12-31") # Plot, with both measures and season mean # Plot, with both measures and season mean fig, ax1 = plt.subplots() fig, ax1 = plt.subplots() # Measures on axis 1 # Measures on axis 1 ... @@ -140,5 +149,5 @@ if __name__ == '__main__': ... @@ -140,5 +149,5 @@ if __name__ == '__main__': session.set_keyspace("bazinsim_roisinos_metar") session.set_keyspace("bazinsim_roisinos_metar") print() print() plotHistory("EFKI", "tmpf", 2001, 2005) plotHistory("EFKI", "tmpf", 2001, 2004) print() print() \ No newline at end of file
kmeans.py 0 → 100644
 from cassandra.cluster import Cluster from datetime import datetime import loading as l import history as h def getDatasForPeriod(startPeriod, endPeriod, indicators): datas = [] for i in range(int(startPeriod[0:4]), int(endPeriod[0:4]) + 1): datas += session.execute(f"SELECT year, month, day, station, {indicators} FROM {l.table_name_date} where year = {i}") return datas def verifyDateInPeriod(startPeriod, endPeriod, year, month, day): date = datetime.strptime(year + "-" + month + "-" + day, "%Y-%m-%d") dateStart = datetime.strptime(startPeriod, "%Y-%m-%d") dateEnd = datetime.strptime(endPeriod, "%Y-%m-%d") if date < dateStart or date > dateEnd: return False return True def getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list): # map with station and list of maps # the list of maps is used for all indicators # the second map contains the indicator with the list of values for this indicator l = {} for t in table: if verifyDateInPeriod(startPeriod, endPeriod, str(t[0]), str(t[1]), str(t[2])): if t[3] not in l.keys(): l[t[3]] = [] for i in range(nb_indicators): if t[4 + i] != None: l[t[3]].append({indicators_list[i] : [float(t[4 + i])]}) else: for i in range(nb_indicators): if t[4 + i] != None: l[t[3]][i][indicators_list[i]].append(float(t[4 + i])) # Sort all lists of values for station in l.keys(): for i in range(nb_indicators): l[station][i][indicators_list[i]].sort() # Deciles is a map mapping station with a list of maps containing indicators and their deciles # example for 2 stations with 2 indicators # {'EFKI': [{'tmpf': [-23.8, 6.8, 17.6, 26.6, 32.0, 39.2, 44.6, 48.2, 53.6, 62.6, 91.4]}, {'dwpf': [-31.0, 5.0, 14.0, 24.8, 32.0, 35.6, 39.2, 42.8, 50.0, 55.4, 69.8]}], 'EFHA': [{'tmpf': [-23.8, 6.8, 17.6, 26.6, 32.0, 39.2, 44.6, 48.2, 53.6, 62.6, 91.4]}, {'dwpf': [-31.0, 5.0, 14.0, 24.8, 32.0, 35.6, 39.2, 42.8, 50.0, 55.4, 69.8]}]} deciles = {} for station in l.keys(): deciles[station] = [] for i in range(nb_indicators): deciles[station].append({indicators_list[i] : []}) # Compute deciles, from 0 to 10 (= includes min and max) for d in range(11): deciles[station][i][indicators_list[i]].append(l[t[3]][i][indicators_list[i]][len(l[t[3]][i][indicators_list[i]]) // 10 * d]) return deciles def kmeans(startPeriod, endPeriod, indicators_list): startDate = datetime.strptime(startPeriod, "%Y-%m-%d") endDate = datetime.strptime(endPeriod, "%Y-%m-%d") firstDate = datetime.strptime(l.FIRST_DAY, "%Y-%m-%d") lastDate = datetime.strptime(l.LAST_DAY, "%Y-%m-%d") if startDate < firstDate or startDate > lastDate or endDate < firstDate or endDate > lastDate: print(f"Les dates doivent être comprises entre {l.FIRST_DAY} et {l.LAST_DAY}") return if not h.verifyYearValidity(int(startPeriod[0:4]), int(endPeriod[0:4])): return # Create a string with indicators concatenated indicators = "" indicators_list_numeric = [] nb_indicators = 0 for ind in indicators_list: if ind in l.numeric_columns: if nb_indicators == 0: indicators += ind indicators_list_numeric.append(ind) nb_indicators += 1 else: indicators += "," + ind indicators_list_numeric.append(ind) nb_indicators += 1 # print(indicators, nb_indicators) table = getDatasForPeriod(startPeriod, endPeriod, indicators) table = list(table) # Get the map with all deciles for all stations and indicators table_decile = getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list_numeric) print (table_decile) if __name__ == '__main__': cluster = Cluster() session = cluster.connect() session.set_keyspace("bazinsim_roisinos_metar") print() kmeans("2001-01-01", "2010-12-31", ["tmpf", "dwpf", "skyc1"]) print()
 ... @@ -11,6 +11,9 @@ numeric_columns = ["lon","lat","tmpf","dwpf","relh","drct","sknt","p01i","alti", ... @@ -11,6 +11,9 @@ numeric_columns = ["lon","lat","tmpf","dwpf","relh","drct","sknt","p01i","alti", MIN_DATE = 2001 MIN_DATE = 2001 MAX_DATE = 2010 MAX_DATE = 2010 FIRST_DAY = "2001-01-01" LAST_DAY = "2010-12-31" # Country: Finland # Country: Finland # Dates : 2001 to 2010 # Dates : 2001 to 2010 ... ...
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!