# kmeans.py
# K-means clustering of weather stations from per-station indicator deciles,
# rendered as a folium map.
from datetime import datetime

from cassandra.cluster import Cluster
import folium
import numpy as np
from sklearn.cluster import KMeans

import history as h
import loading as l

# Marker colours for the map; the list is indexed by the cluster label assigned
# to a station, so it must hold at least as many entries as there are clusters.
colours = ['blue', 'red', 'green', 'orange', 'pink', 'white', 'purple', 'gray']

def getDatasForPeriod(startPeriod, endPeriod, indicators):
    """Fetch the rows of every calendar year touched by the period.

    One query is issued per year, from the year of ``startPeriod`` to the year
    of ``endPeriod`` (both inclusive). Rows are only filtered by year here;
    any month/day filtering is left to the caller.

    Args:
        startPeriod: Period start; only its first four characters (the year)
            are used.
        endPeriod: Period end; only its first four characters (the year) are
            used.
        indicators: Comma-separated column names appended to the SELECT list.
            The text is interpolated verbatim into the query, so it must only
            contain trusted column names.

    Returns:
        A list of result rows whose columns are, in order: year, month, day,
        station, lat, lon, followed by the requested indicator columns.
    """
    first_year = int(startPeriod[0:4])
    last_year = int(endPeriod[0:4])

    datas = []
    for year in range(first_year, last_year + 1):
        # `session` is read as a module-level global; it has to be created
        # (see the script entry point) before this function is called.
        datas.extend(session.execute(f"SELECT year, month, day, station, lat, lon, {indicators} FROM {l.table_name_date} where year = {year}"))

    return datas

def verifyDateInPeriod(startPeriod, endPeriod, year, month, day):
    """Tell whether the date ``year-month-day`` lies inside the period.

    Args:
        startPeriod: First day of the period, formatted ``YYYY-MM-DD``.
        endPeriod: Last day of the period, formatted ``YYYY-MM-DD``.
        year: Year of the date to test, as a string.
        month: Month of the date to test, as a string.
        day: Day of the date to test, as a string.

    Returns:
        True when the three components form a real calendar date that falls
        between ``startPeriod`` and ``endPeriod`` (both bounds included);
        False otherwise, including when a component is not made of digits or
        when the components do not form an existing date (e.g. February 30).

    Raises:
        ValueError: If ``startPeriod`` or ``endPeriod`` is not a valid
            ``YYYY-MM-DD`` date (only checked once the tested date is valid).
    """
    if not (year.isdigit() and month.isdigit() and day.isdigit()):
        return False

    try:
        date = datetime.strptime(year + "-" + month + "-" + day, "%Y-%m-%d")
    except ValueError:
        # Digits only, but not an existing calendar date: it cannot be in
        # any period, so report "not in period" instead of crashing.
        return False

    dateStart = datetime.strptime(startPeriod, "%Y-%m-%d")
    dateEnd = datetime.strptime(endPeriod, "%Y-%m-%d")
    return dateStart <= date <= dateEnd

def getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list):
    """Compute, per station and per indicator, the deciles of the values in the period.

    Each row of ``table`` is read positionally: indices 0, 1 and 2 are the
    year, month and day, index 3 is the station identifier, and index
    ``6 + i`` is the value of the i-th indicator. Rows whose date is outside
    the period are ignored, as are ``None`` indicator values.

    Args:
        startPeriod: First day of the period, formatted ``YYYY-MM-DD``.
        endPeriod: Last day of the period, formatted ``YYYY-MM-DD``.
        table: Iterable of rows laid out as described above.
        nb_indicators: Number of indicator columns to read from each row.
        indicators_list: Indicator names, positionally matching the indicator
            columns; at least ``nb_indicators`` entries.

    Returns:
        A dict mapping each station to a list with one single-key dict per
        indicator; the key is the indicator name and the value is the list of
        its 11 deciles (decile 0 is the minimum, decile 10 the maximum).
        Example for 2 stations with 2 indicators:
        {'EFKI': [{'tmpf': [-23.8, 6.8, 17.6, 26.6, 32.0, 39.2, 44.6, 48.2, 53.6, 62.6, 91.4]}, {'dwpf': [-31.0, 5.0, 14.0, 24.8, 32.0, 35.6, 39.2, 42.8, 50.0, 55.4, 69.8]}], 'EFHA': [{'tmpf': [-23.8, 6.8, 17.6, 26.6, 32.0, 39.2, 44.6, 48.2, 53.6, 62.6, 91.4]}, {'dwpf': [-31.0, 5.0, 14.0, 24.8, 32.0, 35.6, 39.2, 42.8, 50.0, 55.4, 69.8]}]}
        A station that has no value at all for one of the indicators is left
        out, because no decile can be computed for it.
    """
    # station -> one list of values per indicator. The lists are created for
    # every indicator up front so that position i always means indicator i,
    # even when the first row seen for a station has a None value.
    values_by_station = {}
    for row in table:
        if not verifyDateInPeriod(startPeriod, endPeriod, str(row[0]), str(row[1]), str(row[2])):
            continue
        station = row[3]
        if station not in values_by_station:
            values_by_station[station] = [[] for _ in range(nb_indicators)]
        per_indicator = values_by_station[station]
        for i in range(nb_indicators):
            value = row[6 + i]
            if value is not None:
                per_indicator[i].append(float(value))

    deciles = {}
    for station, per_indicator in values_by_station.items():
        if not all(per_indicator):
            # At least one indicator has no value for this station.
            continue
        station_deciles = []
        for i in range(nb_indicators):
            values = sorted(per_indicator[i])
            last = len(values) - 1
            # Multiply before dividing so the 11 indices spread over the whole
            # sorted list: d = 0 gives index 0 and d = 10 gives the last index.
            station_deciles.append({indicators_list[i]: [values[last * d // 10] for d in range(11)]})
        deciles[station] = station_deciles

    return deciles
def applyKmeans(deciles, nb_indicators, indicators_list, startPeriod, endPeriod, nb_clusters=4):
    """Cluster the stations with k-means, using their deciles as features.

    Args:
        deciles: Dict mapping each station to a list holding one single-key
            dict per indicator (indicator name -> list of decile values).
        nb_indicators: Number of indicators to read for each station.
        indicators_list: Indicator names, positionally matching the per-station
            lists in ``deciles``.
        startPeriod: Period start, only used in the message printed when there
            are too few stations.
        endPeriod: Period end, only used in the message printed when there are
            too few stations.
        nb_clusters: Number of clusters to build (defaults to 4).

    Returns:
        A dict mapping each station to its cluster label, or None when there
        are fewer stations than clusters.
    """
    stations_name = list(deciles)

    if len(stations_name) < nb_clusters:
        print(f"Le nombre de villes ayant des données est trop inférieur ({len(stations_name)}) pour appliquer les kmeans pour la période du {startPeriod} au {endPeriod}")
        return None

    # One feature row per station: the deciles of all indicators, concatenated
    # in indicator order.
    table = []
    for station in stations_name:
        features = []
        for i in range(nb_indicators):
            features += deciles[station][i][indicators_list[i]]
        table.append(features)

    model = KMeans(n_clusters=nb_clusters, max_iter=100).fit(table)

    # labels_ is in the same order as the rows of `table`, i.e. as stations_name.
    return dict(zip(stations_name, model.labels_))

def kmeans(startPeriod, endPeriod, indicators_list):
    """Cluster the stations over a period and save the result as an HTML map.

    The period is validated, the numeric indicators are kept, the matching
    rows are loaded, per-station deciles are computed and clustered, and one
    marker per station (coloured by cluster) is written to a folium map saved
    as ``"<startPeriod> to <endPeriod>.html"``. Progress and error messages
    are printed; nothing is returned.

    Args:
        startPeriod: First day of the period, formatted ``YYYY-MM-DD``.
        endPeriod: Last day of the period, formatted ``YYYY-MM-DD``.
        indicators_list: Names of the requested indicators; only those listed
            in ``l.numeric_columns`` are used.

    Raises:
        ValueError: If ``startPeriod`` or ``endPeriod`` is not a valid
            ``YYYY-MM-DD`` date.
    """
    startDate = datetime.strptime(startPeriod, "%Y-%m-%d")
    endDate = datetime.strptime(endPeriod, "%Y-%m-%d")

    firstDate = datetime.strptime(l.FIRST_DAY, "%Y-%m-%d")
    lastDate = datetime.strptime(l.LAST_DAY, "%Y-%m-%d")

    if not (firstDate <= startDate <= lastDate and firstDate <= endDate <= lastDate):
        print(f"Les dates doivent être comprises entre {l.FIRST_DAY} et {l.LAST_DAY}")
        return

    if not h.verifyYearValidity(int(startPeriod[0:4]), int(endPeriod[0:4])):
        return

    # Keep only the numeric indicators, in the order they were requested.
    indicators_list_numeric = [ind for ind in indicators_list if ind in l.numeric_columns]
    nb_indicators = len(indicators_list_numeric)
    if nb_indicators == 0:
        # Without any column the SELECT list would end with a dangling comma.
        print("Aucun indicateur numérique parmi les indicateurs demandés")
        return
    indicators = ",".join(indicators_list_numeric)

    table = getDatasForPeriod(startPeriod, endPeriod, indicators)

    # Coordinates of each station (row index 3 is the station, 4 and 5 its
    # latitude and longitude); the last row seen for a station wins.
    coord = {row[3]: (row[4], row[5]) for row in table}

    # Get the map with all deciles for all stations and indicators
    table_deciles = getDecileForAllStations(startPeriod, endPeriod, table, nb_indicators, indicators_list_numeric)

    station_with_center = applyKmeans(table_deciles, nb_indicators, indicators_list_numeric, startPeriod, endPeriod)
    if station_with_center is None:
        print("Aucune clusterisation déterminée")
        return

    file_name = f"{startPeriod} to {endPeriod}.html"
    # Create map
    m = folium.Map(location=[64.2815, 27.6753])
    # Add one marker per station, coloured by its cluster label
    for key, value in station_with_center.items():
        lat, lon = coord[key]
        folium.Marker([lat, lon], popup=f"<b>{key}</b>", icon=folium.Icon(color=colours[value])).add_to(m)
    # Save map
    m.save(file_name)
    print(f"La carte a été enregistrée à {file_name}")

if __name__ == '__main__':
    # `session` must stay a module-level name: getDatasForPeriod reads it as a
    # global.
    cluster = Cluster()
    try:
        session = cluster.connect()
        session.set_keyspace("bazinsim_roisinos_metar")

        print()
        # Alternative example call:
        # kmeans("2001-01-01", "2010-12-31", ["tmpf", "skyc1"])
        kmeans("2001-01-01", "2010-12-31", ["tmpf", "dwpf", "skyc1"])
        print()
    finally:
        # Release the driver's connections even if the run fails.
        cluster.shutdown()