# question3_compare_month.py
# Committed by: Tianyang
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from database_pre3 import connection
import matplotlib.pyplot as plt
import re
import folium
import random
import datetime


# Names of the indicators a user can select. checkNBvariable() returns the
# position of a name in this list, and that position is later used to index
# into a database row (row[targetNB]).
# NOTE(review): the order presumably mirrors the column order of
# caitiany.database_kmeans -- confirm before reordering anything here.
table_variable = [
    # positions 0-4
    'date', 'lon', 'lat', 'station', 'time',
    # positions 5-9
    'alti', 'drct', 'dwpf', 'feel', 'gust',
    # positions 10-12
    'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',
    # positions 13-15
    'metar', 'mslp', 'p01i',
    # positions 16-18
    'peak_wind_drct', 'peak_wind_gust', 'peak_wind_time',
    # positions 19-20
    'relh', 'sknt',
    # positions 21-24
    'skyc1', 'skyc2', 'skyc3', 'skyc4',
    # positions 25-28
    'skyl1', 'skyl2', 'skyl3', 'skyl4',
    # positions 29-31
    'tmpf', 'vsby', 'wxcodes',
]




def add (x,y):
    """Return ``x + y`` (binary helper used as a reducer)."""
    combined = x + y
    return combined


def abs_diff(x,y):
    """Return the sum of ``|x[i] - y[i]|`` over every index of ``x``.

    ``y`` is indexed with the positions of ``x``: extra trailing items of
    ``y`` are ignored, and a shorter ``y`` raises ``IndexError``.
    Returns 0 when ``x`` is empty.
    """
    distance = 0
    for position in range(len(x)):
        distance += abs(x[position] - y[position])
    return distance

def diff(x,y):
    """Return ``x - y``."""
    difference = x - y
    return difference



#Reduce step of the mean computation
#input: two [count, running total] pairs -> their element-wise sum
def reduceFonction (x,y):
    """Merge two two-element accumulators by adding them position by position.

    Only indices 0 and 1 of each argument are read; the result is a new
    two-element list ``[x[0] + y[0], x[1] + y[1]]``.
    """
    return [add(x[slot], y[slot]) for slot in (0, 1)]

#input: one value -> [count, value] with the count set to 1
def mapFonction1 (x):
    """Wrap a single observation as an accumulator ``[1, x]``."""
    count = 1
    return [count, x]

#input: [count, total] -> total divided by count
def mapFonction2 (x):
    """Return ``x[1] / x[0]``; raises ``ZeroDivisionError`` when ``x[0]`` is 0."""
    total = x[1]
    count = x[0]
    return total / count


# Pulls the month (and requires an "HH:MM" part) out of the text form of
# row[0]. Compiled once here instead of once per row.
_DATE_PARSER = re.compile(r"(\d+)-(?P<month>\d+)-(\d+) (?P<time>\d+:\d+)")


#Map reduce to calculate the mean of each month for each station
def mapReduce_kmeans(targetNB,start,end):
    """Build, for every station, the list of its per-month means of one column.

    Rows are read through the module-level ``session`` object. Each row
    contributes ``row[targetNB]`` to the group identified by
    ``(row[1], row[2], row[3], month)``, where ``month`` is parsed from
    ``str(row[0])``. Rows whose target value is ``None`` are skipped.
    NOTE(review): row[1..3] presumably are lon, lat, station (the order of
    ``table_variable``) -- confirm against the table schema.

    Args:
        targetNB: index of the column to average inside each row.
        start: lower bound for ``date``, interpolated into the query.
        end: upper bound for ``date``, interpolated into the query.

    Returns:
        dict mapping ``(row[1], row[2], row[3])`` to the list of monthly
        means ordered by ascending month number. Stations that have fewer
        months than the best-covered station are dropped so that every
        remaining list has the same length.

    Raises:
        ValueError: if ``str(row[0])`` does not look like
            ``<digits>-<digits>-<digits> <digits>:<digits>``.
    """
    # NOTE(review): the statement is assembled with string interpolation, so
    # `start`/`end` end up verbatim in the query text. If `session` supports
    # bound parameters, prefer them over this form.
    query = "select * from caitiany.database_kmeans where date >= '%s' and date <= '%s' ALLOW FILTERING"%(start,end)

    # (row[1], row[2], row[3], month) -> [count, running total]
    results = dict()
    for row in session.execute(query):
        data_target = row[targetNB]
        if data_target is None:
            continue
        match_month = _DATE_PARSER.match(str(row[0]))
        if match_month is None:
            raise ValueError("Unexpected date format: %r" % (row[0],))
        month = match_month.group("month")
        data_espace = (row[1],row[2],row[3],month)
        mapresult = mapFonction1(data_target)
        if data_espace in results:
            results[data_espace] = reduceFonction(mapresult,results[data_espace])
        else:
            results[data_espace] = mapresult

    # Gather the monthly means of each station, remembering the month number
    # so the vectors can be put in a deterministic order afterwards.
    per_station = dict()
    for each, accumulator in results.items():
        per_station.setdefault(each[0:3], []).append(
            (int(each[3]), mapFonction2(accumulator)))

    # Order every station's values by month: rows arrive in no guaranteed
    # order, and the clustering compares the lists index by index.
    newResult = dict()
    for station, monthly in per_station.items():
        monthly.sort(key=lambda pair: pair[0])
        newResult[station] = [mean for _, mean in monthly]

    if not newResult:
        return newResult

    # A station with fewer months than the maximum has missing data: drop it.
    month_count = max(len(values) for values in newResult.values())
    return {station: values
            for station, values in newResult.items()
            if len(values) == month_count}

#Calculate how much the cluster sizes changed between two sets of centres
def cluster_nb_diff(centre_new,centre):
    """Return the mean absolute difference of the three cluster counts.

    Both arguments are indexed with 0, 1 and 2, and element ``[i][0]`` of
    each is read as the size of cluster ``i``.
    """
    change = 0
    for cluster_id in (0, 1, 2):
        new_count = centre_new[cluster_id][0]
        old_count = centre[cluster_id][0]
        change += abs(new_count - old_count)
    return change / 3



#input: one value -> list holding that same value four times
def map1_kmeans(x):
    """Return ``[x, x, x, x]``; all four entries reference the same object."""
    return [x] * 4


#input: [v, v, v, v] and [c1, c2, c3, ...] -> [dist(v, c1), dist(v, c2), dist(v, c3), v]
def reduceKmeans (x,y):
    """Compute the distance from a point to each of the three centres.

    For ``i`` in 0..2 the distance is ``abs_diff(x[i], y[i])``; the fourth
    entry of the result is ``x[3]`` passed through unchanged.
    """
    distances = [abs_diff(x[slot], y[slot]) for slot in range(3)]
    return distances + [x[3]]


#input: [d1, d2, d3, value] -> [index of the smallest d, that smallest d, value]
def map2_kmeans(x):
    """Pick the nearest of three distances.

    Only ``x[0]``, ``x[1]`` and ``x[2]`` are compared. On a tie the lowest
    index wins. The search starts from the bound 10000000000000, so if no
    distance is below it the result is ``[0, 10000000000000, x[3]]``.
    """
    best_index = 0
    best_distance = 10000000000000
    for candidate in (0, 1, 2):
        distance = x[candidate]
        if best_distance > distance:
            best_index, best_distance = candidate, distance
    return [best_index, best_distance, x[3]]



#Update the new centre by averaging
def MapnewCentre(x):
    """Turn ``[count, totals]`` into the list of per-component means.

    Each entry of ``x[1]`` is divided by ``x[0]``. When ``x[0]`` equals 0
    the result is a list of zeros of the same length as ``x[1]``.
    """
    count = x[0]
    totals = x[1]
    size = len(totals)
    if count != 0:
        return [totals[position] / count for position in range(size)]
    return [0] * size

#calculate the mean
def caculateMoyen(x):
    """Return the arithmetic mean of the items of ``x``.

    ``x`` is iterated once; an empty ``x`` raises ``ZeroDivisionError``.
    """
    running_total = 0
    seen = 0
    for seen, value in enumerate(x, start=1):
        running_total += value
    return running_total / seen



#calculate the element-wise sum of two lists
def caculateSumofList(x,y):
    """Add two lists position by position.

    When ``x`` equals ``[]`` the object ``y`` itself is returned. Otherwise
    the result has the length of ``x``; extra items of ``y`` are ignored and
    a shorter ``y`` raises ``IndexError``.
    """
    if x == []:
        return y
    return [x[position] + y[position] for position in range(len(x))]


#The main algorithm of Kmeans
def kmeans (targetNB,target,start,end):
    """Cluster the stations into three groups and draw them on a map.

    The per-station vectors come from ``mapReduce_kmeans(targetNB, start,
    end)``. Three distinct stations are drawn at random as initial centres,
    then points are repeatedly assigned to the nearest centre (distance
    computed by ``reduceKmeans``) until the mean change of the three cluster
    sizes between two passes is at most 1. The final clusters are handed to
    ``createMap``.

    Args:
        targetNB: index of the indicator column inside a database row.
        target: name of the indicator. It is not used inside this function;
            ``createMap`` reads the module-level ``target`` for its labels.
        start: lower date bound forwarded to ``mapReduce_kmeans``.
        end: upper date bound forwarded to ``mapReduce_kmeans``.

    Raises:
        Exception: if fewer than three stations were found.
    """
    result = mapReduce_kmeans(targetNB,start,end)

    if len(result) < 3:
        raise Exception ("We\'ve just searched less than 3 station!!")

    # Use three random *distinct* stations as the initial centres. Independent
    # draws could select the same station twice and start with duplicate
    # centres.
    init_point_keys = random.sample(list(result), 3)

    # `cluster` stores the station keys of every point of each cluster.
    cluster = [[],[],[]]
    # Three centres, each stored as [point count, centre vector].
    centre = dict()
    centre_new = dict()
    for key, station in enumerate(init_point_keys):
        centre[key] = [1,result[station]]
        centre_new[key] = [1,result[station]]
        cluster[key].append(station)

    # Assignment of the current pass: station key -> [cluster number,
    # distance to that centre, station vector].
    result_new = dict()
    # Stop once the cluster sizes (almost) stop changing.
    while True:
        # The centres do not change during one assignment pass.
        centre_values = [centre[0][1],centre[1][1],centre[2][1]]
        # Only the initial centres are pre-assigned (first pass); later
        # passes start from empty clusters.
        already_assigned = set(cluster[0]) | set(cluster[1]) | set(cluster[2])

        for eachkey in result:
            if eachkey in already_assigned:
                continue
            # Distance between this station's vector and every centre,
            # then selection of the nearest one.
            expanded = map1_kmeans(result[eachkey])
            distances = reduceKmeans(expanded,centre_values)
            result_new[eachkey] = map2_kmeans(distances)

        # Put every point into its cluster and accumulate the vector sums.
        for eachpoint in result_new:
            clusterNB = result_new[eachpoint][0]
            centre_new[clusterNB][0] += 1
            centre_new[clusterNB][1] = caculateSumofList(centre_new[clusterNB][1],result_new[eachpoint][2])
            cluster[clusterNB].append(eachpoint)

        size_change = cluster_nb_diff(centre_new,centre)
        if not size_change > 1:
            break

        # Not converged yet: compute the new centres.
        print ("Cluster number differ:  ",size_change)
        for eachcluster in centre_new:
            if centre_new[eachcluster][0] == 0:
                # An empty cluster has no mean; keep its previous centre so
                # the next distance computation still has a full vector.
                centre_new[eachcluster][1] = centre[eachcluster][1]
            else:
                centre_new[eachcluster][1] = MapnewCentre(centre_new[eachcluster])
        print("center new: ",centre_new)
        print("center old: ",centre)
        centre = centre_new
        centre_new = {0:[0,[]],1:[0,[]],2:[0,[]]}
        result_new = dict()
        cluster = [[],[],[]]
    createMap(cluster,result)



#Create the map of the clusters
def createMap (cluster,result):
    """Draw the stations of three clusters on a folium map and save it.

    Every entry of ``cluster[0..2]`` is a key of ``result`` whose elements 0,
    1 and 2 are read. folium takes coordinates as ``[latitude, longitude]``;
    this function passes element 1 first and element 0 second, both for the
    map centre (the means over all points) and for each marker. Markers are
    blue, red and green for clusters 0, 1 and 2, and their popup shows
    element 2 plus the rounded mean of ``result[key]``.

    The label also reads the module-level name ``target``, which must be
    defined before this function is called. The map is written to
    "Projet-NF26/map.html".

    Raises:
        ZeroDivisionError: if the three clusters contain no point at all.
    """
    groups = [cluster[0],cluster[1],cluster[2]]

    # One traversal collects both coordinate sums.
    sum_first = 0
    sum_second = 0
    nb_points = 0
    for group in groups:
        for position in group:
            sum_first += position[0]
            sum_second += position[1]
            nb_points += 1
    centre_first = sum_first/nb_points
    centre_second = sum_second/nb_points

    m = folium.Map(location=[centre_second,centre_first],zoom_start=6)

    color = {0:'blue',1:'red',2:'green'}
    for cluster_id, group in enumerate(groups):
        for position in group:
            station_key = (position[0],position[1],position[2])
            label = (str(position[2])+"\n"+target+":"
                     +str(round(caculateMoyen(result[station_key]),2)))
            marker = folium.Marker([position[1],position[0]],
                        popup=label,
                        icon=folium.Icon(color=color[cluster_id]))
            marker.add_to(m)
    m.save("Projet-NF26/map.html")
    print("Generate successfully")


#Find the position of an indicator name
def checkNBvariable (x):
    """Return the index of ``x`` in ``table_variable``.

    When ``x`` is not in the list, a message is printed and the function
    falls through, returning ``None``.
    """
    for position, name in enumerate(table_variable):
        if x == name:
            return position
    print ('Doesn\'t exist!!')


if __name__ == "__main__":
    # `session` and `target` are deliberately module-level names:
    # mapReduce_kmeans reads `session` and createMap reads `target` as globals.
    session = connection()
    start = input("Please enter the start time [form: AAAA-MM-DD (From 2008-12-19 to 2017-12-30)]:  ")
    end = input("Please enter the end time [form: AAAA-MM-DD (From 2008-12-19 to 2017-12-30)]:  ")
    target = input("Which indicator do you want to check [tmpf,dwpf,etc]:  ")
    targetNB = checkNBvariable(target)
    if targetNB is None:
        # checkNBvariable has already reported the unknown name; stop here
        # instead of failing later while indexing database rows with None.
        raise SystemExit(1)
    kmeans(targetNB,target,start,end)