database_pre3.py 4.14 KB
Newer Older
Tianyang's avatar
Tianyang committed
1 2 3 4 5 6 7 8 9 10 11
import cassandra.cluster
import csv
import re


def connection():
    import cassandra.cluster
    cluster = cassandra.cluster.Cluster(['localhost'])
    session = cluster.connect('caitiany')
    return session

Tianyang's avatar
Tianyang committed
12
def databaseCreate_Q3(session):
Tianyang's avatar
Tianyang committed
13
    query = """ 
Tianyang's avatar
Tianyang committed
14 15
	CREATE TABLE database_kmeans ( 
        date timestamp,
Tianyang's avatar
Tianyang committed
16 17
        lon float,
        lat float,
Tianyang's avatar
Tianyang committed
18
        station varchar,
Tianyang's avatar
Tianyang committed
19
        time varchar,
Tianyang's avatar
Tianyang committed
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
		tmpf float,
		dwpf float,
		relh float,
		drct float,
		sknt float,
		p01i float,
		alti float,
		mslp float,
		vsby float,
		gust float,
		skyc1 varchar,
		skyc2 varchar,
		skyc3 varchar,
		skyc4 varchar,
		skyl1 float,
		skyl2 float,
		skyl3 float,
		skyl4 float,
		wxcodes varchar,
		ice_accretion_1hr float,
		ice_accretion_3hr float,
		ice_accretion_6hr float,
		peak_wind_gust float,
		peak_wind_drct float,
		peak_wind_time varchar,
		feel float,
		metar varchar,
Tianyang's avatar
Tianyang committed
47
		PRIMARY KEY ((date),lon,lat,station,time)
Tianyang's avatar
Tianyang committed
48 49
	)"""
    session.execute(query)
Tianyang's avatar
Tianyang committed
50
    print("DATA BASE database_kmeans created!")
Tianyang's avatar
Tianyang committed
51 52 53


def load_data(filename):
Tianyang's avatar
Tianyang committed
54
    dateparser = re.compile("(?P<date>\d+-\d+-\d+) (?P<time>\d+:\d+)")
Tianyang's avatar
Tianyang committed
55 56
    with open(filename) as f:
        for r in csv.DictReader(f):
Tianyang's avatar
Tianyang committed
57 58
            match_time = dateparser.match(r["valid"])
            if not match_time:
Tianyang's avatar
Tianyang committed
59
                continue
Tianyang's avatar
Tianyang committed
60
            time = match_time.groupdict()
Tianyang's avatar
Tianyang committed
61 62 63 64 65
            for collonne in r:
                if r[collonne] == "M":
                    r[collonne]= "nan"
            
            data = {}
Tianyang's avatar
Tianyang committed
66
            data["date"] = time["date"]
Tianyang's avatar
Tianyang committed
67 68 69
            data["station"] = r["station"]
            data["lon"] = float(r["lon"])
            data["lat"] = float(r["lat"])
Tianyang's avatar
Tianyang committed
70
            data["time"] = time["time"]
Tianyang's avatar
Tianyang committed
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
            data["tmpf"] = float(r["tmpf"])
            data["dwpf"] = float(r["dwpf"])
            data["relh"] = float(r["relh"])
            data["drct"] = float(r["drct"])
            data["sknt"] = float(r["sknt"])
            data["p01i"] = float(r["p01i"])
            data["alti"] = float(r["alti"])
            data["mslp"] = float(r["mslp"])
            data["vsby"] = float(r["vsby"])
            data["gust"] = float(r["gust"])

            data["skyc1"] = r["skyc1"]
            data["skyc2"] = r["skyc2"]
            data["skyc3"] = r["skyc3"]
            data["skyc4"] = r["skyc4"]

            data["skyl1"] = float(r["skyl1"])
            data["skyl2"] = float(r["skyl2"])
            data["skyl3"] = float(r["skyl3"])
            data["skyl4"] = float(r["skyl4"])
            data["wxcodes"] = r["wxcodes"]

            data["ice_accretion_1hr"] = float(r["ice_accretion_1hr"])
            data["ice_accretion_3hr"] = float(r["ice_accretion_3hr"])
            data["ice_accretion_6hr"] = float(r["ice_accretion_6hr"])
            data["peak_wind_gust"] = float(r["peak_wind_gust"])
            data["peak_wind_drct"] = float(r["peak_wind_drct"])
            data["peak_wind_time"] = r["peak_wind_time"]

            data["feel"] = float(r["feel"])
            data["metar"] = r["metar"]

            yield data



Tianyang's avatar
Tianyang committed
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141


#Create the query according to if each collonne's value is null or not
def createQuery(data):
    result = dict()
    for each in data:
        if data[each] != "nan" and  str(data[each]) != 'nan':
            result[each] = data[each]

    ligne_value = []
    for each in result:
        ligne_value.append(result[each])
    ligne_value = tuple(ligne_value)

    ligne = []
    for each in result:
        ligne.append(each)
    ligne = tuple(ligne)

    #connect the query together
    query = "INSERT INTO database_kmeans("
    for eachc in ligne:
        query += str(eachc)+","
    query = "".join(list(query)[:-1]) + ") VALUES("
    longth = len(ligne)
    for _ in range(longth):
        query += "%s,"
    query = "".join(list(query)[:-1]) + ");"

    return query, ligne_value





Tianyang's avatar
Tianyang committed
142
def insection_sql_Q3(filename,session):
Tianyang's avatar
Tianyang committed
143 144 145 146
    target = load_data(filename)
    i = 1
    for data in target:
        i += 1
Tianyang's avatar
Tianyang committed
147
        k = 0
Tianyang's avatar
Tianyang committed
148
        if (i % 500 == 0):
Tianyang's avatar
Tianyang committed
149 150
            k += 1
            print(k,". 500 finished.....")
Tianyang's avatar
Tianyang committed
151
        query, ligne = createQuery(data)
Tianyang's avatar
Tianyang committed
152 153 154 155 156 157 158 159
        session.execute(query, ligne)





if __name__ == "__main__":
    session = connection()
Tianyang's avatar
Tianyang committed
160 161
    databaseCreate_Q3(session)
    insection_sql_Q3("Projet-NF26/data.csv",session)