Commit af286816 authored by Lorys Hamadache

Near-final version

parent 7911df21
# Projet NF26 - P19 - UTC
Authors: Lorys Hamadache and Romain Creuzenet
### ToDos
1. Check the spelling
2. Make the project public
3. Confirm the new versions of the report and the code
4. Asos file vs. CSVs?
5. Send to the professor
## How to use the project
• To install the project:
...
@@ -8,6 +8,8 @@ import warnings
import re
import os
import random
# Parallelize with Spark
from pyspark import SparkContext
# Stats
import statsmodels.graphics as stm_graphs
import pandas as pd
@@ -16,7 +18,7 @@ import numpy as np
# Graph map
from mpl_toolkits.basemap import Basemap
from pandas.plotting import register_matplotlib_converters
from datetime import datetime
register_matplotlib_converters()
warnings.filterwarnings("ignore")
@@ -27,7 +29,7 @@ def execute_query(query):
        yield row
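Only the tail of execute_query survives the hunk boundary above. For context, a minimal sketch of what such a generator typically looks like with the cassandra-driver package; the Cluster/connect setup here is an assumption for illustration, not code from this commit:

# Hypothetical sketch, not from this commit: run a CQL query and yield its rows.
from cassandra.cluster import Cluster

session = Cluster().connect()  # assumption: contact points and keyspace are configured elsewhere

def execute_query(query):
    """Yield the rows of a CQL query one by one."""
    for row in session.execute(query):
        yield row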
def ask_q(possibilities, text=">>> "):
    """Ask a question until the answer is one of the given possibilities"""
    answer = None
    while answer not in possibilities:
@@ -35,7 +37,7 @@ def ask_q(possibilities, text="Réponse : "):
    return answer
def ask_d(text=">>> "):
    """Ask for a date"""
    print("Enter a date in the form YYYY-MM-DD HH:mm")
    print("Between {} and {}".format(START.strftime('%Y-%m-%d'), END.strftime('%Y-%m-%d')))
@@ -112,8 +114,7 @@ class Manager:
    def run(self):
        """Choose an objective"""
        # Initialization
        os.makedirs(DIR_OUT, exist_ok=True)
        # Choose objective
        print("Choose what you want to do")
@@ -146,6 +147,8 @@ class Manager:
        station = ask_q(stations)
        attr = chose_attr()
        # Base series indexed by time
        ts = pd.Series(dtype=float)
        query = "SELECT time, {} FROM {} WHERE station={}".format(attr, self.table, repr(station))
        for row in execute_query(query):
@@ -162,20 +165,79 @@
        plt.plot(ts, label=attr)
        plt.title("Data of {} for station: {}".format(attr, station))
        plt.legend()
        path = os.path.join(DIR_OUT, 'graph_{}_{}.png'.format(station, attr))
        plt.savefig(path)
        plt.show()
        # Initialise Spark
        sc = SparkContext()

        # Daily series: one figure holding the max / min / mean curves
        plt.figure(figsize=(25, 16))
        axes = plt.subplot()
        axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.xticks(rotation=90)
        date_rng = pd.date_range(start='2011-01-01', end='2013-12-31', freq='D')

        # Maximum of the day
        ts_max = pd.Series(index=date_rng)
        query = "SELECT time, {} FROM {} WHERE station={}".format(attr, self.table, repr(station))
        gen_max = sc.parallelize(execute_query(query)) \
            .map(lambda line: ((line.time[0], line.time[1], line.time[2]), getattr(line, attr))) \
            .reduceByKey(lambda x, y: max(x, y)) \
            .collect()
        for day, value in gen_max:
            ts_max.loc[datetime(*day)] = value
        ts_max = ts_max.sort_index()
        ts_max = ts_max.interpolate()
        plt.plot(ts_max, label="Maximum")
        # Minimum of the day
        ts_min = pd.Series(index=date_rng)
        gen_min = sc.parallelize(execute_query(query)) \
            .map(lambda line: ((line.time[0], line.time[1], line.time[2]), getattr(line, attr))) \
            .reduceByKey(lambda x, y: min(x, y)) \
            .collect()
        for day, value in gen_min:
            ts_min.loc[datetime(*day)] = value
        ts_min = ts_min.sort_index()
        ts_min = ts_min.interpolate()
        plt.plot(ts_min, label="Minimum")
        # Mean of the day
        ts_avg = pd.Series(index=date_rng)
        gen_avg = sc.parallelize(execute_query(query)) \
            .map(lambda line: ((line.time[0], line.time[1], line.time[2]), (getattr(line, attr), 1))) \
            .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
            .map(lambda x: (x[0], x[1][0] / x[1][1])) \
            .collect()
        for day, value in gen_avg:
            ts_avg.loc[datetime(*day)] = value
        ts_avg = ts_avg.sort_index()
        ts_avg = ts_avg.interpolate()
        plt.plot(ts_avg, label="Mean")
        # Global plotting
        plt.title("Data of {} for station: {}".format(attr, station))
        plt.legend()
        path = os.path.join(DIR_OUT, 'graph_{}_{}_byday.png'.format(station, attr))
        plt.savefig(path)
        plt.show()
        res = stm.tsa.seasonal_decompose(ts_avg.dropna(), freq=365, extrapolate_trend='freq')
        res.plot()
        path = os.path.join(DIR_OUT, 'decompose_{}_{}.png'.format(station, attr))
        plt.savefig(path)
        plt.show()
        stm_graphs.tsaplots.plot_acf(ts_avg.dropna(), lags=365)
        path = os.path.join(DIR_OUT, 'acf_{}_{}.png'.format(station, attr))
        plt.savefig(path)
        plt.show()
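The three aggregations above rerun the same Cassandra query and make one Spark pass per statistic. A possible simplification, sketched here under the same assumptions (sc, execute_query, query, attr and the three series as defined above; the tuple layout is mine, not part of this commit), computes min, max and mean in a single reduceByKey:

        # Sketch, not project code: daily min / max / mean in one Spark pass.
        daily = sc.parallelize(execute_query(query)) \
            .map(lambda line: ((line.time[0], line.time[1], line.time[2]),
                               (getattr(line, attr), getattr(line, attr), getattr(line, attr), 1))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]), x[2] + y[2], x[3] + y[3])) \
            .mapValues(lambda v: (v[0], v[1], v[2] / v[3])) \
            .collect()
        for day, (vmin, vmax, vavg) in daily:
            ts_min.loc[datetime(*day)] = vmin
            ts_max.loc[datetime(*day)] = vmax
            ts_avg.loc[datetime(*day)] = vavg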
    def map(self):
        self.table = "TABLE_TIME"
@@ -218,7 +280,7 @@ class Manager:
        plt.title(title)
        for elt in ' :-':
            title = title.replace(elt, '_')
        path = os.path.join(DIR_OUT, title.lower() + '.png')
        plt.savefig(path)
        plt.show()
@@ -259,31 +321,25 @@ class Manager:
        print("Enter the desired number of clusters")
        nb_cluster = ask_int()
        # Compute the mean of each attribute per station over the chosen period
        query = "SELECT station, lon, lat, {attr} FROM {table} WHERE time >= {begin} AND time <= {end} " \
                "ALLOW FILTERING".format(
                    attr=", ".join(ATTRIBUTS.keys()),
                    table=self.table,
                    begin=date_begin,
                    end=date_end
                )
        stations = {}  # station: {'nb': 3, 'attr1': 5, 'attr2': 7, ..., 'lon': 3.27, 'lat': 12}
        for row in execute_query(query):
            if None in [row.station, row.lon, row.lat] + [getattr(row, attr) for attr in ATTRIBUTS.keys()]:
                continue
            if row.station in stations:
                for attr in ATTRIBUTS.keys():
                    stations[row.station][attr] += getattr(row, attr)
                stations[row.station]['nb'] += 1
            else:
                # First reading for this station: start from the row's own values,
                # otherwise the first measurement would be lost in the mean
                stations[row.station] = {'nb': 1, 'lon': row.lon, 'lat': row.lat,
                                         **{key: getattr(row, key) for key in ATTRIBUTS.keys()}}
        for value in stations.values():
            for attr in ATTRIBUTS.keys():
                value[attr] = value[attr] / value['nb']
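The loop above uses an accumulate-then-divide pattern: running sums and a count per station, divided once at the end. A toy check of the same pattern; the station IDs and the tmpf field are made-up examples, not values from this dataset:

# Toy check, not project code: per-key mean in one pass.
rows = [("LFPG", 10.0), ("LFPG", 14.0), ("LFPO", 8.0)]
means = {}
for station, value in rows:
    if station in means:
        means[station] = (means[station][0] + value, means[station][1] + 1)
    else:
        means[station] = (value, 1)
means = {k: s / n for k, (s, n) in means.items()}
assert means == {"LFPG": 12.0, "LFPO": 8.0}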
@@ -295,8 +351,6 @@ class Manager:
            for _ in range(nb_cluster)
        ]
        while old_centroids != new_centroids:
            old_centroids = new_centroids
            data = [
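The hunk cuts off inside the data list comprehension, but the while header above shows the stopping rule: iterate until the centroids no longer move. For orientation, a self-contained sketch of that convergence scheme on plain coordinate tuples; every name and helper here is mine, not this project's:

# Standalone sketch of a k-means loop driven by old_centroids != new_centroids.
import random

def kmeans(points, nb_cluster):
    """Cluster coordinate tuples until the centroids stop changing."""
    new_centroids = random.sample(points, nb_cluster)
    old_centroids = None
    while old_centroids != new_centroids:
        old_centroids = new_centroids
        # Assignment step: attach each point to its nearest centroid
        clusters = [[] for _ in range(nb_cluster)]
        for p in points:
            nearest = min(range(nb_cluster),
                          key=lambda i: sum((a - b) ** 2 for a, b in zip(p, old_centroids[i])))
            clusters[nearest].append(p)
        # Update step: move each centroid to the mean of its cluster
        new_centroids = [tuple(sum(c) / len(c) for c in zip(*cl)) if cl else old_centroids[i]
                         for i, cl in enumerate(clusters)]
    return new_centroids

For example, kmeans([(0, 0), (0, 1), (10, 10), (10, 11)], 2) converges to centroids near (0, 0.5) and (10, 10.5).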
@@ -370,7 +424,7 @@ class Manager:
        plt.title(title)
        for elt in ' :-':
            title = title.replace(elt, '_')
        path = os.path.join(DIR_OUT, title.lower() + '.png')
        plt.savefig(path)
        plt.show()