Commit 2c17dd6c authored by Romain Creuzenet

cluster all attributs

parent b1dc8612
......@@ -2,8 +2,8 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="dec891dc-2fad-4291-af33-64d4fd64029d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/parameters.py" beforeDir="false" afterPath="$PROJECT_DIR$/parameters.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
......@@ -16,7 +16,7 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/create_table.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="912">
<state>
<caret line="51" column="17" selection-start-line="51" selection-start-column="17" selection-end-line="51" selection-end-column="17" />
</state>
</provider>
......@@ -25,10 +25,12 @@
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="273">
<caret line="328" column="34" selection-start-line="328" selection-start-column="34" selection-end-line="328" selection-end-column="34" />
<state relative-caret-position="659">
<caret line="297" column="51" selection-start-line="297" selection-start-column="51" selection-end-line="297" selection-end-column="51" />
<folding>
<element signature="e#45#96#0" expanded="true" />
<element signature="e#3287#4955#0" />
<element signature="e#4984#7500#0" />
</folding>
</state>
</provider>
......@@ -46,8 +48,8 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/parameters.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1216">
<caret line="67" column="11" selection-start-line="67" selection-start-column="5" selection-end-line="67" selection-end-column="16" />
<state relative-caret-position="228">
<caret line="15" column="1" lean-forward="true" selection-start-line="15" selection-start-column="1" selection-end-line="15" selection-end-column="1" />
</state>
</provider>
</entry>
......@@ -213,7 +215,7 @@
</entry>
<entry file="file://$PROJECT_DIR$/create_table.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="912">
<state>
<caret line="51" column="17" selection-start-line="51" selection-start-column="17" selection-end-line="51" selection-end-column="17" />
</state>
</provider>
......@@ -225,19 +227,28 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env_nf26/lib/python3.6/site-packages/cassandra/cluster.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="291">
<caret line="140" column="8" selection-start-line="140" selection-start-column="8" selection-end-line="140" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/parameters.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1216">
<caret line="67" column="11" selection-start-line="67" selection-start-column="5" selection-end-line="67" selection-end-column="16" />
<state relative-caret-position="228">
<caret line="15" column="1" lean-forward="true" selection-start-line="15" selection-start-column="1" selection-end-line="15" selection-end-column="1" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="273">
<caret line="328" column="34" selection-start-line="328" selection-start-column="34" selection-end-line="328" selection-end-column="34" />
<state relative-caret-position="659">
<caret line="297" column="51" selection-start-line="297" selection-start-column="51" selection-end-line="297" selection-end-column="51" />
<folding>
<element signature="e#45#96#0" expanded="true" />
<element signature="e#3287#4955#0" />
<element signature="e#4984#7500#0" />
</folding>
</state>
</provider>
......
"""File to execute to show results"""
# Data
from parameters import SESSION, DIR_OUT, START, END
from parameters import SESSION, DIR_OUT, START, END, ATTRIBUTS
# Basic
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
......@@ -62,14 +62,10 @@ def ask_d(text=">>> "):
def chose_attr():
"""Ask for one attribute of the table.

Prints every available attribute label with its code, then returns the
result of ``ask_q`` called with the attribute codes (presumably the code
typed by the user -- ``ask_q`` is not visible here, confirm).
"""
# NOTE(review): this span is a rendered diff -- the lines using ``decision``
# are the pre-commit version, the lines using ``ATTRIBUTS`` the post-commit one.
# List the selectable attributes
decision = {
"tmpf": "La témparature (en Fahrenheit)",
"relh": "L'humidité ( en %)"
}
print("Choisissez un élément parmis les suivant :")
for code, text in decision.items():
for code, text in ATTRIBUTS.items():
print("\t-", text, ":", code)
return ask_q(decision.keys())
return ask_q(ATTRIBUTS.keys())
def ask_int(text=">>> "):
......@@ -88,6 +84,21 @@ def generate_color(i):
return "#{:06x}".format(random.randint(0, 0xFFFFFF))
def initialisation_centroid(data):
    """
    Draw one random centroid inside the range covered by the stations.

    For every key of ATTRIBUTS, the value is a random integer taken between
    the truncated minimum and the truncated maximum of that attribute over
    all stations.

    :param data: station : {attr1: 1, attr2: 2, ...}
    :return: {attr1: 1, attr2: 2, ...}
    """
    centroid = {}
    for name in ATTRIBUTS:
        observed = [station[name] for station in data.values()]
        # bounds are truncated to int because random.randint only accepts integers
        low, high = int(min(observed)), int(max(observed))
        centroid[name] = random.randint(low, high)
    return centroid
class Manager:
table = None # table name use by the function
......@@ -236,8 +247,7 @@ class Manager:
# Ask information from user
print("=== Choix 3 : CLUSTER ===")
attr = chose_attr()
print("Vous allez devoir choisir une période de temps. On considéra la moyenne de l'attribut sur cette "
print("Vous allez devoir choisir une période de temps. On considéra la moyenne des attributs sur cette "
"période de temps")
date_begin = date_end = None
while date_begin is None or date_begin >= date_end:
......@@ -249,60 +259,61 @@ class Manager:
nb_cluster = ask_int()
# Calc of mean
query = "SELECT station, lon, lat, {attr} FROM {table} WHERE time >= {begin} AND time <= {end} ALLOW FILTERING" \
"".format(
attr=attr,
query = "SELECT station, lon, lat, {attr} FROM {table} WHERE time >= {begin} AND time <= {end} " \
"ALLOW FILTERING".format(
attr=", ".join(ATTRIBUTS.keys()),
table=self.table,
begin=date_begin,
end=date_end
)
stations = {} # station: {'nb': 3, 'sum': 5, 'lon': 3.27, 'lat': 12}
stations = {} # station: {'nb': 3, 'attr1': 5, 'attr2': 7, ..., 'lon': 3.27, 'lat': 12}
for row in execute_query(query):
if None in (row.station, row.lon, row.lat, getattr(row, attr)):
if None in [row.station, row.lon, row.lat] + [getattr(row, attr) for attr in ATTRIBUTS.keys()]:
continue
if row.station in stations:
stations[row.station]['sum'] += getattr(row, attr)
for attr in ATTRIBUTS.keys():
stations[row.station][attr] += getattr(row, attr)
stations[row.station]['nb'] += 1
else:
stations[row.station] = {'nb': 1, 'sum': getattr(row, attr), 'lon': row.lon, 'lat': row.lat}
stations[row.station] = {'nb': 1, 'lon': row.lon, 'lat': row.lat,
**{key: 0 for key in ATTRIBUTS.keys()}}
for value in stations.values():
value['mean'] = value['sum'] / value['nb']
means = [elt['mean'] for elt in stations.values()]
for attr in ATTRIBUTS.keys():
value[attr] = value[attr] / value['nb']
# Initialisation mean
mini = int(min(means))
maxi = int(max(means))
old_centroids = None
new_centroids = [
random.randint(mini, maxi)
initialisation_centroid(stations)
for _ in range(nb_cluster)
]
while old_centroids != new_centroids:
old_centroids = new_centroids
data = [
{'sum': 0, 'nb': 0}
{**{attr: 0 for attr in ATTRIBUTS.keys()}, 'nb': 0}
for _ in range(nb_cluster)
]
# could be parallelize
for m in means:
for value_station in stations.values():
distances = [
(m - centroid) ** 2
sum([(centroid[attr] - value_station[attr]) ** 2 for attr in ATTRIBUTS.keys()])
for centroid in old_centroids
]
i = distances.index(min(distances))
data[i]['sum'] += m
for attr in ATTRIBUTS.keys():
data[i][attr] += value_station[attr]
data[i]['nb'] += 1
# end calc parallelize
if 0 in [value['nb'] for value in data]:
# cluster empty do it again
new_centroids = [
random.randint(int(min(means), int(max(means))))
initialisation_centroid(stations)
for _ in range(nb_cluster)
]
else:
new_centroids = [
float("{0:.2f}".format(elt['sum'] / elt['nb']))
{attr: float("{0:.2f}".format(elt[attr] / elt['nb'])) for attr in ATTRIBUTS.keys()}
for elt in data
]
......@@ -332,7 +343,7 @@ class Manager:
for station, value in stations.items():
# Analyse the point
distances = [
(value['mean'] - centroid) ** 2
sum([(centroid[attr] - value[attr]) ** 2 for attr in ATTRIBUTS.keys()])
for centroid in old_centroids
]
i = distances.index(min(distances))
......@@ -340,11 +351,10 @@ class Manager:
# Add the point
x, y = the_map(value['lon'], value['lat'])
the_map.plot(x, y, marker=".", color=colors[i])
plt.annotate("{} : {:.2f}".format(station, value['mean']), (x, y), color=colors[i])
plt.annotate("{}".format(station), (x, y), color=colors[i])
title = "{nb_cluster} clusters de {attr} du {begin} au {end}".format(
title = "{nb_cluster} clusters du {begin} au {end}".format(
nb_cluster=nb_cluster,
attr=attr,
begin=datetime(*list(date_begin)).strftime('%Y-%m-%d %H:%M'),
end=datetime(*list(date_end)).strftime('%Y-%m-%d %H:%M')
)
......
......@@ -9,6 +9,11 @@ import datetime
# Can be specified
KEY_SPACE = ("lhamadac_projet", "nf26") # For many computers
TABLE = "Spain"
# Attributes available for the analyses.
# key: code of the attribute (used as column name in the queries of main.py)
# value: label printed to the user when an attribute has to be chosen
ATTRIBUTS = {
"tmpf": "La témparature (en Fahrenheit)",
"relh": "L'humidité ( en %)",
"alti": "Altimètre de pression en pouces"
}
# Don't change
SESSION = None
......@@ -20,7 +25,7 @@ for key in KEY_SPACE:
else:
break
if SESSION is None:
raise NoHostAvailable("Erreur de connection à cassandra")
raise NoHostAvailable("Erreur de connection à cassandra", None)
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) # path of project
DIR_DATA = os.path.join(BASE_DIR, "data") # folder with all data station
......@@ -68,3 +73,5 @@ TABLES = {
'TABLE_SPACE': "station, time",
'TABLE_TIME': "time, lon, lat",
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment