Commit e4ffbda5 authored by Rémy Huet's avatar Rémy Huet 💻
Browse files

TP7

parent 5d87da76
......@@ -13,6 +13,7 @@ pandas = "*"
seaborn = "*"
scikit-learn = "*"
nb-clean = "*"
statsmodels = "*"
[dev-packages]
......
This diff is collapsed.
,V1
1970-01-01,0.61227692
1970-04-01,0.454929794
1970-07-01,0.874673021
1970-10-01,-0.272514385
1971-01-01,1.892186993
1971-04-01,0.913378185
1971-07-01,0.792857898
1971-10-01,1.649995662
1972-01-01,1.32724825
1972-04-01,1.889905062
1972-07-01,1.532724163
1972-10-01,2.317057775
1973-01-01,1.813855694
1973-04-01,-0.050557716
1973-07-01,0.359667223
1973-10-01,-0.293315461
1974-01-01,-0.878770942
1974-04-01,0.346720029
1974-07-01,0.411953557
1974-10-01,-1.47820468
1975-01-01,0.837359874
1975-04-01,1.653973693
1975-07-01,1.414318836
1975-10-01,1.053109931
1976-01-01,1.97774749
1976-04-01,0.915072184
1976-07-01,1.050746068
1976-10-01,1.295196193
1977-01-01,1.135458885
1977-04-01,0.551532397
1977-07-01,0.950159598
1977-10-01,1.496161496
1978-01-01,0.582299782
1978-04-01,2.11467168
1978-07-01,0.418698865
1978-10-01,0.802764302
1979-01-01,0.50412878
1979-04-01,-0.058551127
1979-07-01,0.977555967
1979-10-01,0.26591209
1980-01-01,-0.173684254
1980-04-01,-2.296562996
1980-07-01,1.066919835
1980-10-01,1.32441742
1981-01-01,0.54583283
1981-04-01,0.0
1981-07-01,0.404821844
1981-10-01,-0.758748833
1982-01-01,0.643998142
1982-04-01,0.356859498
1982-07-01,0.76412375
1982-10-01,1.807886611
1983-01-01,0.975937338
1983-04-01,1.96559809
1983-07-01,1.751349705
1983-10-01,1.573740048
1984-01-01,0.853227275
1984-04-01,1.420025736
1984-07-01,0.769502001
1984-10-01,1.307478026
1985-01-01,1.681281553
1985-04-01,0.907910806
1985-07-01,1.880850439
1985-10-01,0.219864034
1986-01-01,0.831533592
1986-04-01,1.059663705
1986-07-01,1.732441717
1986-10-01,0.600062434
1987-01-01,-0.152288
1987-04-01,1.329357293
1987-07-01,1.110416851
1987-10-01,0.240125469
1988-01-01,1.656928516
1988-04-01,0.723060308
1988-07-01,0.786814118
1988-10-01,1.170680141
1989-01-01,0.365226244
1989-04-01,0.446943251
1989-07-01,1.031342865
1989-10-01,0.487945312
1990-01-01,0.787867942
1990-04-01,0.329588878
1990-07-01,0.379094008
1990-10-01,-0.782282375
1991-01-01,-0.283580869
1991-04-01,0.758193783
1991-07-01,0.382567416
1991-10-01,-0.044932041
1992-01-01,1.70442848
1992-04-01,0.591033456
1992-07-01,1.099312177
1992-10-01,1.212615834
1993-01-01,0.405115351
1993-04-01,0.955401523
1993-07-01,1.079080891
1993-10-01,0.886099341
1994-01-01,1.107815847
1994-04-01,0.73801073
1994-07-01,0.79832641
1994-10-01,0.982355812
1995-01-01,0.116703636
1995-04-01,0.816439436
1995-07-01,0.890123271
1995-10-01,0.700256683
1996-01-01,0.909995107
1996-04-01,1.125174154
1996-07-01,0.597491051
1996-10-01,0.811049811
1997-01-01,1.002314791
1997-04-01,0.40370845
1997-07-01,1.685618761
1997-10-01,1.137796247
1998-01-01,0.989350157
1998-04-01,1.706687585
1998-07-01,1.316901051
1998-10-01,1.522383594
1999-01-01,0.981498554
1999-04-01,1.56147049
1999-07-01,1.194790348
1999-10-01,1.400264206
2000-01-01,1.505040637
2000-04-01,0.935882738
2000-07-01,0.974321842
2000-10-01,0.88064976
2001-01-01,0.398685386
2001-04-01,0.376512295
2001-07-01,0.439188594
2001-10-01,1.553691
2002-01-01,0.343826889
2002-04-01,0.506654039
2002-07-01,0.675711941
2002-10-01,0.354724654
2003-01-01,0.503872727
2003-04-01,0.985555729
2003-07-01,1.337666705
2003-10-01,0.542546733
2004-01-01,0.880747948
2004-04-01,0.443979489
2004-07-01,0.870378697
2004-10-01,1.073951523
2005-01-01,0.793938878
2005-04-01,0.984778893
2005-07-01,0.756278018
2005-10-01,0.248197873
2006-01-01,1.01902713
2006-04-01,0.600482189
2006-07-01,0.59799998
2006-10-01,0.925841126
2007-01-01,0.554241245
2007-04-01,0.38257957
2007-07-01,0.43929546
2007-10-01,0.294658721
2008-01-01,-0.252665213
2008-04-01,-0.03553182
2008-07-01,-0.971774467
2008-10-01,-1.313503998
2009-01-01,-0.387483995
2009-04-01,-0.470083019
2009-07-01,0.574000955
2009-10-01,0.109328853
2010-01-01,0.671017951
2010-04-01,0.717718191
2010-07-01,0.653143257
2010-10-01,0.875352148
,V1
2000-01-01,7204.0
2000-02-01,7335.0
2000-03-01,7812.0
2000-04-01,7413.0
2000-05-01,9136.0
2000-06-01,8725.0
2000-07-01,8751.0
2000-08-01,9609.0
2000-09-01,8601.0
2000-10-01,8930.0
2000-11-01,8835.0
2000-12-01,11688.0
2001-01-01,8078.0
2001-02-01,7892.0
2001-03-01,8151.0
2001-04-01,8738.0
2001-05-01,9416.0
2001-06-01,9533.0
2001-07-01,9943.0
2001-08-01,10859.0
2001-09-01,8789.0
2001-10-01,9960.0
2001-11-01,9619.0
2001-12-01,12900.0
2002-01-01,8620.0
2002-02-01,8401.0
2002-03-01,8546.0
2002-04-01,10004.0
2002-05-01,10675.0
2002-06-01,10115.0
2002-07-01,11206.0
2002-08-01,11555.0
2002-09-01,10453.0
2002-10-01,10421.0
2002-11-01,9950.0
2002-12-01,13975.0
2003-01-01,9315.0
2003-02-01,9366.0
2003-03-01,9910.0
2003-04-01,10302.0
2003-05-01,11371.0
2003-06-01,11857.0
2003-07-01,12387.0
2003-08-01,12421.0
2003-09-01,12073.0
2003-10-01,11963.0
2003-11-01,10666.0
2003-12-01,15613.0
2004-01-01,10586.0
2004-02-01,10558.0
2004-03-01,12064.0
2004-04-01,11899.0
2004-05-01,12077.0
2004-06-01,13918.0
2004-07-01,13611.0
2004-08-01,14132.0
2004-09-01,13509.0
2004-10-01,13152.0
2004-11-01,13993.0
2004-12-01,18203.0
2005-01-01,14262.0
2005-02-01,13024.0
2005-03-01,14062.0
2005-04-01,14718.0
2005-05-01,16544.0
2005-06-01,16732.0
2005-07-01,16230.0
2005-08-01,18126.0
2005-09-01,16016.0
2005-10-01,15601.0
2005-11-01,15394.0
2005-12-01,20439.0
2006-01-01,14991.0
2006-02-01,14908.0
2006-03-01,17459.0
2006-04-01,14501.0
2006-05-01,18271.0
2006-06-01,17963.0
2006-07-01,17026.0
2006-08-01,18111.0
2006-09-01,15989.0
2006-10-01,16735.0
2006-11-01,15949.0
2006-12-01,20216.0
2007-01-01,16198.0
2007-02-01,15060.0
2007-03-01,16168.0
2007-04-01,16376.0
2007-05-01,18403.0
2007-06-01,19113.0
2007-07-01,19303.0
2007-08-01,20560.0
2007-09-01,16621.0
2007-10-01,18788.0
2007-11-01,17970.0
2007-12-01,22464.0
2008-01-01,16658.0
2008-02-01,16214.0
2008-03-01,16043.0
2008-04-01,16418.0
2008-05-01,17644.0
2008-06-01,17705.0
2008-07-01,18107.0
2008-08-01,17975.0
2008-09-01,17598.0
2008-10-01,17658.0
2008-11-01,15750.0
2008-12-01,22414.0
2009-01-01,16065.0
2009-02-01,15467.0
2009-03-01,16297.0
2009-04-01,16530.0
2009-05-01,18410.0
2009-06-01,20274.0
2009-07-01,21311.0
2009-08-01,20991.0
2009-09-01,18305.0
2009-10-01,17832.0
2009-11-01,18223.0
2009-12-01,23987.0
2010-01-01,15964.0
2010-02-01,16606.0
2010-03-01,19216.0
2010-04-01,16419.0
2010-05-01,19638.0
2010-06-01,19773.0
2010-07-01,21052.0
2010-08-01,22011.0
2010-09-01,19039.0
2010-10-01,17893.0
2010-11-01,19276.0
2010-12-01,25167.0
2011-01-01,16699.0
2011-02-01,16619.0
2011-03-01,17851.0
2011-04-01,18160.0
2011-05-01,22032.0
2011-06-01,21395.0
2011-07-01,22217.0
2011-08-01,24565.0
2011-09-01,21095.0
2011-10-01,20114.0
2011-11-01,19931.0
2011-12-01,26120.0
2012-01-01,18580.0
2012-02-01,18492.0
2012-03-01,19724.0
2012-04-01,20123.0
2012-05-01,22582.0
2012-06-01,22595.0
2012-07-01,23379.0
2012-08-01,24920.0
2012-09-01,20325.0
2012-10-01,22038.0
2012-11-01,20988.0
2012-12-01,26675.0
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AOS1 - TP7\n",
"## Time series analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Qu1\n",
"\n",
"Load the dataset consumption.csv\n",
"\n",
"We want to index by time and not numbered id."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ts = pd.read_csv('data/consumption.csv', index_col=0, parse_dates=True)\n",
"\n",
"# Fast check the data\n",
"assert(ts.shape == (164, 1))\n",
"assert(type(ts.index) is pd.core.indexes.datetimes.DatetimeIndex)\n",
"\n",
"# Set the frequency of the time series as quarterly\n",
"ts.index.freq = \"QS\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Qu 2\n",
"\n",
"Display the time series"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.plot(ts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The time series does not seem to follow a trend.\n",
"It looks like it is stationary.\n",
"\n",
"### Question 3\n",
"\n",
"We use a test to verify the stationarity of the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from statsmodels.tsa.stattools import adfuller\n",
"\n",
"res = adfuller(ts['V1'])\n",
"\n",
"print(f'p-value of the test : {res[1]}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can conclude from the small p-value that the Dickey-Fuller test **rejects** the null hypothesis of a unit root.\n",
"This means that there is no \"unit root\", so the time series is stationary."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question 4\n",
"\n",
"Plot the ACF and PACF of the time series."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n",
"\n",
"plot_acf(ts)\n",
"_ = plot_pacf(ts, method='ywm') # Discard the return value, otherwise the figure would be displayed a second time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The ACF and PACF are decreasing but not cut off.\n",
"\n",
"The ts might follow an AR(3) model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question 5\n",
"\n",
"Decompose the data into train and test data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"ts_train, ts_test = ts[:'2000-01-01'], ts['2000-01-01':]\n",
"\n",
"print(len(ts_train))\n",
"print(len(ts_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Question 6\n",
"\n",
"Fit an ARIMA model.\n",
"For this, we use sarimax that can handle seasonality and exogenous variables.\n",
"\n",
"From the ACF and PACF plots, we choose p = 3 and q = 0 (an AR(3) model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from statsmodels.tsa.statespace.sarimax import SARIMAX as sarimax\n",
"\n",
"model_fit = sarimax(ts_train, order=(3, 0, 0))\n",
"fit_res = model_fit.fit()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question 7\n",
"\n",
"Diagnostics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.rcParams['figure.figsize'] = [10, 10]\n",
"_ = fit_res.plot_diagnostics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The residuals seem to be normally distributed and uncorrelated.\n",
"\n",
"The model seems to be a good model.\n",
"\n",
"We can try with another model :"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = sarimax(ts_train, order=(1, 0, 0))\n",
"res = model.fit()\n",
"_ = res.plot_diagnostics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here the KDE is strange and not normal and we have a correlation between the residuals. This is not a good model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that the parameters have been learned, we want to make some predictions on the test set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First define the same model on the whole time series\n",
"model_test = sarimax(ts, order=(3, 0, 0))\n",
"\n",
"# Use the learned parameters\n",
"model_test_res = model_test.filter(fit_res.params)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Qu 8\n",
"\n",
"Make predictions on the time series using the model.\n",
"\n",
"We use one-step ahead prediction only so we correct each time with the observation.\n",
"\n",
"i.e. we predict t, then we observe t, then we predict t+1 using the observation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pred = model_test_res.get_prediction(start='2000-01-01')\n",
"pred_ci = pred.conf_int()\n",
"\n",
"print(pred_ci)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Qu 9\n",
"\n",
"Plot the predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ax = ts_test.plot(y=\"V1\", label='Observed', figsize=(10, 6))\n",
"pred.predicted_mean.plot(ax=ax, label='One step ahead forecast', alpha=0.7, linestyle='', marker='o')\n",
"ax.fill_between(pred_ci.index, pred_ci['lower V1'], pred_ci['upper V1'], color='k', alpha=0.2)\n",
"plt.legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Qu 10\n",
"\n",
"Compute the MSE of the predictions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mse = np.mean((pred.predicted_mean - ts_test.V1)**2)\n",
"\n",
"print(mse)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can compare with other models, for ex MA(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_fit = sarimax(ts_train, order=(0, 0, 3))\n",
"res_fit = model_fit.fit()\n",
"\n",
"model_test = sarimax(ts, order=(0, 0, 3))\n",
"model_test_res = model_test.filter(res_fit.params)\n",
"pred = model_test_res.get_prediction(start='2000-01-01')\n",