Commit 3a023ebf authored by Mathilde Rineau's avatar Mathilde Rineau 🙂
Browse files

Update Devoir2.ipynb

parent 84c5a384
......@@ -78,7 +78,29 @@
" \n",
" y = np.mean(X, axis=1)\n",
"\n",
" return X, y\n"
" return X, y\n",
"# /!\\ THIS IS A THIRD TEST VERSION, COULD (AND WILL CERTAINLY) CHANGE\n",
"\n",
"\n",
"import random\n",
"def generate_data_2(n_samples, n_features):\n",
" X = []\n",
" y = np.ndarray((n_samples,))\n",
" X.append(np.random.geometric(p = 0.5, size = n_features))\n",
" sum_X = np.ndarray((n_features,))\n",
" for i in range(n_samples):\n",
" p = random.random()\n",
" temp = np.random.geometric(p = p, size = n_features)\n",
" #print(temp)\n",
" sum_X = sum_X + temp\n",
" #print(sum_X)\n",
" X.append(sum_X)\n",
" X = np.array(X)\n",
" \n",
" \n",
" y = np.mean(X, axis=1)\n",
"\n",
" return X, y"
]
},
{
......@@ -135,6 +157,30 @@
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Arrays to store results\n",
"n_selected = []\n",
"\n",
"# We test our regression n_test time\n",
"for i in range(n_tests):\n",
" # Generate the data\n",
" X, y = generate_data_2(n_samples, n_features,)\n",
" # Fit the model (pipeline with the data)\n",
" model.fit(X, y)\n",
" # We can now retrieve selected features :\n",
" selected_features = lasso.coef_ != 0\n",
" n_selected.append(np.count_nonzero(selected_features))\n",
"\n",
"uniq, count = np.unique(n_selected, return_counts=True)\n",
"plt.bar(uniq, count, label='Number of selected features per training')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -177,7 +223,41 @@
"\n",
"uniq, count = np.unique(n_selected, return_counts=True)\n",
"print(f'Features selected : {uniq}, count : {count}')\n",
"print(f'Number of time fist feature was ignored : {zero_removed}')"
"print(f'Number of time fist feature was ignored : {zero_removed}')\n",
"plt.bar(uniq, count, label='Number of selected features per training')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Arrays to store results\n",
"n_selected = []\n",
"zero_removed = 0\n",
"\n",
"# We test our regression n_test time\n",
"for i in range(n_tests):\n",
" # Generate the data\n",
" X, y = generate_data_2(n_samples, n_features)\n",
" # Fit the model (pipeline with the data)\n",
" model.fit(X, y)\n",
" # We can now retrieve selected features :\n",
" selected_features = elastic_net.coef_ != 0\n",
" n_selected.append(np.count_nonzero(selected_features))\n",
"\n",
" # Fastly show that we always remove X[0]\n",
" if not selected_features[0]:\n",
" zero_removed += 1\n",
"\n",
"\n",
"uniq, count = np.unique(n_selected, return_counts=True)\n",
"print(f'Features selected : {uniq}, count : {count}')\n",
"print(f'Number of time fist feature was ignored : {zero_removed}')\n",
"plt.bar(uniq, count, label='Number of selected features per training')\n",
"plt.show()"
]
},
{
......@@ -194,6 +274,13 @@
"\n",
"It is **like** the elastic net « found » that each $X[i], i > 0$ were generated from $X[0]$ but did not « found » a link between the elements.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
......@@ -201,7 +288,8 @@
"hash": "3abb0a1ef4892304d86bb3a3dfd052bcca35057beadba016173999c775e8d3ba"
},
"kernelspec": {
"display_name": "Python 3.9.7 64-bit ('AOS1-QteoCFsS': pipenv)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
......@@ -213,7 +301,8 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
......
%% Cell type:markdown id: tags:
# AOS1 assignment
## Make elastic net outshine the Lasso
authors : Mathilde Rineau, Rémy Huet
### Introduction
The aim of this work is to demonstrate experimentally that the elastic net regularization outshines the Lasso regularization in some cases.
We know that the Lasso regularization may be unstable when used on highly correlated data.
Indeed, the Lasso regularization may lead to ignore some features (by setting their weight in the regression to 0).
When the data is highly correlated, small changes in the sample could lead to changes in the selection of the features (what we call instability).
At the opposite, elastic net regression should be able to ignore some features but with more stability than Lasso.
In this work, we will construct a dataset with highly correlated data to demonstrate that.
%% Cell type:code id: tags:
```
``` python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
```
%% Cell type:markdown id: tags:
### Data generation
First, we will generate highly correlated data, containing a sample X (multidim) and a target y (one dim).
We write a function for this.
Its parameters are :
- n_samples the number of samples
- n_features the number of features in X
- m, s the parameters of the normal law used for the generation of the first feature
and the outputs X and y
For this purpose, we will proceed in several steps :
- First, we will generate the first dimension of X randomly from a normal law (m, s)
- For the other dimensions of X, noted i, the value will be calculated as follow :
- We generate a number from a normal law N(i, 1)
- We add it to the value of the first column
- For Y, the value is calculated as the mean of the values we generated for X
%% Cell type:code id: tags:
```
``` python
# /!\ THIS IS A THIRD TEST VERSION, COULD (AND WILL CERTAINLY) CHANGE
def generate_data(n_samples, n_features, m, s):
    """Build a highly correlated dataset.

    The first feature of every sample is drawn from N(m, s); each other
    feature is that first value plus N(i / 2, 1) noise, where i is the
    sample index.  The target is the per-sample mean of the features.

    Returns (X, y) with X of shape (n_samples, n_features).
    """
    X = np.empty((n_samples, n_features))
    for i, row in enumerate(X):
        row[0] = np.random.normal(m, s)
        for j in range(1, n_features):
            row[j] = row[0] + np.random.normal(i / 2, 1)
    return X, X.mean(axis=1)
# /!\ THIS IS A THIRD TEST VERSION, COULD (AND WILL CERTAINLY) CHANGE
import random


def generate_data_2(n_samples, n_features):
    """Generate correlated data as cumulative sums of geometric draws.

    The first row of X is a geometric(p=0.5) draw; each following row is
    the running sum of geometric draws whose success probability p is
    picked uniformly at random per row.  The target y is the per-row mean.

    NOTE(review): X ends up with n_samples + 1 rows (the base row plus
    n_samples cumulative rows) — kept as-is, verify this is intended.

    Returns (X, y).
    """
    X = [np.random.geometric(p=0.5, size=n_features)]
    # BUG FIX: np.ndarray((n_features,)) returns UNINITIALIZED memory, so
    # every cumulative sum started from garbage values.  Start from zeros.
    sum_X = np.zeros((n_features,))
    for _ in range(n_samples):
        p = random.random()
        # Each iteration rebinds sum_X, so appended rows stay distinct.
        sum_X = sum_X + np.random.geometric(p=p, size=n_features)
        X.append(sum_X)
    X = np.array(X)
    y = np.mean(X, axis=1)
    return X, y
```
%% Cell type:markdown id: tags:
### Demonstrate instability of Lasso
Now that we have a way to generate highly correlated data, we will use a lasso regression on it.
The aim of this part is to demonstrate the instability of the Lasso regression on this data.
For this purpose, we will use a loop to generate several datasets using **the same params** on our `generate_data` function.
We will demonstrate instability by counting the number of selected features each time, and register which features are selected.
**Note :** the model auto-correct intercept by default `fit_intercept=True`.
The parameter `normalize` is deprecated, so we use a pipeline to normalize the data before the regression, as suggested in the deprecation message. We set the `with_mean` parameter to `False` because we just scale the data and do not center it, we leave this work to the Lasso.
%% Cell type:code id: tags:
```
``` python
# Params for data generation:
n_samples = 500
n_features = 50
m = 30
s = 3

# Number of tests
n_tests = 100

# Scale only (with_mean=False): centering is left to the Lasso itself.
standard_scaler = StandardScaler(with_mean=False)
lasso = Lasso(alpha=1.0, fit_intercept=True, max_iter=5000)
model = make_pipeline(standard_scaler, lasso)

# Arrays to store results
n_selected = []

# We test our regression n_tests times
for i in range(n_tests):
    # Generate a fresh dataset with the same parameters
    X, y = generate_data(n_samples, n_features, m, s)
    # Fit the model (pipeline) on the data
    model.fit(X, y)
    # Features whose coefficient survived the l1 penalty
    selected_features = lasso.coef_ != 0
    n_selected.append(np.count_nonzero(selected_features))

uniq, count = np.unique(n_selected, return_counts=True)
plt.bar(uniq, count, label='Number of selected features per training')
plt.legend()  # FIX: the bar label was set but never displayed
plt.show()
```
%% Cell type:code id: tags:
``` python
# Arrays to store results
n_selected = []

# We test our regression n_tests times
for i in range(n_tests):
    # Generate the data (FIX: removed stray trailing comma in the call)
    X, y = generate_data_2(n_samples, n_features)
    # Fit the model (pipeline) on the data
    model.fit(X, y)
    # Features whose coefficient survived the l1 penalty
    selected_features = lasso.coef_ != 0
    n_selected.append(np.count_nonzero(selected_features))

uniq, count = np.unique(n_selected, return_counts=True)
plt.bar(uniq, count, label='Number of selected features per training')
plt.legend()  # FIX: the bar label was set but never displayed
plt.show()
```
%% Cell type:markdown id: tags:
### Demonstrate stability of elastic net
Next, we run the same test with an elastic net regression model to highlight the difference between the two methods
%% Cell type:code id: tags:
```
``` python
# We use the same alpha as the lasso regression.
# Assume we really want to select features: give priority to the l1 term.
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.8, fit_intercept=True, max_iter=10000)
model = make_pipeline(standard_scaler, elastic_net)

# Arrays to store results
n_selected = []
zero_removed = 0

# We test our regression n_tests times
for i in range(n_tests):
    # Generate the data
    X, y = generate_data(n_samples, n_features, m, s)
    # Fit the model (pipeline) on the data
    model.fit(X, y)
    # Features whose coefficient survived the penalty
    selected_features = elastic_net.coef_ != 0
    n_selected.append(np.count_nonzero(selected_features))

    # Quickly check whether the first feature is systematically dropped
    if not selected_features[0]:
        zero_removed += 1

uniq, count = np.unique(n_selected, return_counts=True)
print(f'Features selected : {uniq}, count : {count}')
# FIX: typos in user-facing message ("fist" -> "first", "time" -> "times")
print(f'Number of times the first feature was ignored : {zero_removed}')
plt.bar(uniq, count, label='Number of selected features per training')
plt.legend()  # FIX: the bar label was set but never displayed
plt.show()
```
%% Cell type:code id: tags:
``` python
# Arrays to store results
n_selected = []
zero_removed = 0

# We test our regression n_tests times
for i in range(n_tests):
    # Generate the data
    X, y = generate_data_2(n_samples, n_features)
    # Fit the model (pipeline) on the data
    model.fit(X, y)
    # Features whose coefficient survived the penalty
    selected_features = elastic_net.coef_ != 0
    n_selected.append(np.count_nonzero(selected_features))

    # Quickly check whether the first feature is systematically dropped
    if not selected_features[0]:
        zero_removed += 1

uniq, count = np.unique(n_selected, return_counts=True)
print(f'Features selected : {uniq}, count : {count}')
# FIX: typos in user-facing message ("fist" -> "first", "time" -> "times")
print(f'Number of times the first feature was ignored : {zero_removed}')
plt.bar(uniq, count, label='Number of selected features per training')
plt.legend()  # FIX: the bar label was set but never displayed
plt.show()
```
%% Cell type:markdown id: tags:
Notes : **might change if we find a better dataset**
- Instability of Lasso is proved
- Stability of elastic_net is OK for this sample.
BUT :
- Feature selection w/ elastic net for this sample is not satisfying (we only remove the first one)
It is **like** the elastic net « found » that each $X[i], i > 0$ were generated from $X[0]$ but did not « found » a link between the elements.
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment