Commit c685a2d1 authored by TheophilePACE's avatar TheophilePACE

Correction Jour 0

parent 0a071b58
......@@ -291,7 +291,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
"version": "3.6.6"
}
},
"nbformat": 4,
......
......@@ -17,10 +17,83 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import numpy as np\n",
"from sklearn import datasets\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"breast_cancer = datasets.load_breast_cancer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X = breast_cancer.data\n",
"y = breast_cancer.target\n",
"feature_names = breast_cancer.feature_names"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(569, 30) (569,)\n",
"[0 0 0 0 0]\n",
"['mean radius' 'mean texture' 'mean perimeter' 'mean area'\n",
" 'mean smoothness' 'mean compactness' 'mean concavity'\n",
" 'mean concave points' 'mean symmetry' 'mean fractal dimension'\n",
" 'radius error' 'texture error' 'perimeter error' 'area error'\n",
" 'smoothness error' 'compactness error' 'concavity error'\n",
" 'concave points error' 'symmetry error' 'fractal dimension error'\n",
" 'worst radius' 'worst texture' 'worst perimeter' 'worst area'\n",
" 'worst smoothness' 'worst compactness' 'worst concavity'\n",
" 'worst concave points' 'worst symmetry' 'worst fractal dimension']\n"
]
}
],
"source": [
"print(X.shape, y.shape)\n",
"print(y[:5])\n",
"print(feature_names)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X[1][:5]"
]
},
{
"cell_type": "markdown",
......@@ -41,15 +114,29 @@
"\n",
"Pourquoi est-ce nécessaire?\n",
"\n",
"Pour cela, utilisez la fonction scikit-learn `sklearn.model_selection.train_test_split`. Importez cette méthode, appliquer là à nos données."
"Pour cela, utilisez la fonction scikit-learn `sklearn.model_selection.train_test_split`. Importez cette méthode et appliquez-la à nos données.\n",
"\n",
"On utilise 2 fois `train_test_split`, afin de séparer 2 fois l'ensemble : une première fois entre train+validation d'une part et test d'autre part, une seconde fois entre train et validation."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": []
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"X_tv,X_test, y_tv,y_test = train_test_split(X,y,test_size=.2, random_state=42)\n",
"X_train,X_validation,y_train,y_validation = train_test_split(X_tv,y_tv,test_size=.25,random_state=42)"
]
},
{
"cell_type": "markdown",
......@@ -61,24 +148,165 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": []
"source": [
"from sklearn.neighbors import KNeighborsClassifier \n",
"from sklearn.metrics import confusion_matrix, accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 0.9298245614035088\n",
"[[40 4]\n",
" [ 4 66]]\n",
"4 0.9210526315789473\n",
"[[40 4]\n",
" [ 5 65]]\n",
"7 0.9385964912280702\n",
"[[40 4]\n",
" [ 3 67]]\n",
"10 0.9385964912280702\n",
"[[40 4]\n",
" [ 3 67]]\n",
"13 0.9298245614035088\n",
"[[39 5]\n",
" [ 3 67]]\n",
"16 0.9210526315789473\n",
"[[39 5]\n",
" [ 4 66]]\n",
"19 0.9298245614035088\n",
"[[39 5]\n",
" [ 3 67]]\n"
]
}
],
"source": [
"# hyperparameter\n",
"K_max = 20\n",
"for K in range(1,K_max,3):\n",
" # declare classifier with hyperparameters\n",
" knn = KNeighborsClassifier(n_neighbors=K)\n",
" # train (aka fit) the classifier on the train dataset\n",
" knn.fit(X_train,y_train)\n",
" # predict the validation dataset\n",
" y_validation_hat = knn.predict(X_validation)\n",
" # check the result\n",
" print(K,accuracy_score(y_pred=y_validation_hat,y_true=y_validation))\n",
" print(confusion_matrix(y_pred=y_validation_hat,y_true=y_validation))\n",
" # Now, adjust hyperparameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Comment choisir K? Essayez différents K, regardez les résultats."
"Comment choisir K? Essayez différents K, regardez les résultats.\n",
"\n",
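"Notre objectif est de minimiser le taux d'erreur (1 - accuracy), ce qui revient à maximiser l'accuracy. On va donc tracer l'accuracy en fonction de K, et choisir le K pour lequel elle est la plus élevée (c'est-à-dire pour lequel l'erreur est la plus faible) :"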
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x10cea2748>]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# hyperparameter\n",
"K_max = len(X_train)\n",
"accuracies = []\n",
"for K in range(1,K_max,1):\n",
" # declare classifier with hyperparameters\n",
" knn = KNeighborsClassifier(n_neighbors=K)\n",
" # train (aka fit) the classifier on the train dataset\n",
" knn.fit(X_train,y_train)\n",
" # predict the validation dataset\n",
" y_validation_hat = knn.predict(X_validation)\n",
" # check the result\n",
" accuracies.append(accuracy_score(y_pred=y_validation_hat,y_true=y_validation))\n",
"# si on trace juste le tableau, on sera décalé de 1\n",
"plt.plot(range(1,K_max),accuracies)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x10ce03748>]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot(range(1,50),accuracies[:49])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.9385964912280702, 3)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# On cherche le k ayant la precision maximale: argument du maximum, + 1 car les index sont décalés de 1\n",
"np.max(accuracies),np.argmax(accuracies)+1"
]
},
{
"cell_type": "markdown",
......@@ -93,11 +321,168 @@
"Bref, commencez par importer le NBC depuis scikit-learn. https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mInit signature:\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpriors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvar_smoothing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1e-09\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m \n",
"Gaussian Naive Bayes (GaussianNB)\n",
"\n",
"Can perform online updates to model parameters via `partial_fit` method.\n",
"For details on algorithm used to update feature means and variance online,\n",
"see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n",
"\n",
" http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n",
"\n",
"Read more in the :ref:`User Guide <gaussian_naive_bayes>`.\n",
"\n",
"Parameters\n",
"----------\n",
"priors : array-like, shape (n_classes,)\n",
" Prior probabilities of the classes. If specified the priors are not\n",
" adjusted according to the data.\n",
"\n",
"var_smoothing : float, optional (default=1e-9)\n",
" Portion of the largest variance of all features that is added to\n",
" variances for calculation stability.\n",
"\n",
"Attributes\n",
"----------\n",
"class_prior_ : array, shape (n_classes,)\n",
" probability of each class.\n",
"\n",
"class_count_ : array, shape (n_classes,)\n",
" number of training samples observed in each class.\n",
"\n",
"theta_ : array, shape (n_classes, n_features)\n",
" mean of each feature per class\n",
"\n",
"sigma_ : array, shape (n_classes, n_features)\n",
" variance of each feature per class\n",
"\n",
"epsilon_ : float\n",
" absolute additive value to variances\n",
"\n",
"Examples\n",
"--------\n",
">>> import numpy as np\n",
">>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n",
">>> Y = np.array([1, 1, 1, 2, 2, 2])\n",
">>> from sklearn.naive_bayes import GaussianNB\n",
">>> clf = GaussianNB()\n",
">>> clf.fit(X, Y)\n",
"GaussianNB(priors=None, var_smoothing=1e-09)\n",
">>> print(clf.predict([[-0.8, -1]]))\n",
"[1]\n",
">>> clf_pf = GaussianNB()\n",
">>> clf_pf.partial_fit(X, Y, np.unique(Y))\n",
"GaussianNB(priors=None, var_smoothing=1e-09)\n",
">>> print(clf_pf.predict([[-0.8, -1]]))\n",
"[1]\n",
"\u001b[0;31mFile:\u001b[0m ~/miniconda3/lib/python3.6/site-packages/sklearn/naive_bayes.py\n",
"\u001b[0;31mType:\u001b[0m ABCMeta\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"# no hyperparameter to tune here; just display the class documentation.\n",
"# Keep the `?` query on its own: combined with an assignment, IPython shows the\n",
"# help and queues the assignment as a new input cell instead of running it.\n",
"GaussianNB?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# instantiate the classifier: the bare class object cannot be fitted\n",
"nbc = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[40 4]\n",
" [ 5 65]]\n",
"0.9210526315789473\n"
]
}
],
"source": [
"clf = GaussianNB()\n",
"clf.fit(X=X_train,y=y_train)\n",
"y_validation_hat = clf.predict(X_validation)\n",
"print(confusion_matrix(y_pred=y_validation_hat, y_true=y_validation))\n",
"print(accuracy_score(y_pred=y_validation_hat, y_true=y_validation))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Consultez la doc pour connaître les arguments demandés. Utilisez cet algorithme sur le jeu de données `boston`."
"__On compare toujours les modèles sur l'ensemble de test!__\n",
"Pour chaque modèle :\n",
"- On entraîne sur X_train\n",
"- On prédit X_validation, on ajuste ses paramètres\n",
"- On réentraîne sur X_train\n",
"- On prédit X_validation, et ainsi de suite tant que le résultat n'est pas satisfaisant\n",
"- __Finalement, une seule fois__, on lance sur X_test. \n",
"- On compare avec les autres modèles"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"K-NN With k= 6\n",
"0.956140350877193\n",
"[[42 1]\n",
" [ 4 67]]\n",
"NBC\n",
"0.9649122807017544\n",
"[[40 3]\n",
" [ 1 70]]\n"
]
}
],
"source": [
"# MODEL COMPARISON on the held-out test set\n",
"# K-NN\n",
"# single constant so the classifier and the printed label cannot drift apart\n",
"# NOTE(review): the validation sweep above reports K=3 as the argmax -- confirm 6 is intended\n",
"K_best = 6\n",
"# declare classifier with hyperparameters\n",
"knn = KNeighborsClassifier(n_neighbors=K_best)\n",
"# train (aka fit) the classifier on the train dataset\n",
"knn.fit(X_train,y_train)\n",
"# predict the test dataset (not the validation one)\n",
"y_test_hat_knn = knn.predict(X_test)\n",
"# check the result\n",
"print(\"K-NN With k=\", K_best)\n",
"print(accuracy_score(y_pred=y_test_hat_knn,y_true=y_test))\n",
"print(confusion_matrix(y_pred=y_test_hat_knn,y_true=y_test))\n",
"\n",
"# NBC\n",
"# `clf` is the GaussianNB model fitted on X_train in the earlier cell\n",
"y_test_hat_nbc = clf.predict(X_test)\n",
"print(\"NBC\")\n",
"print(accuracy_score(y_pred=y_test_hat_nbc,y_true=y_test))\n",
"print(confusion_matrix(y_pred=y_test_hat_nbc,y_true=y_test))"
]
},
{
......@@ -122,7 +507,47 @@
"\n",
"Et, de manière assez inattendue... Scikit propose une implémentation de la régression logistique. La doc est consultable ici: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
"\n",
"Appliquez la régression logistique toujours sur les données `boston`."
"Appliquez la régression logistique toujours sur les données `breast_cancer`."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9649122807017544\n",
"[[42 2]\n",
" [ 2 68]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/theophilepace/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
" FutureWarning)\n"
]
}
],
"source": [
"# Pin the solver explicitly: the implicit default is about to change (see the\n",
"# FutureWarning emitted by this scikit-learn version); 'liblinear' is the\n",
"# pre-0.22 default, so results stay the same and the warning disappears.\n",
"logreg = LogisticRegression(solver='liblinear')\n",
"logreg.fit(X_train,y_train)\n",
"# evaluate on the validation split\n",
"y_hat = logreg.predict(X_validation)\n",
"print(accuracy_score(y_pred=y_hat,y_true=y_validation))\n",
"print(confusion_matrix(y_pred=y_hat,y_true=y_validation))"
]
},
{
......@@ -132,6 +557,27 @@
"Quels sont vos résultats (calculer l'accuracy) ? Sont-ils meilleurs que pour le NBC?"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.956140350877193\n",
"[[39 4]\n",
" [ 1 70]]\n"
]
}
],
"source": [
"pred= logreg.predict(X_test)\n",
"print(accuracy_score(y_pred=pred,y_true=y_test))\n",
"print(confusion_matrix(y_pred=pred,y_true=y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -139,6 +585,42 @@
"Le principal avantage de la régression logistique est son interprétabilité, grâce aux poids. Quelles sont les features qui vous ont permis de discriminer entre les classes? Regardez et comparez pour cela les poids du vecteur $w$."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 2.16929585e+00, 1.03505165e-01, -1.69351567e-01,\n",
" -4.09229286e-04, -1.26894192e-01, -4.17556456e-01,\n",
" -6.57815732e-01, -3.19770431e-01, -1.87089660e-01,\n",
" -2.83143858e-02, -1.98836785e-02, 1.43346281e+00,\n",
" -2.24578570e-01, -6.55814280e-02, -1.53238693e-02,\n",
" -2.51797663e-02, -7.41128784e-02, -3.60638850e-02,\n",
" -4.08750126e-02, 1.59879611e-03, 1.23695888e+00,\n",
" -3.83174555e-01, -1.67220956e-02, -2.81166697e-02,\n",
" -2.40073214e-01, -1.25834089e+00, -1.67423372e+00,\n",
" -5.79028509e-01, -6.93277983e-01, -1.24763925e-01]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logreg.coef_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -146,6 +628,13 @@
"**Optionnel** Essayez de tracer vos classes dans $R^2$, en utilisant les 2 features les plus discriminantes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -187,10 +676,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.956140350877193\n",
"[[42 2]\n",
" [ 3 67]]\n"
]
}
],
"source": [
"from sklearn.svm import SVC\n",
"svm = SVC(kernel='linear')\n",
"svm.fit(X_train,y_train)\n",
"y_hat = svm.predict(X_validation)\n",
"print(accuracy_score(y_pred=y_hat,y_true=y_validation))\n",
"print(confusion_matrix(y_pred=y_hat,y_true=y_validation))"
]
},
{
"cell_type": "markdown",
......@@ -206,6 +712,50 @@
"On va maintenant utiliser un noyau non linéaire, le `rbf`. Réutilisez votre code précédent, en changeant simplement le kernel."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9298245614035088\n",
"[[39 5]\n",
" [ 3 67]]\n"
]
}
],
"source": [
"svm = SVC(kernel='rbf', gamma='scale')\n",
"svm.fit(X_train,y_train)\n",
"y_hat = svm.predict(X_validation)\n",
"print(accuracy_score(y_pred=y_hat,y_true=y_validation))\n",
"print(confusion_matrix(y_pred=y_hat,y_true=y_validation))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.956140350877193\n",
"[[39 4]\n",
" [ 1 70]]\n"
]
}
],
"source": [
"# Evaluate the RBF SVM itself on the test set: score `y_pred`, not the leftover\n",
"# `pred` variable that still holds the logistic-regression predictions.\n",
"y_pred = svm.predict(X_test)\n",
"print(accuracy_score(y_pred=y_pred,y_true=y_test))\n",
"print(confusion_matrix(y_pred=y_pred,y_true=y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -224,10 +774,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"204 µs ± 6.73 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
"829 µs ± 56.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"# time both models on the same data (X_test) so the comparison is like-for-like\n",
"%timeit clf.predict(X_test)\n",
"%timeit svm.predict(X_test)"
]
},
{
"cell_type": "markdown",
......@@ -274,18 +836,65 @@
"Consulter la doc string."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"ename": "NotImplementedError",
"evalue": "INSTALLEZ GRAPHVIZ",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-43-e8b47aed4505>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDecisionTreeClassifier\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"INSTALLEZ GRAPHVIZ\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;31m# ne vous occupez pas de cette fonction, c'est juste de la visu\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m#!pip install graphviz\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNotImplementedError\u001b[0m: INSTALLEZ GRAPHVIZ"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Visualisation helper only -- not part of the exercise.\n",
"# Show the install hint only when graphviz is actually missing, instead of\n",
"# raising unconditionally (which left visualize_tree undefined for everyone).\n",
"try:\n",
"    from graphviz import Source\n",
"except ImportError as err:\n",
"    raise ImportError(\"INSTALLEZ GRAPHVIZ\") from err\n",
"from sklearn.tree import export_graphviz\n",
"from IPython.display import SVG\n",
"\n",
"\n",
"def visualize_tree(clf):\n",
"    \"\"\"Render a fitted decision tree as an SVG object for inline display.\n",
"\n",
"    The tree is exported to Graphviz DOT text, labelled with the breast-cancer\n",
"    feature names and target class names, then piped through Graphviz to SVG.\n",
"    \"\"\"\n",
"    dotefile_string = export_graphviz(clf, out_file=None,feature_names=feature_names, class_names=breast_cancer.target_names)\n",
"    graph = Source(dotefile_string)\n",
"    return SVG(graph.pipe('svg'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [