Commit dd7785d7 authored by Rémy Huet's avatar Rémy Huet 💻
Browse files

Some introduction

parent 8e23baf1
......@@ -5,27 +5,32 @@
"id": "5c8980bd",
"metadata": {},
"source": [
"# AOS1 Problem\n",
"# AOS1 - Assignment\n",
"## Improving the accuracy and speed of support vector machines\n",
"\n",
"## Mathilde Rineau, Remy Huet \n",
"## 17/10/2021\n"
"Authors : Mathilde Rineau, Rémy Huet\n",
"\n",
"### Abstract\n",
"\n",
"The paper \"Improving the Accuracy and Speed of Support Vector Machines\" by Burges and Schölkopf is investigating a method to improve ht speed an accuracy of a support vector machine.\n",
"\n",
"As the authors say, SVM are wildly used for several applications.\n",
"To improve this method, the authors make the difference between two types of improvements to achieve :\n",
"- improving the generalization performance;\n",
"- improving the speed in test phase.\n",
"\n",
"The authors propose and combine two methods to improve SVM performances : the \"virtual support vector\" method and the \"reduced set\" method.\n",
"With those two improvements, they announce a machine much faster (22 times than the original one) and more precise (1.1% vs 1.4% error) than the original one.\n",
"\n",
"In this work, we will describe and program the two techniques they are used to see if these method are working as they say."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "9f152334",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(70000, 784)\n",
"(70000,)\n"
]
}
],
"outputs": [],
"source": [
"#We will work on the mnist data set\n",
"#We load it from fetch_openml\n",
......@@ -42,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "4d3fa1c7",
"metadata": {},
"outputs": [],
......@@ -56,21 +61,10 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "d809fc87",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SVC(C=10, degree=5, kernel='poly')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"#First, we perform a SVC without preprocessing or improving in terms of accuracy or speed\n",
"from sklearn.svm import SVC\n",
......@@ -85,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "8cb28178",
"metadata": {},
"outputs": [],
......@@ -96,27 +90,10 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "c1248238",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 923 1 2 0 0 2 3 1 3 0]\n",
" [ 0 1157 4 1 0 1 1 3 2 0]\n",
" [ 7 10 925 4 0 0 5 2 1 0]\n",
" [ 3 7 3 1000 0 10 0 0 7 5]\n",
" [ 1 11 5 1 952 0 1 0 3 8]\n",
" [ 6 9 1 8 0 875 3 1 3 1]\n",
" [ 7 8 0 0 2 7 952 0 1 0]\n",
" [ 1 7 5 1 1 1 0 1070 2 11]\n",
" [ 3 8 4 8 0 10 0 2 905 4]\n",
" [ 2 6 2 5 6 3 0 11 6 957]]\n"
]
}
],
"outputs": [],
"source": [
"#We compute the confusion matrix \n",
"print(confusion_matrix(y_test, y_pred))"
......@@ -124,34 +101,10 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "ba4e38ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 935\n",
" 1 0.95 0.99 0.97 1169\n",
" 2 0.97 0.97 0.97 954\n",
" 3 0.97 0.97 0.97 1035\n",
" 4 0.99 0.97 0.98 982\n",
" 5 0.96 0.96 0.96 907\n",
" 6 0.99 0.97 0.98 977\n",
" 7 0.98 0.97 0.98 1099\n",
" 8 0.97 0.96 0.96 944\n",
" 9 0.97 0.96 0.96 998\n",
"\n",
" accuracy 0.97 10000\n",
" macro avg 0.97 0.97 0.97 10000\n",
"weighted avg 0.97 0.97 0.97 10000\n",
"\n"
]
}
],
"outputs": [],
"source": [
"#We print the classification report\n",
"print(classification_report(y_test, y_pred))"
......@@ -159,19 +112,10 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "947b0895",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9716\n",
"Error rate: 2.839999999999998 %\n"
]
}
],
"outputs": [],
"source": [
"#We print the accuracy of the SVC and the error rate \n",
"print(\"Accuracy: \",accuracy_score(y_test, y_pred))\n",
......@@ -180,27 +124,10 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "81b09df7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 0. 0. ... 0. 0. 0.]\n",
" [0. 0. 0. ... 0. 0. 0.]\n",
" [0. 0. 0. ... 0. 0. 0.]\n",
" ...\n",
" [0. 0. 0. ... 0. 0. 0.]\n",
" [0. 0. 0. ... 0. 0. 0.]\n",
" [0. 0. 0. ... 0. 0. 0.]]\n",
"(8164, 784)\n",
"23.246173469387756\n",
"0.0\n"
]
}
],
"outputs": [],
"source": [
"#We then generated new training data by translating the resulting support vectors \n",
"#by one pixel in each of four directions\n",
......@@ -213,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"id": "0e648133",
"metadata": {},
"outputs": [],
......@@ -234,19 +161,10 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"id": "aa5535c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[2 3 4 5 1]\n",
" [2 3 4 4 1]]\n"
]
}
],
"outputs": [],
"source": [
"m = []\n",
"m.append([1,2,3,4,5])\n",
......@@ -272,9 +190,11 @@
}
],
"metadata": {
"interpreter": {
"hash": "78ff2a7d75990e26f7862f23aec114522929670ec71bbfd9a70bdb18a9100993"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"display_name": "Python 3.9.7 64-bit ('AOS1-3HAiNONq': pipenv)",
"name": "python3"
},
"language_info": {
......@@ -286,8 +206,7 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
......
%% Cell type:markdown id:5c8980bd tags:
# AOS1 Problem
# AOS1 - Assignment
## Improving the accuracy and speed of support vector machines
## Mathilde Rineau, Remy Huet
## 17/10/2021
Authors : Mathilde Rineau, Rémy Huet
### Abstract
The paper "Improving the Accuracy and Speed of Support Vector Machines" by Burges and Schölkopf investigates a method to improve the speed and accuracy of a support vector machine.
As the authors say, SVMs are widely used in many applications.
To improve this method, the authors distinguish between two types of improvement:
- improving the generalization performance;
- improving the speed in test phase.
The authors propose and combine two methods to improve SVM performances : the "virtual support vector" method and the "reduced set" method.
With those two improvements, they report a machine that is much faster (22 times faster) and more accurate (1.1% vs 1.4% error) than the original one.
In this work, we will describe and implement the two techniques they use, to see whether these methods work as they claim.
%% Cell type:code id:9f152334 tags:
``` python
```
# We work on the MNIST data set, downloaded from OpenML through
# scikit-learn's fetch_openml helper.
from sklearn.datasets import fetch_openml
import pandas as pd
import matplotlib.pyplot as plt
# as_frame=False together with return_X_y=True yields the data and the
# targets as two separate array-like objects instead of a DataFrame bunch.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
# Print the characteristics (shapes) of X and y.
print(X.shape)
print(y.shape)
```
%%%% Output: stream
(70000, 784)
(70000,)
%% Cell type:code id:4d3fa1c7 tags:
``` python
```
# Split the data set in two parts: a training set and a test set.
# Following the recommended values, the training set holds 60000 samples
# and the test set holds 10000 samples.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs - confirm whether reproducible results are needed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=60000, test_size=10000)
```
%% Cell type:code id:d809fc87 tags:
``` python
```
# Baseline: fit an SVC without any preprocessing and without any of the
# accuracy or speed improvements studied later.
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# The SVC uses the hyperparameter C=10 and a polynomial kernel of degree 5,
# following the recommendations.
svc = SVC(C=10, kernel = 'poly', degree = 5)
svc.fit(X_train, y_train)
```
%%%% Output: execute_result
SVC(C=10, degree=5, kernel='poly')
%% Cell type:code id:8cb28178 tags:
``` python
```
# Predict the labels of the test set with the fitted baseline SVC.
y_pred = svc.predict(X_test)
```
%% Cell type:code id:c1248238 tags:
``` python
```
# Compute and print the confusion matrix of the test-set predictions.
print(confusion_matrix(y_test, y_pred))
```
%%%% Output: stream
[[ 923 1 2 0 0 2 3 1 3 0]
[ 0 1157 4 1 0 1 1 3 2 0]
[ 7 10 925 4 0 0 5 2 1 0]
[ 3 7 3 1000 0 10 0 0 7 5]
[ 1 11 5 1 952 0 1 0 3 8]
[ 6 9 1 8 0 875 3 1 3 1]
[ 7 8 0 0 2 7 952 0 1 0]
[ 1 7 5 1 1 1 0 1070 2 11]
[ 3 8 4 8 0 10 0 2 905 4]
[ 2 6 2 5 6 3 0 11 6 957]]
%% Cell type:code id:ba4e38ac tags:
``` python
```
# Print the classification report of the test-set predictions.
print(classification_report(y_test, y_pred))
```
%%%% Output: stream
precision recall f1-score support
0 0.97 0.99 0.98 935
1 0.95 0.99 0.97 1169
2 0.97 0.97 0.97 954
3 0.97 0.97 0.97 1035
4 0.99 0.97 0.98 982
5 0.96 0.96 0.96 907
6 0.99 0.97 0.98 977
7 0.98 0.97 0.98 1099
8 0.97 0.96 0.96 944
9 0.97 0.96 0.96 998
accuracy 0.97 10000
macro avg 0.97 0.97 0.97 10000
weighted avg 0.97 0.97 0.97 10000
%% Cell type:code id:947b0895 tags:
``` python
```
# Print the accuracy of the baseline SVC on the test set, followed by the
# corresponding error rate (1 - accuracy) expressed as a percentage.
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("Error rate: ",(1-accuracy_score(y_test, y_pred))*100,"%")
```
%%%% Output: stream
Accuracy: 0.9716
Error rate: 2.839999999999998 %
%% Cell type:code id:81b09df7 tags:
``` python
```
# Next step: generate new training data by translating the resulting
# support vectors by one pixel in each of the four directions.
# This cell only inspects the support vectors of the fitted SVC: it prints
# the array, its shape, the mean of the first support vector and the value
# at index 1 of the first support vector.
import numpy as np
print(svc.support_vectors_)
print(svc.support_vectors_.shape)
print(np.mean(svc.support_vectors_[0]))
print(svc.support_vectors_[0][1])
```
%%%% Output: stream
[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
(8164, 784)
23.246173469387756
0.0
%% Cell type:code id:0e648133 tags:
``` python
```
def right_side_rescaling(support_vectors):
    """Cyclically shift all values of a 2-D array one position to the left.

    The array is treated as a single flat, row-major sequence: every element
    takes the value of its successor and the very first element wraps around
    to the very last position. The result has the same shape and dtype as
    the input.

    The previous hand-written loop stopped one element too early, leaving
    the second-to-last value unshifted, and it overwrote the caller's array
    through a reshape view. This version shifts every element and returns a
    new array, leaving the input untouched.

    Parameters
    ----------
    support_vectors : numpy.ndarray
        Two-dimensional array of shape (n, m). Empty arrays are accepted.

    Returns
    -------
    numpy.ndarray
        New array of shape (n, m) holding the shifted values.

    Raises
    ------
    ValueError
        If ``support_vectors`` is not two-dimensional.
    """
    # Unpacking enforces the 2-D contract, exactly as the original did.
    n, m = support_vectors.shape
    # With no axis given, np.roll shifts the flattened array; the reshape
    # keeps the (n, m) shape of the return value explicit.
    return np.roll(support_vectors, -1).reshape(n, m)
```
%% Cell type:code id:aa5535c9 tags:
``` python
```
# Quick sanity check of right_side_rescaling on a small 2x5 array whose
# two rows are identical.
m = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
shifted = right_side_rescaling(np.array(m))
print(shifted)
```
%%%% Output: stream
[[2 3 4 5 1]
[2 3 4 4 1]]
%% Cell type:code id:21db8ae3 tags:
``` python
```
```
%% Cell type:code id:9bb8ab5a tags:
``` python
```
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment