From b64018c5f706f99fdd33f89890a4e471a57316d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A9lo=C3=AFse=20Chevalier?=
 <heloise.chevalier@etu.utc.fr>
Date: Fri, 19 Oct 2018 12:07:18 +0200
Subject: [PATCH] cross validation regression

---
 cv_regression.r       | 231 ++++++++++++++++++++++++++++++++++++++++++
 "r\303\251gression.r" |  89 ++++++++++++++--
 2 files changed, 309 insertions(+), 11 deletions(-)
 create mode 100644 cv_regression.r

diff --git a/cv_regression.r b/cv_regression.r
new file mode 100644
index 0000000..698890c
--- /dev/null
+++ b/cv_regression.r
@@ -0,0 +1,231 @@
+library(MASS)
+library(leaps)
+library(corrplot)
+library(glmnet)
+
+#chargement des données
+reg.data <- read.table("tp3_a18_reg_app.txt")
+
+x.reg <- reg.data[,1:50]
+y.reg <- reg.data[,"y"]
+
+n = dim(reg.data)[1]
+p = dim(reg.data)[2]
+
+
+models.list <- c("subset13", "subset18", "subset.stepwise15", "subset.stepwise12", "subset.stepwise20", "subset.stepwise22", "subset.stepwise18", "ridge13", "ridge18", "ridge.stepwise15", "ridge.stepwise12", "ridge.stepwise20", "ridge.stepwise22", "ridge.stepwise18", "lasso13", "lasso18", "lasso.stepwise15", "lasso.stepwise12", "lasso.stepwise20", "lasso.stepwise22", "lasso.stepwise18") # Liste des modeles
+models.error <- c()
+trials <- 10 # Nombre d'essais de cross-validation par modèle
+n_folds <- 5 # Nombre de blocs pour la cross-validation
+
+# Test pour chaque modèle
+
+for(model in models.list) # Pour chaque modele
+{
+  models.error[model] <- 0
+  for(i in c(1:trials)) # On fait plusieurs essais
+  {
+    folds <- sample(rep(1:n_folds, length.out=n))
+    for(k in 1:n_folds) # Sur chaque segment de la cross-validation
+    {
+      reg.train <- reg.data[folds!=k,] # training rows: every fold except fold k
+      reg.train.X <- reg.train[,-p] # NOTE(review): never used below -- candidate for removal
+      reg.train.y <- reg.train$y
+      reg.test <- reg.data[folds==k,] # held-out rows: fold k only
+      reg.test.X <- reg.test[,-p] # NOTE(review): never used below -- candidate for removal
+      reg.test.y <- reg.test$y
+      
+      # Apprentissage du modele
+      if(model == "subset13")
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.train)
+      }
+      else if(model == "subset18")
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.train)
+      }
+      else if(model == "subset.stepwise15")
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.train)
+      }
+      else if(model == "subset.stepwise12")
+      {
+        reg.model <- lm(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49 , data = reg.train)
+      }
+      else if(model == "subset.stepwise20")
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train)
+      }
+      else if(model == "subset.stepwise22") # NOTE(review): formula below has 20 terms and is identical to "subset.stepwise12"'s 20-term sibling ("subset.stepwise20") -- confirm the intended 22-predictor set
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train)
+      }
+      else if(model == "subset.stepwise18")
+      {
+        reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, data = reg.train)
+      }
+      else if(model == "ridge13") # NOTE(review): formula below contains 15 predictors (it adds X28 and X31) although the model is named "13" and subset13/lasso13 use 13 -- verify which set was intended
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X28 + X31 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data) # design matrix over all rows, split by fold just below
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge18")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge.stepwise15")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge.stepwise12")
+      {
+        x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+      
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+      
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge.stepwise20")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge.stepwise22") # NOTE(review): formula below has 20 terms, byte-identical to "ridge.stepwise20" -- confirm the intended 22-predictor set
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "ridge.stepwise18")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
+      }
+      else if(model == "lasso13")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso18")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso.stepwise15")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso.stepwise12")
+      {
+        x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+      
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+      
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso.stepwise20")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso.stepwise22") # NOTE(review): formula below has 20 terms, byte-identical to "lasso.stepwise20" -- confirm the intended 22-predictor set
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else if(model == "lasso.stepwise18")
+      {
+        x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data)
+        xapp <- x[folds!=k,]
+        xtst <- x[folds == k,]
+        
+        cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
+        
+        reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
+      }
+      else # unrecognized model name: clear reg.model so the NULL guard below skips it
+      {
+        reg.model <- NULL
+      }
+      
+      if(!is.null(reg.model))
+      {
+        if(model == "subset13" || model == "subset18" || model == "subset.stepwise15" || model == "subset.stepwise12" || model == "subset.stepwise20" || model == "subset.stepwise22" || model == "subset.stepwise18")
+        {
+          reg.predict <- predict(reg.model, newdata = reg.test)
+          models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2)
+        }
+        else
+        {
+          reg.predict <- predict(reg.model,s=cv.out$lambda.min,newx=xtst)
+          models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2)
+          
+        }
+      }
+    }
+  }
+  models.error[model] <- models.error[model]/(n_folds*trials)
+}
+
+print(models.error)
\ No newline at end of file
diff --git "a/r\303\251gression.r" "b/r\303\251gression.r"
index 9d5e8a9..a03102d 100644
--- "a/r\303\251gression.r"
+++ "b/r\303\251gression.r"
@@ -36,16 +36,17 @@ cor <- cor(reg.data)
 corrplot(cor) #prédicteurs pas corrélés entre eux (à part X19 corrélé avec y maybe???)
 
 
-#première régression test pour voir prédicteurs les plus significatifs
-reg <- lm(y ~ ., data = reg.appr)
+#première régression test pour voir prédicteurs les plus significatifs 
+#sur toutes les données du coup
+reg <- lm(y ~ ., data = reg.data)
 summary(reg)
 confint(reg)
 
-plot(y.appr,reg$fitted.values)
+plot(reg.data$y,reg$fitted.values)
 abline(0,1)
 
 #prédicteurs les plus significatifs:
-# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43
+# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49
 
 #espérance de l'erreur quadratique :
 pred <- predict(reg,newdata = reg.test)
@@ -78,11 +79,43 @@ qqline(rstu, dist = qnorm)
 #influence globale
 plot(reg, which = 4, cook.levels = c(0, 0.1))
 plot(reg, which = 5, cook.levels = c(0, 0.1))
-#128 aberrant
+
+#prédicteurs les plus significatifs:
+# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49
+
+model.13pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.appr)
+summary(model.13pred) 
+confint(model.13pred)
+
+plot(y.appr,model.13pred$fitted.values)
+abline(0,1)
+
+#espérance de l'erreur quadratique :
+pred.model.13 <- predict(model.13pred,newdata = reg.test)
+plot(y.test,pred.model.13)
+abline(0,1)
+
+mean((y.test - pred.model.13)^2)
+#1792.472 euh ouais bof
+
+model.18pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.appr)
+summary(model.18pred) 
+confint(model.18pred)
+
+plot(y.appr,model.18pred$fitted.values)
+abline(0,1)
+
+#espérance de l'erreur quadratique :
+pred.model.18 <- predict(model.18pred,newdata = reg.test)
+plot(y.test,pred.model.18)
+abline(0,1)
+
+mean((y.test - pred.model.18)^2)
+#1730.831 
 
 
-#stepwise selection
-reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=15, really.big = T)
+#stepwise selection (sur toutes les données)
+reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=22, really.big = T)
 x11()
 plot(reg.fit.exh,scale="r2", main ="")
 title(main = "best subset" )
@@ -94,12 +127,13 @@ plot(reg.fit.f,scale="r2", main = "")
 title(main ="forward")
 
 #backward
-reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=15, really.big = T)
+reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=22, really.big = T)
+X11()
 plot(reg.fit.b,scale="r2", main = "")
 title(main = "backward")
 
 #AIC et BIC
-reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 15, really.big = T)
+reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 18, really.big = T)
 x11()
 plot(reg.fit,scale="adjr2", main = "") 
 title(main = "AIC")
@@ -109,7 +143,7 @@ title(main = "BIC")
 
 #pas de différence entre tous ces résultats, on part sur un premier subset avec tous les prédicteurs au dessus de 0.9
 
-reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41, data = reg.appr)
+reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.data)
 summary(reg.model) #X24, X46 et X49 pas très significatif ici, ils dégagent
 confint(reg.model)
 
@@ -196,4 +230,37 @@ abline(0,1)
 
 mean((y.test - pred.lasso)^2)
 #1754.978 
-#meilleur resultat so far (pas de beaucoup par rapport au modèle avec subset)
\ No newline at end of file
+#meilleur resultat so far (pas de beaucoup par rapport au modèle avec subset)
+
+
+
+
+##ridge regression
+x18 <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
+xapp18 <- x18[reg.mask,]
+xtst18 <- x18[-reg.mask,]
+
+cv.out18<-cv.glmnet(xapp18,y.appr,alpha=0)
+plot(cv.out18)
+
+fit18<-glmnet(xapp18,y.appr,lambda=cv.out18$lambda.min,alpha=0)
+
+#esperance de l'erreur quadratique :
+pred.ridge18 <- predict(fit18,s=cv.out18$lambda.min,newx = xtst18)
+plot(y.test,pred.ridge18)
+abline(0,1)
+
+mean((y.test - pred.ridge18)^2)
+#1699.158
+
+
+#lasso regression
+cv.out.lasso18 <- cv.glmnet(xapp18,y.appr,alpha=1)
+plot(cv.out.lasso18)
+fit.lasso18 <- glmnet(xapp18,y.appr,lambda=cv.out.lasso18$lambda.min,alpha=1)
+pred.lasso18 <- predict(fit.lasso18,s=cv.out.lasso18$lambda.min,newx=xtst18)
+plot(y.test,pred.lasso18)
+abline(0,1)
+
+mean((y.test - pred.lasso18)^2)
+#1707.968 
\ No newline at end of file
-- 
GitLab