From b64018c5f706f99fdd33f89890a4e471a57316d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A9lo=C3=AFse=20Chevalier?= <heloise.chevalier@etu.utc.fr> Date: Fri, 19 Oct 2018 12:07:18 +0200 Subject: [PATCH] cross validation regression --- cv_regression.r | 231 ++++++++++++++++++++++++++++++++++++++++++ "r\303\251gression.r" | 89 ++++++++++++++-- 2 files changed, 309 insertions(+), 11 deletions(-) create mode 100644 cv_regression.r diff --git a/cv_regression.r b/cv_regression.r new file mode 100644 index 0000000..698890c --- /dev/null +++ b/cv_regression.r @@ -0,0 +1,231 @@ +library(MASS) +library(leaps) +library(corrplot) +library(glmnet) + +#chargement des données +reg.data <- read.table("tp3_a18_reg_app.txt") + +x.reg <- reg.data[,1:50] +y.reg <- reg.data[,"y"] + +n = dim(reg.data)[1] +p = dim(reg.data)[2] + + +models.list <- c("subset13", "subset18", "subset.stepwise15", "subset.stepwise12", "subset.stepwise20", "subset.stepwise22", "subset.stepwise18", "ridge13", "ridge18", "ridge.stepwise15", "ridge.stepwise12", "ridge.stepwise20", "ridge.stepwise22", "ridge.stepwise18", "lasso13", "lasso18", "lasso.stepwise15", "lasso.stepwise12", "lasso.stepwise20", "lasso.stepwise22", "lasso.stepwise18") # Liste des modeles +models.error <- c() +trials <- 10 # Nombre d'essais de cross-validation par modèle +n_folds <- 5 # Nombre de blocs pour la cross-validation + +# Test pour chaque modèle + +for(model in models.list) # Pour chaque modele +{ + models.error[model] <- 0 + for(i in c(1:trials)) # On fait plusieurs essais + { + folds <- sample(rep(1:n_folds, length.out=n)) + for(k in 1:n_folds) # Sur chaque segment de la cross-validation + { + reg.train <- reg.data[folds!=k,] + reg.train.X <- reg.train[,-p] + reg.train.y <- reg.train$y + reg.test <- reg.data[folds==k,] + reg.test.X <- reg.test[,-p] + reg.test.y <- reg.test$y + + # Apprentissage du modele + if(model == "subset13") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.train) + } + else if(model == "subset18") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.train) + } + else if(model == "subset.stepwise15") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.train) + } + else if(model == "subset.stepwise12") + { + reg.model <- lm(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49 , data = reg.train) + } + else if(model == "subset.stepwise20") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train) + } + else if(model == "subset.stepwise22") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train) + } + else if(model == "subset.stepwise18") + { + reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, data = reg.train) + } + else if(model == "ridge13") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X28 + X31 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge18") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge.stepwise15") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge.stepwise12") + { + x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge.stepwise20") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge.stepwise22") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "ridge.stepwise18") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0) + } + else if(model == "lasso13") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso18") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso.stepwise15") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso.stepwise12") + { + x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso.stepwise20") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso.stepwise22") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else if(model == "lasso.stepwise18") + { + x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data) + xapp <- x[folds!=k,] + xtst <- x[folds == k,] + + cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1) + + reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1) + } + else + { + clus.model <- NULL + } + + if(!is.null(reg.model)) + { + if(model == "subset13" || model == "subset18" || model == "subset.stepwise15" || model == "subset.stepwise12" || model == "subset.stepwise20" || model == "subset.stepwise22" || model == "subset.stepwise18") + { + reg.predict <- predict(reg.model, newdata = reg.test) + models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2) + } + else + { + reg.predict <- predict(reg.model,s=cv.out$lambda.min,newx=xtst) + models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2) + + } + } + } + } + models.error[model] <- models.error[model]/(n_folds*trials) +} + +print(models.error) \ No newline at end of file diff --git "a/r\303\251gression.r" "b/r\303\251gression.r" index 9d5e8a9..a03102d 100644 --- "a/r\303\251gression.r" +++ "b/r\303\251gression.r" @@ -36,16 +36,17 @@ cor <- cor(reg.data) corrplot(cor) #prédicteurs pas corrélés entre eux (à part X19 corrélé avec y maybe???) -#première régression test pour voir prédicteurs les plus significatifs -reg <- lm(y ~ ., data = reg.appr) +#première régression test pour voir prédicteurs les plus significatifs +#sur toutes les données du coup +reg <- lm(y ~ ., data = reg.data) summary(reg) confint(reg) -plot(y.appr,reg$fitted.values) +plot(reg.data$y,reg$fitted.values) abline(0,1) #prédicteurs les plus significatifs: -# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43 +# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49 #espérance de l'erreur quadratique : pred <- predict(reg,newdata = reg.test) @@ -78,11 +79,43 @@ qqline(rstu, dist = qnorm) #influence globale plot(reg, which = 4, cook.levels = c(0, 0.1)) plot(reg, which = 5, cook.levels = c(0, 0.1)) -#128 aberrant + +#prédicteurs les plus significatifs: +# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49 + +model.13pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.appr) +summary(model.13pred) +confint(model.13pred) + +plot(y.appr,model.13pred$fitted.values) +abline(0,1) + +#espérance de l'erreur quadratique : +pred.model.13 <- predict(model.13pred,newdata = reg.test) +plot(y.test,pred.model.13) +abline(0,1) + +mean((y.test - pred.model.13)^2) +#1792.472 euh ouais bof + +model.18pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.appr) +summary(model.18pred) +confint(model.18pred) + +plot(y.appr,model.18pred$fitted.values) +abline(0,1) + +#espérance de l'erreur quadratique : +pred.model.18 <- predict(model.18pred,newdata = reg.test) +plot(y.test,pred.model.18) +abline(0,1) + +mean((y.test - pred.model.18)^2) +#1730.831 -#stepwise selection -reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=15, really.big = T) +#stepwise selection (sur toutes les données) +reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=22, really.big = T) x11() plot(reg.fit.exh,scale="r2", main ="") title(main = "best subset" ) @@ -94,12 +127,13 @@ plot(reg.fit.f,scale="r2", main = "") title(main ="forward") #backward -reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=15, really.big = T) +reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=22, really.big = T) +X11() plot(reg.fit.b,scale="r2", main = "") title(main = "backward") #AIC et BIC -reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 15, really.big = T) +reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 18, really.big = T) x11() plot(reg.fit,scale="adjr2", main = "") title(main = "AIC") @@ -109,7 +143,7 @@ title(main = "BIC") #pas de différence entre tous ces résultats, on part sur un premier subset avec tous les prédicteurs au dessus de 0.9 -reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41, data = reg.appr) +reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.data) summary(reg.model) #X24, X46 et X49 pas très significatif ici, ils dégagent confint(reg.model) @@ -196,4 +230,37 @@ abline(0,1) mean((y.test - pred.lasso)^2) #1754.978 -#meilleur resultat so far (pas de beaucoup par rapport au modèle avec subset) \ No newline at end of file +#meilleur resultat so far (pas de beaucoup par rapport au modèle avec subset) + + + + +##ridge regression +x18 <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data) +xapp18 <- x18[reg.mask,] +xtst18 <- x18[-reg.mask,] + +cv.out18<-cv.glmnet(xapp18,y.appr,alpha=0) +plot(cv.out18) + +fit18<-glmnet(xapp18,y.appr,lambda=cv.out18$lambda.min,alpha=0) + +#esperance de l'erreur quadratique : +pred.ridge18 <- predict(fit18,s=cv.out18$lambda.min,newx = xtst18) +plot(y.test,pred.ridge18) +abline(0,1) + +mean((y.test - pred.ridge18)^2) +#1699.158 + + +#lasso regression +cv.out.lasso18 <- cv.glmnet(xapp18,y.appr,alpha=1) +plot(cv.out.lasso18) +fit.lasso18 <- glmnet(xapp18,y.appr,lambda=cv.out.lasso18$lambda.min,alpha=1) +pred.lasso18 <- predict(fit.lasso18,s=cv.out.lasso18$lambda.min,newx=xtst18) +plot(y.test,pred.lasso18) +abline(0,1) + +mean((y.test - pred.lasso18)^2) +#1707.968 \ No newline at end of file -- GitLab