Commit b64018c5 authored by Heloise Chevalier

cross validation regression

parent 5e18fb0b
library(MASS)
library(leaps)
library(corrplot)
library(glmnet)
# Load the data
reg.data <- read.table("tp3_a18_reg_app.txt")
x.reg <- reg.data[,1:50]
y.reg <- reg.data[,"y"]
n <- nrow(reg.data) # number of observations
p <- ncol(reg.data) # number of columns (50 predictors + y)
models.list <- c("subset13", "subset18", "subset.stepwise15", "subset.stepwise12", "subset.stepwise20", "subset.stepwise22", "subset.stepwise18", "ridge13", "ridge18", "ridge.stepwise15", "ridge.stepwise12", "ridge.stepwise20", "ridge.stepwise22", "ridge.stepwise18", "lasso13", "lasso18", "lasso.stepwise15", "lasso.stepwise12", "lasso.stepwise20", "lasso.stepwise22", "lasso.stepwise18") # list of candidate models
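# Naming convention for the list above (descriptive note, inferred from the if/else chain below):
# "subset*" models are plain lm() fits, "ridge*" and "lasso*" models are glmnet() fits with
# alpha = 0 and alpha = 1 respectively; the trailing number indicates the intended size of
# the predictor subset.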
models.error <- c() # named vector: mean CV error per model
trials <- 10 # number of cross-validation repetitions per model
n_folds <- 5 # number of folds for the cross-validation
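# In total, each model is therefore evaluated on trials * n_folds = 50 train/test splits.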
# Evaluate each model with repeated k-fold cross-validation
for(model in models.list) # for each model
{
models.error[model] <- 0
for(i in 1:trials) # repeat with a fresh random fold assignment each time
{
folds <- sample(rep(1:n_folds, length.out=n)) # randomly assign each observation to a fold
for(k in 1:n_folds) # for each fold of the cross-validation
{
reg.train <- reg.data[folds!=k,]
reg.train.X <- reg.train[,-p]
reg.train.y <- reg.train$y
reg.test <- reg.data[folds==k,]
reg.test.X <- reg.test[,-p]
reg.test.y <- reg.test$y
# Fit the model
if(model == "subset13")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.train)
}
else if(model == "subset18")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.train)
}
else if(model == "subset.stepwise15")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.train)
}
else if(model == "subset.stepwise12")
{
reg.model <- lm(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49 , data = reg.train)
}
else if(model == "subset.stepwise20")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train)
}
else if(model == "subset.stepwise22")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, data = reg.train)
}
else if(model == "subset.stepwise18")
{
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, data = reg.train)
}
else if(model == "ridge13")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X28 + X31 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge18")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge.stepwise15")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge.stepwise12")
{
x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge.stepwise20")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge.stepwise22")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "ridge.stepwise18")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=0)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=0)
}
else if(model == "lasso13")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso18")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso.stepwise15")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso.stepwise12")
{
x <- model.matrix(y ~ X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X1 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso.stepwise20")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso.stepwise22")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X10 + X14 + X16 + X17 + X19 + X24 + X32 + X34 + X35 + X37 + X38 + X39 + X40 + X41 + X43 + X46 + X49, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else if(model == "lasso.stepwise18")
{
x <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X17 + X43 + X46, reg.data)
xapp <- x[folds!=k,]
xtst <- x[folds == k,]
cv.out<-cv.glmnet(xapp,reg.train.y,alpha=1)
reg.model<-glmnet(xapp,reg.train.y,lambda=cv.out$lambda.min,alpha=1)
}
else
{
reg.model <- NULL # unknown model name: nothing to fit
}
if(!is.null(reg.model))
{
if(model == "subset13" || model == "subset18" || model == "subset.stepwise15" || model == "subset.stepwise12" || model == "subset.stepwise20" || model == "subset.stepwise22" || model == "subset.stepwise18")
{
reg.predict <- predict(reg.model, newdata = reg.test)
models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2)
}
else # glmnet models (ridge / lasso): predict from the design matrix
{
reg.predict <- predict(reg.model,s=cv.out$lambda.min,newx=xtst)
models.error[model] <- models.error[model] + mean((reg.test.y - reg.predict)^2)
}
}
}
}
models.error[model] <- models.error[model]/(n_folds*trials) # average MSE over all folds and trials
}
print(models.error)
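# Optional follow-up (not part of the original script, just a suggestion):
# report the model with the lowest average cross-validation error.
best.model <- names(which.min(models.error))
cat("Best model according to CV:", best.model,
"with mean MSE", round(min(models.error), 3), "\n")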
@@ -36,16 +36,17 @@ cor <- cor(reg.data)
corrplot(cor) # predictors not correlated with one another (except X19, possibly correlated with y?)
# first test regression to see the most significant predictors
reg <- lm(y ~ ., data = reg.appr)
# first test regression to see the most significant predictors
# on all the data this time
reg <- lm(y ~ ., data = reg.data)
summary(reg)
confint(reg)
plot(y.appr,reg$fitted.values)
plot(reg.data$y,reg$fitted.values)
abline(0,1)
# most significant predictors:
# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43
# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49
# expected squared error:
pred <- predict(reg,newdata = reg.test)
@@ -78,11 +79,43 @@ qqline(rstu, dist = qnorm)
# global influence
plot(reg, which = 4, cook.levels = c(0, 0.1))
plot(reg, which = 5, cook.levels = c(0, 0.1))
# observation 128 is an outlier
# most significant predictors:
# X1, X2, X3, -X10, X14, X19, -X24, X32, X34, X35, X37, X38, X39, -X40, X41, -X43, -X46, X49
model.13pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49, data = reg.appr)
summary(model.13pred)
confint(model.13pred)
plot(y.appr,model.13pred$fitted.values)
abline(0,1)
# expected squared error:
pred.model.13 <- predict(model.13pred,newdata = reg.test)
plot(y.test,pred.model.13)
abline(0,1)
mean((y.test - pred.model.13)^2)
#1792.472 hmm, not great
model.18pred <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, data = reg.appr)
summary(model.18pred)
confint(model.18pred)
plot(y.appr,model.18pred$fitted.values)
abline(0,1)
# expected squared error:
pred.model.18 <- predict(model.18pred,newdata = reg.test)
plot(y.test,pred.model.18)
abline(0,1)
mean((y.test - pred.model.18)^2)
#1730.831
#stepwise selection
reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=15, really.big = T)
#stepwise selection (on all the data)
reg.fit.exh <-regsubsets(y ~ .,data=reg.data,method="exhaustive",nvmax=22, really.big = T)
x11()
plot(reg.fit.exh,scale="r2", main ="")
title(main = "best subset" )
@@ -94,12 +127,13 @@ plot(reg.fit.f,scale="r2", main = "")
title(main ="forward")
#backward
reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=15, really.big = T)
reg.fit.b<-regsubsets(y ~ .,data=reg.data,method="backward",nvmax=22, really.big = T)
X11()
plot(reg.fit.b,scale="r2", main = "")
title(main = "backward")
#AIC and BIC
reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 15, really.big = T)
reg.fit <- regsubsets(y ~., data = reg.data, method = "exhaustive", nvmax = 18, really.big = T)
x11()
plot(reg.fit,scale="adjr2", main = "")
title(main = "AIC")
@@ -109,7 +143,7 @@ title(main = "BIC")
# no difference between all these results, so we go with a first subset containing all predictors above 0.9
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41, data = reg.appr)
reg.model <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41 + X24 + X46 + X49, data = reg.data)
summary(reg.model) # X24, X46 and X49 not very significant here, they get dropped
confint(reg.model)
@@ -196,4 +230,37 @@ abline(0,1)
mean((y.test - pred.lasso)^2)
#1754.978
# best result so far (not by much compared to the subset model)
##ridge regression
x18 <- model.matrix(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 +X41 + X49 + X10 + X24 + X40 + X43 + X46, reg.data)
xapp18 <- x18[reg.mask,]
xtst18 <- x18[-reg.mask,]
cv.out18<-cv.glmnet(xapp18,y.appr,alpha=0)
plot(cv.out18)
fit18<-glmnet(xapp18,y.appr,lambda=cv.out18$lambda.min,alpha=0)
# expected squared error:
pred.ridge18 <- predict(fit18,s=cv.out18$lambda.min,newx = xtst18)
plot(y.test,pred.ridge18)
abline(0,1)
mean((y.test - pred.ridge18)^2)
#1699.158
#lasso regression
cv.out.lasso18 <- cv.glmnet(xapp18,y.appr,alpha=1)
plot(cv.out.lasso18)
fit.lasso18 <- glmnet(xapp18,y.appr,lambda=cv.out.lasso18$lambda.min,alpha=1)
pred.lasso18 <- predict(fit.lasso18,s=cv.out.lasso18$lambda.min,newx=xtst18)
plot(y.test,pred.lasso18)
abline(0,1)
mean((y.test - pred.lasso18)^2)
#1707.968