plot(reg) prostate = read.table("prostate.data") reg <- lm(lpsa ~ . - train, data = prostate) setwd("~/A18/SY19/TP02") prostate = read.table("prostate.data") reg <- lm(lpsa ~ . - train, data = prostate) plot(reg) plot(reg) #tous les plots summary(reg) reg4 <- lm(lpsa ~lcp + gleason + pgg45, data = prostate) summary(reg4) #lcp rejetté x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) summary(reg) reg4 <- lm(lpsa ~ age + lcp + gleason + pgg45, data = prostate) summary(reg4) #lcp rejetté x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) reg4 <- lm(lpsa ~ age + gleason + pgg45, data = prostate) summary(reg4) #lcp rejetté x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) reg4 <- lm(lpsa ~ age + gleason, data = prostate) summary(reg4) summary(reg4) #lcp rejetté x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) reg4 <- lm(lpsa ~ age + lcp + pgg45, data = prostate) summary(reg4) x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) summary(reg) reg4 <- lm(lpsa ~ age + lbph + lcp + glesaon + pgg45, data = prostate) reg4 <- lm(lpsa ~ age + lbph + lcp + gleason + pgg45, data = prostate) summary(reg4) x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) reg4 <- lm(lpsa ~ age + lbph + gleason + pgg45, data = prostate) summary(reg4) x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) reg4 <- lm(lpsa ~ . -train -lcavol, data = prostate) summary(reg4) x11() plot(prostate$lpsa,reg4$fitted.values) abline(0,1) cor(prostate) reg3 <- lm(lpsa ~ . - train - lcavol - lbph,data = prostate) summary(reg3) summary(reg) reg3 <- lm(lpsa ~ . - train -lbph,data = prostate) summary(reg3) summary(reg) plot(prostate$lcavol,reg$residuals) plot(prostate$lweight,reg$residuals) plot(prostate$lage,reg$residuals) plot(prostate$age,reg$residuals) plot(prostate$lpsa,reg$residuals) reg5 <- lm(sqrt(lpsa) ~ .-train, data = prostate) sqrt(4) sqrt(prostate$lpsa) prostate$lpsa reg5 <- lm(sqrt(abs(lpsa)) ~ .-train, data = prostate) summary(reg5) plot(prostate$lpsa,reg5$fitted.values) abline(0,1) plot(prostate$lpsa,reg5$fitted.values) abs(prostate$lpsa) plot(prostate$lcavol,reg$residuals) plot(prostate$lweight,reg$residuals) plot(prostate$age,reg$residuals) appr = prostate[prostate$train,1:4] test = prostate[!prostate$train,] appr = prostate[prostate$train,] appr test appr = prostate[prostate$train,1:9] appr test = prostate[!prostate$train,1:9] ?predict pred <- lm(lpsa ~ ., data = appr) predict(pred,newdata = test) pred$fitted.values predict <- predict(pred,newdata = test) plot(test$lpsa,predict) abline(0,1) setwd("~/A18/SY19/TP02") prostate = read.table("prostate.data") reg <- lm(lpsa ~ . - train, data = prostate) #estimation de tous les coefficients #intercept = ordonnée à l'origine summary(reg) #pour chaque coefficient on retrouve les estimés (beta chapeau) #Std error = sigma (espérance des beta chapeau) #avant dernière colonne = stat de Student #dernière colonne = P value (plus elle est petite plus on rejette H0) #lcavol, lweight, svi rejettés # plus il y a d'étoiles plus c'est significatif #F-stat = tester si la régression est significative -> p-value faible ici #(tous les prédicteurs ne sont pas significatifs) #intervalle de confiance confint(reg, level = 0.95) x11() plot(prostate$lpsa,reg$fitted.values) abline(0,1) x11() plot(prostate$lpsa,reg$residuals) plot(prostate$lpsa,rstandard(reg)) #vérifier qu'il n'y a pas un point qui a un résidu énorme # (point aberrant) #test de Shapiro pour la normalité shapiro.test(reg$residuals) #étude de la stabilité de la régression # distance de Cook cooks.distance(reg) pred <- lm(lpsa ~ ., data = appr) predict <- predict(pred,newdata = test) plot(test$lpsa,predict) abline(0,1) appr = prostate[prostate$train,1:9] test = prostate[!prostate$train,1:9] pred <- lm(lpsa ~ ., data = appr) predict <- predict(pred,newdata = test) plot(test$lpsa,predict) abline(0,1) confint(pred) #### Classification Linéaire # spam = read.table("spambase.dat") spam ?sample dim(spam) dim(spam)[1] sample.int(n = dim(spam)[1], prob = c(2/3, 1/3)) sample.int(n = dim(spam)[1], prob = 2/3) sample.int(n = dim(spam)[1], prob = 0.6) ?trunk sample(x = 1:n, size = trunc(n*2/3)) n = dim(spam)[1] sample(x = 1:n, size = trunc(n*2/3)) appr = spam[sample,] selection = sample(x = 1:n, size = trunc(n*2/3)) appr = spam[selection,] appr test = spam - appr test = spam[!selection] test test = spam[!selection,] test ?sort selection = sort(sample(x = 1:n, size = trunc(n*2/3))) appr = spam[selection,] appr test = spam[(1:n) - selection,] test = spam[(1:n)[!selection],] test test = spam[(1:n) - (1:n)[selection],] ?sample test = spam[!appr] test test = spam[!appr,] test ?lda ??lda package(MASS) install.packages("MASS") library(MASS) ?lda test = spam[-appr,] test = spam[-selection,] test spam = read.table("spambase.dat") n = dim(spam)[1] selection = sort(sample(x = 1:n, size = trunc(n*2/3))) spam.appr = spam[selection,] spam.test = spam[-selection,] spam.lda(v58~., data = appr) spam.lda <- lda(v58~., data = appr) spam.lda <- lda(v58~., data = spam.appr) spam.appr = spam[selection,] spam.test = spam[-selection,] spam.lda <- lda(v58~., data = spam.appr) spam.appr spam.lda <- lda(V58~., data = spam.appr) spam.lda spam.pred <- predict(spam.lda, newdata = spam.test) spam.pred table(spam.test$V58, spam.pred$V58) table(spam.test$V58, spam.pred$class) spam.pred$class == spam.test$V58 library(pROC) install.packages("pROC") library(pROC) library(pROC) spam.pred$x roc_curve <- roc(spam.test$V58, as.vector(spam.pred$x)) plot(roc_curve) View(spam.pred) roc_curve <- roc(spam.test$V58, spam.pred$class) spam.glm <- glm(V58~.data = spam.appr, family = binomial) spam.glm <- glm(V58~.,data = spam.appr, family = binomial) summary(spam.glm) spam.glm.pred <- predict(spam.glm, newdata = spam.test, type ="response") table(spam.test$V58,spam.glm.pred>0.5) table(spam.test$V58, spam.pred$class) logit <- predict(spam.glm, newdata = spam.test, type = "link") plot(roc_glm, add = TRUE, col = 'red') roc_glm = roc(spam.test$V58,logit) plot(roc_glm, add = TRUE, col = 'red') summary(spam.glm) summary(spam.lda) summary(spam.glm) table(spam.test$V58,spam.glm.pred>0.5) 888+64+56+526 (56+64)/1534 table(spam.test$V58, spam.pred$class) (108+64)/1534 ?confusion.matrix library(SDMTools) install.packages("SDMTools") library(SDMTools) library(SDMTools) ?confusion.matrix #Matrices de confusion confusion.matrix(spam.test$V58,spam.pred$class) confusion.matrix(spam.test$V58,spam.glm.pred) ?rnorm ?mvrnorm pb_class <- read.table("tp3_a18_clas_app.txt") setwd("~/A18/SY19/TP03") pb_class <- read.table("tp3_a18_clas_app.txt") pb_class pb_reg <- read.table("tp3_a18_reg_app.txt") summary(pb_reg) reg <- lm(y ~ ., data = reg.appr) pb_class <- read.table("tp3_a18_clas_app.txt") pb_reg <- read.table("tp3_a18_reg_app.txt") #ensembles d'apprentissage et de test n.reg = dim(pb_reg)[1] reg.mask = sort(sample(x = 1:n, size = trunc(n*2/3))) reg.appr = pb_reg[reg.mask,] reg.test = pb_reg[-reg.mask,] n.class = dim(pb_class)[1] class.mask = sort(sample(x = 1:n, size = trunc(n*2/3))) class.appr = pb_class[class.mask,] class.test = pb_class[-class.mask,] n.reg = dim(pb_reg)[1] reg.mask = sort(sample(x = 1:n.reg, size = trunc(n*2/3))) reg.appr = pb_reg[reg.mask,] reg.test = pb_reg[-reg.mask,] n.class = dim(pb_class)[1] class.mask = sort(sample(x = 1:n.class, size = trunc(n*2/3))) class.appr = pb_class[class.mask,] class.test = pb_class[-class.mask,] #ensembles d'apprentissage et de test n.reg = dim(pb_reg)[1] reg.mask = sort(sample(x = 1:n.reg, size = trunc(n*2/3))) #ensembles d'apprentissage et de test nreg = dim(pb_reg)[1] reg.mask = sort(sample(x = 1:nreg, size = trunc(n*2/3))) reg.mask = sort(sample(x = 1:nreg, size = trunc(nreg*2/3))) reg.appr = pb_reg[reg.mask,] reg.test = pb_reg[-reg.mask,] nclass = dim(pb_class)[1] class.mask = sort(sample(x = 1:nclass, size = trunc(nclass*2/3))) class.appr = pb_class[class.mask,] class.test = pb_class[-class.mask,] summary(pb_reg) reg <- lm(y ~ ., data = reg.appr) summary(reg) confint(reg) x11() plot(pb_reg$y,reg$fitted.values) abline(0,1) x11() plot(reg.appr$y,reg$fitted.values) abline(0,1) plot((reg.appr$y,rstandard(reg)) plot(reg.appr$y,rstandard(reg)) plot(reg.appr$y,rstudent(reg)) shapiro.test(reg$residuals) summary(reg) reg2 <- lm(y ~ X1 + X2 + X3 + X14 + X19 + X32 + X34 + X35 + X37 + X38 + X39 + X41, data = reg.appr) (summary(reg2)) x11() plot(reg.appr$y, reg2$fitted.values) abline(0,1) x11() plot(reg.appr$y,reg$fitted.values) abline(0,1) #avec les données de test predict <- predict(reg,newdata = reg.test) x11() plot(reg.test$y,predict) abline(0,1) #calcul de la mse // refaire les ensembles à chaque itération? mse <- function(k_max,train,y.train, test, y.tst){ mse <- matrix(0,k_max,1) for (i in 1:k_max){ reg <- lm(y.train ~ ., data = train) pred <- predict(reg,newdata = test) err <- (pred - y.tst)**2 mse[i] <- mean(err) } mse } predict #résidus en fonction des variables explicatives attach(pb_reg) varnames <- attr(reg$terms, "term.labels") par(mfrow = c(length(varnames), 3)) par(mar=c(1,1,1,1)) for(name in varnames) { plot(get(name), rres, xlab = name) plot(get(name), rstd, xlab = name) plot(get(name), rstu, xlab = name) } rres = reg$fitted.values rstd = rstandard(reg) rstu = rstudent(reg) attach(pb_reg) varnames <- attr(reg$terms, "term.labels") par(mfrow = c(length(varnames), 3)) par(mar=c(1,1,1,1)) for(name in varnames) { plot(get(name), rres, xlab = name) plot(get(name), rstd, xlab = name) plot(get(name), rstu, xlab = name) } rres = reg$fitted.values rstd = rstandard(reg) rstu = rstudent(reg) attach(reg.appr) varnames <- attr(reg$terms, "term.labels") par(mfrow = c(length(varnames), 3)) par(mar=c(1,1,1,1)) for(name in varnames) { plot(get(name), rres, xlab = name) plot(get(name), rstd, xlab = name) plot(get(name), rstu, xlab = name) } x11() rres = reg$fitted.values rstd = rstandard(reg) rstu = rstudent(reg) attach(reg.appr) varnames <- attr(reg$terms, "term.labels") par(mfrow = c(length(varnames), 3)) par(mar=c(1,1,1,1)) for(name in varnames) { plot(get(name), rres, xlab = name) plot(get(name), rstd, xlab = name) plot(get(name), rstu, xlab = name) } plot(reg.appr$y,rstd) plot(reg.appr$y,rstu) shapiro.test(reg$residuals) rres = reg$residuals shapiro.test(rres) qqnorm(rres, asp = 1) qqline(rres, dist = qnorm) qqnorm(rstd, asp = 1) qqline(rstd, dist = qnorm) qqnorm(rstu, asp = 1) qqline(rstu, dist = qnorm) #influence globale plot(reg, which = 4, cook.levels = c(0, 0.1)) plot(lm.model, which = 5, cook.levels = c(0, 0.1)) plot(reg, which = 5, cook.levels = c(0, 0.1)) plot(reg2, which = 5, cook.levels = c(0, 0.1)) plot(reg2, which = 4, cook.levels = c(0, 0.1)) plot(reg2, which = 5, cook.levels = c(0, 0.1)) pred <- predict(reg,newdata = reg.test, interval = "prediction") plot(yi, pred[, 1], ylim = range(pred), asp = 1) lines(yi, pred[, 2]) lines(yi, pred[, 3]) abline(0, 1) yi = pb_reg$y plot(yi, pred[, 1], ylim = range(pred), asp = 1) lines(yi, pred[, 2]) lines(yi, pred[, 3]) abline(0, 1) yi = reg.test$y plot(yi, pred[, 1], ylim = range(pred), asp = 1) lines(yi, pred[, 2]) lines(yi, pred[, 3]) abline(0, 1) x11() plot(yi, pred[, 1], ylim = range(pred), asp = 1) lines(yi, pred[, 2]) lines(yi, pred[, 3]) abline(0, 1) pred[,2] knn.reg? q ?knn.reg #install.packages("FNN") library(FNN) install.packages("FNN") library(FNN) ?knn.reg x.appr = reg.appr[,-y] y.appr = reg.appr[,y] dim(reg.appr) x.appr = reg.appr[,1:50] y.appr = reg.appr[,y] x.appr = reg.appr[,1:50] y.appr = reg.appr[,'y'] x.appr reg.appr x.test = reg.test[,1:50] y.test = reg.test[,'y'] reg.knn = knn.reg(train = x.appr, y = y.appr, k = 5, test = x.test) reg.knn mean((y.test-prostate.knn$pred)^2) mean((y.test-reg.knn$pred)^2) reg.knn$pred y.test #stepwise selection reg.fit<-regsubsets(y ~ .,data=reg.appr,method="forward",nvmax=15) install.packages("leaps") #stepwise selection library(leaps) reg.fit<-regsubsets(y ~ .,data=reg.appr,method="forward",nvmax=15) plot(reg.fit,scale="r2") x11() plot(reg.fit,scale="r2") reg.fit.f<-regsubsets(y ~ .,data=reg.appr,method="forward",nvmax=15) x11() plot(reg.fit.f,scale="r2") reg.fit.b<-regsubsets(y ~ .,data=reg.appr,method="backward",nvmax=15) x11() plot(reg.fit.b,scale="r2") reg.fit <- regsubsets(y ~., data = reg.appr, method = "exhaustive", nvmax = 15) plot(reg.fit,scale="adjr2") reg.fit <- regsubsets(y ~., data = reg.appr, method = "exhaustive", nvmax = 15, really.big = T) plot(reg.fit,scale="adjr2") x11() plot(reg.fit,scale="bic") x11() plot(reg.fit.f,scale="r2") x11() plot(reg.fit.b,scale="r2") class.appr class.lda <- lda(y ~ ., data = class.appr) ??lda library(MASS) class.lda <- lda(y ~ ., data = class.appr) summary(class.lda) class.pred <- predict(class.lda, newdata = class.test)