# Exercise 5 -- multiple linear regression on data5.txt
# (response y, candidate regressors x1..x5)

# a) Read data and explore ----
ds5 <- read.table("http://www.math.ntnu.no/~erikblys/TMA4255-V14/Data/data5.txt",
                  header = TRUE)
dim(ds5)
summary(ds5)
cor(ds5)
cor.test(ds5$y, ds5$x1)
pairs(ds5)

# b) Simple regression of y on each regressor in turn ----
for (i in 1:5) {
  print(i)
  print(summary(lm(ds5[, 6] ~ ds5[, i])))
}
# only x1 significant if only one regressor present
lm1 <- lm(y ~ x1, data = ds5)
summary(lm1)
# residual diagnostics (studentized residuals)
plot(fitted(lm1), rstudent(lm1))
qqnorm(rstudent(lm1))
qqline(rstudent(lm1))
hist(rstudent(lm1))
plot(density(rstudent(lm1), width = 1))
# heavier tails than normal, but very [NOTE(review): comment truncated in
# original -- presumably "but very small sample"; confirm intent]

# c) Confidence and prediction intervals ----
?predict.lm
predict(lm1, interval = "confidence") # for all observations in data set
# or, plot a grid in x1 with lines?
range(ds5[, 1])
newobs <- data.frame(x1 = seq(min(ds5$x1), max(ds5$x1), length.out = 100))
cimat <- predict(lm1, newdata = newobs, interval = "confidence")
plot(ds5$x1, ds5$y)
abline(a = coef(lm1)[1], b = coef(lm1)[2])
lines(newobs$x1, cimat[, 2], col = 2)
lines(newobs$x1, cimat[, 3], col = 2)
# then predictions
pimat <- predict(lm1, newdata = newobs, interval = "prediction")
lines(newobs$x1, pimat[, 2], col = 4)
lines(newobs$x1, pimat[, 3], col = 4)
# and in particular
newobs <- data.frame(x1 = c(1, 3, 5))
predict(lm1, newdata = newobs, interval = "prediction")

# d) Full model and nested-model comparisons ----
lm5 <- lm(y ~ ., data = ds5)
summary(lm5)
# also look at variance covariance matrix
vcov(lm5)
# type 1 sums of squares = sequential, in anova, not like for the t-tests;
# in t-tests all other variables are assumed present -- this is called type III
anova(lm5)
# compare these two sums of squares
# can also compare two models using anova (then the models need to be nested)
lm14 <- lm(y ~ x1 + x4, data = ds5)
anova(lm1, lm14)  # yes, x4 significant
anova(lm14, lm5)  # no, lm with all 5 not sign as compared to the model with x1 and x4

# e) Best subsets ----
library(leaps)
?leaps
leapsCp <- leaps(x = ds5[, 1:5], y = ds5[, 6], method = "Cp", nbest = 5)
names(leapsCp)
cbind(leapsCp$which, leapsCp$size, leapsCp$Cp)
plot(leapsCp$size, leapsCp$Cp)
abline(0, 1)
# the model with x1 and x4 seems the best
# hard to see, adjusting y-axis
plot(leapsCp$size, leapsCp$Cp, ylim = c(0, 10))
abline(0, 1)
# spell out "adjr2" (original "adjr" relied on partial argument matching)
leapsadjr <- leaps(x = ds5[, 1:5], y = ds5[, 6], method = "adjr2", nbest = 5)
names(leapsadjr)
cbind(leapsadjr$which, leapsadjr$size, 100 * leapsadjr$adjr2)
# x1 and x4 seems good, but also x1, x2 and x4 which has highest R2adj
# compare these two?
lm124 <- lm(y ~ x1 + x2 + x4, data = ds5)
anova(lm124, lm14)
# no, model with 1+2+4 is not significantly better than 1+4

# f) Interactions with x1 ----
lminter <- lm(y ~ x1 * x2 + x1 * x3 + x1 * x4 + x1 * x5, data = ds5)
summary(lminter)
# add the interaction products as explicit columns so leaps() can search them
ds5$x1x2 <- ds5$x1 * ds5$x2
ds5$x1x3 <- ds5$x1 * ds5$x3
ds5$x1x4 <- ds5$x1 * ds5$x4
ds5$x1x5 <- ds5$x1 * ds5$x5
leapsCpinter <- leaps(x = ds5[, c(1:5, 7:10)], y = ds5[, 6],
                      method = "Cp", nbest = 2)
cbind(leapsCpinter$which, leapsCpinter$size, leapsCpinter$Cp)
plot(leapsCpinter$size, leapsCpinter$Cp, ylim = c(0, 10))
abline(0, 1)
# model with size 5 and Cp 4.3 has 0 1 0 1 0 1 0 1 0
# = x2 + x4 + x1*x2 + x1*x4, might be the best acc to Cp?
lminter <- lm(y ~ x2 + x4 + x1:x2 + x1:x4, data = ds5)
summary(lminter)
# but, we observe that R2adj is higher for the new model than for the x1+x4 model
# significant regression and all vars significant
summary(lm14)
# compare with model with x1 and x4 is hard since this model is not nested
# within our new model -- why? new model does not contain x1, only interactions
# with x1. in general it is not so common to include an interaction term
# without adding the main effect, that is adding x1:x2 and x1:x4 when x1 is
# not in the model.
# What to choose in the end of the day? Well, not so clear... but if in doubt
# always go for the smallest model.
# We might add x1 to the previous model, and compare this with the x1 and x4 model.