# Exercise 5

# a) Read the data set and do an initial exploration
ds5 <- read.table("http://www.math.ntnu.no/~erikblys/TMA4255-V14/Data/data5.txt",
                  header = TRUE)
dim(ds5)                 # number of observations and variables
summary(ds5)             # marginal summaries of each column
cor(ds5)                 # pairwise correlations
cor.test(ds5$y, ds5$x1)  # test of the correlation between y and x1
pairs(ds5)               # scatterplot matrix
# b) Simple linear regression of y on each candidate regressor in turn
for (i in seq_len(5)) {
  print(i)
  # column 6 is the response y; columns 1-5 are the regressors x1..x5
  print(summary(lm(ds5[, 6] ~ ds5[, i])))
}
# only x1 significant if only one regressor is present
lm1 <- lm(y ~ x1, data = ds5)
summary(lm1)
# residual diagnostics on the studentized residuals;
# use fitted() instead of lm1$fitted, which only works via partial
# matching of the component name "fitted.values"
plot(fitted(lm1), rstudent(lm1))
qqnorm(rstudent(lm1))
qqline(rstudent(lm1))
hist(rstudent(lm1))
plot(density(rstudent(lm1), width = 1))
# heavier tails than normal, but only mildly so
# c) Confidence and prediction intervals for the fitted line
?predict.lm
predict(lm1, interval = "confidence")
# gives intervals for all observations in the data set
# or, plot over a grid in x1 with lines
range(ds5[, 1])
newobs <- data.frame(x1 = seq(min(ds5$x1), max(ds5$x1), length = 100))
cimat <- predict(lm1, newdata = newobs, interval = "confidence")
plot(ds5$x1, ds5$y)
# abline() accepts a fitted lm directly; this also avoids lm1$coef,
# which only works via partial matching of "coefficients"
abline(lm1)
lines(newobs$x1, cimat[, 2], col = 2)  # lower confidence band
lines(newobs$x1, cimat[, 3], col = 2)  # upper confidence band
# then prediction intervals (wider: they include residual variation)
pimat <- predict(lm1, newdata = newobs, interval = "prediction")
lines(newobs$x1, pimat[, 2], col = 4)
lines(newobs$x1, pimat[, 3], col = 4)
# and in particular at x1 = 1, 3, 5
newobs <- data.frame(x1 = c(1, 3, 5))
predict(lm1, newdata = newobs, interval = "prediction")
# d) Multiple regression using all five regressors
lm5 <- lm(y ~ ., data = ds5)
summary(lm5)
# also inspect the variance-covariance matrix of the coefficient estimates
vcov(lm5)
# anova() reports type I (sequential) sums of squares, unlike the t-tests:
# each t-test conditions on all other variables being present (type III)
anova(lm5)
# compare these two kinds of sums of squares
# anova() can also compare two models (the models must be nested)
lm14 <- lm(y ~ x1 + x4, data = ds5)
anova(lm1, lm14)
# yes, x4 is significant
anova(lm14, lm5)
# no, the model with all 5 is not significantly better than the one with x1 and x4
# e) Best-subsets selection with the leaps package
library(leaps)
?leaps
leapsCp <- leaps(x = ds5[, 1:5], y = ds5[, 6], method = "Cp", nbest = 5)
names(leapsCp)
cbind(leapsCp$which, leapsCp$size, leapsCp$Cp)
plot(leapsCp$size, leapsCp$Cp)
abline(0, 1)  # good models lie close to the Cp = p line
# the model with x1 and x4 seems the best
# hard to see, so adjust the y-axis
plot(leapsCp$size, leapsCp$Cp, ylim = c(0, 10))
abline(0, 1)
leapsadjr <- leaps(x = ds5[, 1:5], y = ds5[, 6], method = "adjr", nbest = 5)
names(leapsadjr)
cbind(leapsadjr$which, leapsadjr$size, 100 * leapsadjr$adjr2)
# x1 and x4 seem good, but so does x1, x2 and x4, which has the highest adjusted R2
# compare these two?
lm124 <- lm(y ~ x1 + x2 + x4, data = ds5)
anova(lm124, lm14)
# no, the model with x1 + x2 + x4 is not significantly better than x1 + x4
# f) Models with interactions involving x1
lminter <- lm(y ~ x1 * x2 + x1 * x3 + x1 * x4 + x1 * x5, data = ds5)
summary(lminter)
# add the interaction products as explicit columns so leaps can search over them
ds5$x1x2 <- ds5$x1 * ds5$x2
ds5$x1x3 <- ds5$x1 * ds5$x3
ds5$x1x4 <- ds5$x1 * ds5$x4
ds5$x1x5 <- ds5$x1 * ds5$x5
leapsCpinter <- leaps(x = ds5[, c(1:5, 7:10)], y = ds5[, 6], method = "Cp", nbest = 2)
cbind(leapsCpinter$which, leapsCpinter$size, leapsCpinter$Cp)
plot(leapsCpinter$size, leapsCpinter$Cp, ylim = c(0, 10))
abline(0, 1)
# the model with size 5 and Cp 4.3 has pattern 0 1 0 1 0 1 0 1 0,
# i.e. x2 + x4 + x1*x2 + x1*x4 -- might be the best according to Cp?
lminter <- lm(y ~ x2 + x4 + x1:x2 + x1:x4, data = ds5)
summary(lminter)
# but we observe that adjusted R2 is higher for the new model than for the x1 + x4 model
# significant regression and all variables significant
summary(lm14)

# comparing with the x1 and x4 model is hard, since that model is not nested within our new model -
# why? the new model does not contain x1 itself, only interactions with x1.
# in general it is uncommon to include an interaction term without also adding the main effect, that is,
# adding x1:x2 and x1:x4 when x1 is not in the model.
# What to choose at the end of the day? Not so clear... but if in doubt, always go for the smallest model

# We might add x1 to the previous model, and compare this with the x1 and x4 model.


