Recommended exercise 1
Solution can be found here.
Recommended exercise 2
For the least square estimator, the solution can be found in the first session here.
For the maximum likelihood estimator, the solution can be found here.
Recommended exercise 3
# Load the Credit dataset and explore it with pairwise scatter plots
library(ISLR) # Package with data for an Introduction to Statistical
# Learning with Applications in R
# Load Credit dataset
data(Credit)
# Check column names
names(Credit)
## [1] "ID" "Income" "Limit" "Rating" "Cards"
## [6] "Age" "Education" "Gender" "Student" "Married"
## [11] "Ethnicity" "Balance"
# Check dataset shape: 400 observations of 12 variables
dim(Credit)
## [1] 400 12
# Inspect the first rows
head(Credit)
## ID Income Limit Rating Cards Age Education Gender Student Married
## 1 1 14.891 3606 283 2 34 11 Male No Yes
## 2 2 106.025 6645 483 3 82 15 Female Yes Yes
## 3 3 104.593 7075 514 4 71 11 Male No No
## 4 4 148.924 9504 681 3 36 11 Female No No
## 5 5 55.882 4897 357 2 68 16 Male No Yes
## 6 6 80.180 8047 569 4 77 10 Male No No
## Ethnicity Balance
## 1 Caucasian 333
## 2 Asian 903
## 3 Asian 580
## 4 Asian 964
## 5 Caucasian 331
## 6 Caucasian 1151
# Select variables to plot: the response (Balance) plus the
# quantitative predictors
pairwise_scatter_data <- Credit[,c("Balance", "Age", "Cards", "Education",
"Income", "Limit", "Rating")]
# Simplest possible pairwise scatter plot (base graphics)
pairs(pairwise_scatter_data)
# More interesting but slower pairwise plot from package GGally
library(GGally)
ggpairs(data=pairwise_scatter_data)
Check here for a quick get-started guide to ggpairs.
Recommended exercise 4
From a linear algebra perspective, when \(n < p\) we have more unknowns than equations, leading to either no solution or multiple solutions to the regression problem.
In other words, the design matrix \(X\) does not have full column rank, so \(X^TX\) is not invertible and we are therefore unable to directly compute
\[ \hat{\beta} = (X^TX)^{-1} X^T y \]
Recommended exercise 5
# Exclude 'ID' column: it is a row identifier, not a predictor
credit_data <- subset(Credit, select=-c(ID))
# Counting the dummy variables as well
# (Gender/Student/Married contribute one dummy each, Ethnicity two)
credit_data_number_predictors <- 11
# Take a look at the data
head(credit_data)
## Income Limit Rating Cards Age Education Gender Student Married
## 1 14.891 3606 283 2 34 11 Male No Yes
## 2 106.025 6645 483 3 82 15 Female Yes Yes
## 3 104.593 7075 514 4 71 11 Male No No
## 4 148.924 9504 681 3 36 11 Female No No
## 5 55.882 4897 357 2 68 16 Male No Yes
## 6 80.180 8047 569 4 77 10 Male No No
## Ethnicity Balance
## 1 Caucasian 333
## 2 Asian 903
## 3 Asian 580
## 4 Asian 964
## 5 Caucasian 331
## 6 Caucasian 1151
# Summary statistics
summary(credit_data)
## Income Limit Rating Cards
## Min. : 10.35 Min. : 855 Min. : 93.0 Min. :1.000
## 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2 1st Qu.:2.000
## Median : 33.12 Median : 4622 Median :344.0 Median :3.000
## Mean : 45.22 Mean : 4736 Mean :354.9 Mean :2.958
## 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2 3rd Qu.:4.000
## Max. :186.63 Max. :13913 Max. :982.0 Max. :9.000
## Age Education Gender Student Married
## Min. :23.00 Min. : 5.00 Male :193 No :360 No :155
## 1st Qu.:41.75 1st Qu.:11.00 Female:207 Yes: 40 Yes:245
## Median :56.00 Median :14.00
## Mean :55.67 Mean :13.45
## 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :98.00 Max. :20.00
## Ethnicity Balance
## African American: 99 Min. : 0.00
## Asian :102 1st Qu.: 68.75
## Caucasian :199 Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
# Create train and test set indexes (75% train / 25% test random split)
set.seed(1) # reproducible sampling
train_perc <- 0.75
credit_data_train_index <- sample(1:nrow(credit_data), nrow(credit_data)*train_perc)
# Negative indices select every row NOT in the training set
credit_data_test_index <- (-credit_data_train_index)
# Create train and test set
credit_data_training <- credit_data[credit_data_train_index, ]
credit_data_testing <- credit_data[credit_data_test_index, ]
library(leaps)
# Perform best subset selection using all the predictors and the training data
best_subset_method=regsubsets(Balance~.,credit_data_training,nvmax=credit_data_number_predictors)
# Save summary obj
best_subset_method_summary=summary(best_subset_method)
# Plot RSS, Adjusted R^2, C_p and BIC against model size,
# marking the best size under each criterion with a red dot
par(mfrow=c(2,2))
plot(best_subset_method_summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(best_subset_method_summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
bsm_best_adjr2 = which.max(best_subset_method_summary$adjr2)
points(bsm_best_adjr2,best_subset_method_summary$adjr2[bsm_best_adjr2], col="red",cex=2,pch=20)
plot(best_subset_method_summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
bsm_best_cp=which.min(best_subset_method_summary$cp)
points(bsm_best_cp,best_subset_method_summary$cp[bsm_best_cp],col="red",cex=2,pch=20)
bsm_best_bic=which.min(best_subset_method_summary$bic)
plot(best_subset_method_summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(bsm_best_bic,best_subset_method_summary$bic[bsm_best_bic],col="red",cex=2,pch=20)
# Prediction method for objects returned by leaps::regsubsets(), which
# does not ship its own predict() method. Reconstructs fitted values for
# the sub-model containing `id` predictors.
#
# Args:
#   object:  a fitted regsubsets object.
#   newdata: data.frame containing the columns referenced by the formula
#            of the original regsubsets() call.
#   id:      number of predictors in the sub-model to predict from.
#   ...:     unused; kept so the signature matches the predict() generic.
#
# Returns an (nrow(newdata) x 1) numeric matrix of predictions.
predict.regsubsets <- function(object, newdata, id, ...) {
  # The model formula is the second element of the original call.
  form <- as.formula(object$call[[2]])
  # Full design matrix for the new data (factors expanded to dummies).
  mat <- model.matrix(form, newdata)
  # Coefficients of the sub-model with `id` predictors.
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  # drop = FALSE keeps `mat[, xvars]` a matrix even when only one
  # column is selected, so the matrix product below stays valid.
  mat[, xvars, drop = FALSE] %*% coefi
}
# Create indexes to divide the training data between folds:
# each row gets a random fold label in 1..k, so fold sizes are
# only approximately equal
k=10
set.seed(1) # reproducible fold assignment
folds=sample(1:k,nrow(credit_data_training),replace=TRUE)
# One row per fold, one column per model size
cv.errors=matrix(NA,k,credit_data_number_predictors, dimnames=list(NULL, paste(1:credit_data_number_predictors)))
# Perform CV: fit best subset selection on the k-1 remaining folds,
# then compute the held-out-fold MSE for every model size
for(j in 1:k){
best_subset_method=regsubsets(Balance~.,data=credit_data_training[folds!=j,],nvmax=credit_data_number_predictors)
for(i in 1:credit_data_number_predictors){
# dispatches to the predict.regsubsets() helper defined above
pred=predict(best_subset_method,credit_data_training[folds==j,],id=i)
cv.errors[j,i]=mean( (credit_data_training$Balance[folds==j]-pred)^2)
}
}
# Compute mean cv errors for each model size
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 54740.59 27601.37 11488.15 10073.22 10242.75 10320.48 10390.12 10302.00
## 9 10 11
## 10352.95 10282.19 10265.57
# Plot the mean cv errors: the curve flattens after 4 predictors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
# Fit the selected model using the whole training data
# and compute test error
# models selected
number_predictors_selected <- 4
# Create info for lm call
# NOTE(review): at this point best_subset_method holds the fit from the
# last CV fold (j == k), not a fit on the full training data, so the
# selected variable names could in principle differ -- confirm intended
variables <- names(coef(best_subset_method,id=number_predictors_selected))
variables <- variables[!variables %in% "(Intercept)"]
# Recover the original formula (Balance ~ .) from the regsubsets call
bsm_formula <- as.formula(best_subset_method$call[[2]])
# Design matrix restricted to the selected predictors
bsm_design_matrix <- model.matrix(bsm_formula,credit_data_training)[, variables]
bsm_data_train <- data.frame(Balance = credit_data_training$Balance, bsm_design_matrix)
# Fit a standard linear model using only the selected
# predictors on the training data
model_best_subset_method <- lm(formula = bsm_formula, bsm_data_train)
summary(model_best_subset_method)
##
## Call:
## lm(formula = bsm_formula, data = bsm_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -199.317 -79.156 -9.477 52.443 307.503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.107e+02 1.776e+01 -28.748 < 2e-16 ***
## Income -7.881e+00 2.651e-01 -29.727 < 2e-16 ***
## Limit 2.686e-01 4.127e-03 65.076 < 2e-16 ***
## Cards 2.379e+01 3.974e+00 5.987 6.19e-09 ***
## StudentYes 4.348e+02 1.851e+01 23.494 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.81 on 295 degrees of freedom
## Multiple R-squared: 0.9557, Adjusted R-squared: 0.9551
## F-statistic: 1590 on 4 and 295 DF, p-value: < 2.2e-16
# Make predictions on the test set
bsm_design_matrix_test <- model.matrix(bsm_formula,credit_data_testing)[, variables]
bsm_predictions <- predict(object = model_best_subset_method, newdata = as.data.frame(bsm_design_matrix_test))
# Compute test squared errors
bsm_squared_errors <- (credit_data_testing$Balance-bsm_predictions)^2
# Keep per-observation squared errors in a data.frame so the errors of
# the other methods can be appended for the final comparison
squared_errors <- data.frame(bsm_squared_errors=bsm_squared_errors)
# test MSE
mean(bsm_squared_errors)
## [1] 10413.11
Recommended exercise 6
Similar analysis as the previous exercise: simply replace Best Subset Selection (best_subset_method=regsubsets(Balance~.,credit_data,nvmax=credit_data_number_predictors)
) by Forward Stepwise Selection (regfit.fwd=regsubsets(Balance~.,credit_data,nvmax=credit_data_number_predictors,method="forward")
), Backward Stepwise Selection (regfit.bwd=regsubsets(Balance~.,credit_data,nvmax=credit_data_number_predictors,method="backward")
) and Hybrid Stepwise Selection (regfit.hybrid=regsubsets(Balance~.,credit_data,nvmax=credit_data_number_predictors,method="seqrep")
)
Recommended exercise 7
library(glmnet) # Package Lasso and Elastic-Net Regularized
# Generalized Linear Models
# glmnet needs a numeric design matrix: model.matrix expands the factors
# into dummy variables, and [,-1] drops the intercept column
x_train <- model.matrix(Balance~.,credit_data_training)[,-1]
y_train <- credit_data_training$Balance
x_test <- model.matrix(Balance~.,credit_data_testing)[,-1]
y_test <- credit_data_testing$Balance
# alpha = 0 selects the ridge penalty; fits over a grid of lambdas
ridge_mod <- glmnet(x_train,y_train,alpha=0)
# Choose lambda by cross-validation on the training data
set.seed(1) # reproducible CV folds
cv.out=cv.glmnet(x_train, y_train,alpha=0)
plot(cv.out)
# Lambda with the smallest mean cross-validated error
best_lambda_ridge <- cv.out$lambda.min
best_lambda_ridge
## [1] 44.18941
# Predict on the test set at the CV-selected lambda
ridge_predictions = predict(ridge_mod,s=best_lambda_ridge,newx=x_test)
ridge_square_errors <- as.numeric((ridge_predictions-y_test)^2)
# Append to the running comparison of per-observation squared errors
squared_errors <- data.frame(ridge_square_errors = ridge_square_errors, squared_errors)
Recommended exercise 8
# alpha = 1 selects the lasso penalty (reuses x_train/y_train from above)
lasso_mod <- glmnet(x_train,y_train,alpha=1)
# Choose lambda by cross-validation on the training data
set.seed(1) # reproducible CV folds
cv.out=cv.glmnet(x_train, y_train,alpha=1)
plot(cv.out)
# Lambda with the smallest mean cross-validated error
best_lambda_lasso <- cv.out$lambda.min
best_lambda_lasso
## [1] 0.7201774
# Predict on the test set at the CV-selected lambda
lasso_predictions = predict(lasso_mod,s=best_lambda_lasso,newx=x_test)
lasso_square_errors <- as.numeric((lasso_predictions-y_test)^2)
# Append to the running comparison of per-observation squared errors
squared_errors <- data.frame(lasso_square_errors = lasso_square_errors, squared_errors)
Recommended exercise 9
# Ordinary least squares with all predictors, as a baseline
standard_lm <- lm(formula = Balance~., data = credit_data_training)
standard_lm_predictions <- predict(object = standard_lm, newdata = credit_data_testing)
standard_lm_square_errors <- as.numeric((standard_lm_predictions-credit_data_testing$Balance)^2)
# Append to the running comparison of per-observation squared errors
squared_errors <- data.frame(standard_lm_square_errors = standard_lm_square_errors, squared_errors)
# Comparison between methods as measured by MSE on an independent test set
library(reshape2)
library(ggplot2)
# melt() stacks the per-method squared-error columns into long format
# so ggplot can draw one boxplot per method
ggplot(melt(squared_errors)) + geom_boxplot(aes(variable, value))
Recommended exercise 10
# PCA on the full design matrix (factors expanded to dummy variables,
# intercept column dropped)
x <- model.matrix(Balance~.,credit_data)[,-1]
# center/scale so PCA operates on the correlation matrix; otherwise
# large-variance columns such as Limit would dominate the components
credit_pca <- prcomp(x, center = TRUE, scale. = TRUE)
print(credit_pca)
## Standard deviations (1, .., p=11):
## [1] 1.66007642 1.26685832 1.05356810 1.04926273 1.00322222 0.98576693
## [7] 0.97830708 0.90714714 0.63722533 0.51174012 0.04617646
##
## Rotation (n x k) = (11 x 11):
## PC1 PC2 PC3 PC4
## Income -0.542206953 0.029036783 -0.033270648 -6.564051e-05
## Limit -0.586332930 0.017502630 -0.024351723 4.678929e-02
## Rating -0.586751867 0.014971105 -0.004630758 3.687909e-02
## Cards -0.019086978 -0.008549632 0.479005750 -2.720228e-01
## Age -0.122783390 -0.071116603 0.107188498 -4.787335e-01
## Education 0.026797471 0.096557225 -0.475418336 1.990653e-01
## GenderFemale -0.002519860 0.052811098 -0.334014058 -4.207748e-02
## StudentYes 0.002276904 0.125422970 -0.618650527 -2.963169e-01
## MarriedYes -0.026218561 0.094278214 0.125718135 7.389864e-01
## EthnicityAsian 0.032769895 0.696759512 0.105703127 6.686132e-03
## EthnicityCaucasian -0.004070799 -0.686505857 -0.100240068 1.338718e-01
## PC5 PC6 PC7 PC8
## Income -0.02816858 0.02297156 -0.04086888 0.03502243
## Limit 0.02393728 0.06109959 0.02753603 -0.07998103
## Rating 0.03044748 0.04901285 0.06298342 -0.07474080
## Cards 0.07450235 -0.28313105 0.77070237 -0.10917776
## Age -0.29468570 -0.58353604 -0.35860755 0.41270188
## Education -0.58335540 -0.40244676 0.21601791 -0.41794930
## GenderFemale 0.74620452 -0.51375214 -0.10203846 -0.22746095
## StudentYes 0.05874438 0.20236658 0.42777847 0.53366278
## MarriedYes 0.04850438 -0.32419986 0.13571418 0.53676497
## EthnicityAsian 0.02125450 0.01284830 -0.04334986 0.01824866
## EthnicityCaucasian 0.04400214 -0.02306227 0.10322555 0.06987098
## PC9 PC10 PC11
## Income -0.016018928 0.836411394 0.0017092799
## Limit -0.010697575 -0.379489022 0.7053633132
## Rating -0.005366527 -0.373834509 -0.7081335719
## Cards 0.005357720 0.059511066 0.0305564113
## Age -0.048994454 -0.102540342 0.0005901693
## Education -0.021973159 0.014172918 -0.0036133922
## GenderFemale 0.014513597 0.027300122 0.0001327203
## StudentYes 0.022068488 -0.032119354 0.0044219212
## MarriedYes 0.119017609 -0.018248384 0.0051766487
## EthnicityAsian -0.706522468 -0.014783578 -0.0035849536
## EthnicityCaucasian -0.694731116 0.008145839 -0.0004464620
# Scree plot: variance of each principal component
plot(credit_pca, type = "l")
# Proportion of variance explained, per component and cumulative
summary(credit_pca)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6601 1.2669 1.0536 1.0493 1.0032 0.98577 0.97831
## Proportion of Variance 0.2505 0.1459 0.1009 0.1001 0.0915 0.08834 0.08701
## Cumulative Proportion 0.2505 0.3964 0.4973 0.5974 0.6889 0.77727 0.86427
## PC8 PC9 PC10 PC11
## Standard deviation 0.90715 0.63723 0.51174 0.04618
## Proportion of Variance 0.07481 0.03691 0.02381 0.00019
## Cumulative Proportion 0.93908 0.97600 0.99981 1.00000
The first PC explains about \(25\%\) of the variability in the data. The second PC then explains an additional \(15\%\). From the third PC up to the \(8\)th PC, the extra variability explained per PC ranges between \(7.5\%\) and \(10\%\), dropping to \(3.6\%\) at the \(9\)th PC. So I would likely use \(8\) PCs for the Credit dataset.
Recommended exercise 11
# Principal Component Regression
library(pls)
set.seed(1) # reproducible CV folds
# scale=TRUE standardises the predictors; validation="CV" requests
# cross-validated MSEP for each number of components
pcr_model <- pcr(Balance~., data=credit_data_training,scale=TRUE, validation="CV")
# Plot CV MSEP against number of components to choose ncomp
validationplot(pcr_model,val.type="MSEP")
# Predict on the test set with the chosen number of components
pcr_predictions = predict(pcr_model,credit_data_testing,ncomp=10)
pcr_square_errors <- as.numeric((pcr_predictions-credit_data_testing$Balance)^2)
# Append to the running comparison of per-observation squared errors
squared_errors <- data.frame(pcr_square_errors = pcr_square_errors, squared_errors)
# Test MSE for PCR
mean(pcr_square_errors)
## [1] 9724.954
ggplot(melt(squared_errors)) + geom_boxplot(aes(variable, value))
Recommended exercise 12
# Partial Least Squares regression
library(pls)
set.seed(1) # reproducible CV folds
# scale=TRUE standardises the predictors; validation="CV" requests
# cross-validated MSEP for each number of components
plsr_model <- plsr(Balance~., data=credit_data_training,scale=TRUE, validation="CV")
# Plot CV MSEP against number of components to choose ncomp
validationplot(plsr_model,val.type="MSEP")
# Predict on the test set with the chosen number of components
plsr_predictions = predict(plsr_model,credit_data_testing,ncomp=3)
plsr_square_errors <- as.numeric((plsr_predictions-credit_data_testing$Balance)^2)
# Append to the running comparison of per-observation squared errors
squared_errors <- data.frame(plsr_square_errors = plsr_square_errors, squared_errors)
# Test MSE for PLSR
mean(plsr_square_errors)
## [1] 10779.06
ggplot(melt(squared_errors)) + geom_boxplot(aes(variable, value))
# Test MSE of every method, side by side
colMeans(squared_errors)
## plsr_square_errors pcr_square_errors
## 10779.063 9724.954
## standard_lm_square_errors lasso_square_errors
## 9854.299 9776.880
## ridge_square_errors bsm_squared_errors
## 13730.108 10413.105