--- title: "Lecture 4: Regression" author: "Bob O'Hara" date: "bob.ohara@ntnu.no" output: beamer_presentation --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) ``` ## Regression Now we will get to modelling ```{r GetData} Data.all <- read.csv("../Data/EggBodySize.csv") Data <- Data.all[!apply(Data.all, 1, function(x) any(is.na(x))),c("species", "development.category", "Egg.mass..g.", "female.mass..g.", "mm..n..mean.femur")] Data$logEggMass <- log(Data$Egg.mass..g.) Data$logFemaleMass <- log(Data$female.mass..g.) ``` ## Two Reasons for Models Inference - Does giving a hurricane a male name increase the amount of damage it does? - How much difference is there between hurricanes with male or female names? Prediction - if we call the next hurricane Donald, how much damage will it do? ## The Model Idea We have a variable, $Y$, which we want to explain with a covariate, $X$ e.g. we want to explain egg size in species of bird by body size Here we will use a straight line - more complex models come later: they are sums of straight lines ## Some Data ```{r PlotData} par(mar=c(4.1,4.1,1,1)) plot(Data$logFemaleMass, Data$logEggMass, xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="darkgreen") ``` ## What is the best line? ```{r PlotNoLines} AddSubsampleLine <- function(dat, sz=10) { dat.sub <- dat[sample.int(nrow(dat), size=sz),] mod.sub <- lm(logEggMass ~ logFemaleMass, data=dat.sub) abline(mod.sub) } par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") ``` ## What is the best line? One of these? ```{r PlotLines} AddSubsampleLine <- function(dat, sz=10) { dat.sub <- dat[sample.int(nrow(dat), size=sz),] mod.sub <- lm(logEggMass ~ logFemaleMass, data=dat.sub) abline(mod.sub) } par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") # mod <- lm(logEggMass ~ logFemaleMass, data=Data) replicate(10, AddSubsampleLine(dat=Data, sz=5)) ``` ## Excercise: What should a straight line be? For each plot, draw what you think is the best line ```{r Data, echo=FALSE, fig.height=4.5} PlotStuff <- function(x, y, mean = FALSE, lines=FALSE) { plot(x, y, xlim=c(0,3), ylim=c(0,3), ann=FALSE, axes = FALSE, cex=1.5) axis(1, at=0:3) axis(2, at=0:3, las=1) if(lines) abline(reg = lm(y~x), lty=3) if(mean) { Mean.y <- tapply(y, list(factor(x)), mean) points(as.numeric(names(Mean.y)), Mean.y, pch=4, col=2, cex=2) # if(lines) abline(reg = lm(Mean.y~as.numeric(names(Mean.y))), lty=3, col=2) } } par(mfrow=c(2,4), mar=c(2,2,2,2), oma=c(0,0,0,0)) PlotStuff(x=1:3, y=1:3) PlotStuff(x = c(1,1,2), y = c(1,2,2)) plot(anscombe$x1, anscombe$y1, xlab="x", ylab="y") plot(anscombe$x2, anscombe$y2, xlab="x", ylab="y") PlotStuff(x = c(1,1,2,2), y = c(1,2,1,2)) PlotStuff(x = c(0.5,1,1.5,1.5,2, 2.5), y = c(1.5,2.5,0.5, 2.5, 1.5,2.5)) plot(anscombe$x3, anscombe$y3, xlab="x", ylab="y") plot(anscombe$x4, anscombe$y4, xlab="x", ylab="y") ``` ## Defining a best line We want to fit a straight line: $$ y_i = \alpha + \beta x_i $$ But there is error that the line cannot explain, so we change the model to $$ y_i = \alpha + \beta x_i + \varepsilon_i $$ with$\sum \varepsilon_i = 0$ ## Maximum Likelihood We can assume that the errors are normally distributed: $$ \varepsilon_i \sim N(0, \sigma^2) $$ The log-likelihood is $$ l(\mathbf{y}|\mathbf{x}, \alpha, \beta, \sigma^2) = - \frac{n}{2} \log{\sigma^2} - \sum_{i=1}^n {\frac{(y_i - \alpha - \beta x_i)^2}{2\sigma^2} } $$ ## Maximum Likelihood We can assume that the errors are normally distributed: $$ \varepsilon_i \sim N(0, \sigma^2) $$ The log-likelihood is $$ l(\mathbf{y}|\mathbf{x}, \alpha, \beta, \sigma^2) = - \frac{n}{2} \log{\sigma^2} - \sum_{i=1}^n {\frac{ \textcolor{red}{(y_i - \alpha - \beta x_i)^2} }{2\sigma^2} } $$ This is quadratic in $y_i$, so this is the same minimising the sums of squares, i.e. the least squares estimate ## Drawing a best line $$ y_i = \alpha + \beta x_i + \varepsilon_i $$ ```{r PlotLines1} mod <- lm(logEggMass ~ logFemaleMass, data=Data) ss <- var(Data[,c("logFemaleMass", "logEggMass")]); delta <- 1 slope <- (ss[2,2] - delta*ss[1,1] + sqrt((ss[2,2]-delta*ss[1,1])^2 +4*delta*ss[1,2]^2))/(2*ss[1,2]) intercept <- mean(Data$logEggMass) - slope*mean(Data$logFemaleMass) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlim=c(4,8), ylim=c(0,6.2), xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") abline(a=intercept, b=slope) ``` ## Drawing a best line $$ y_i = \alpha + \beta x_i + \varepsilon_i $$ ```{r PlotLines2} Hlight <- which(resid(mod)==max(resid(mod))) Expr2 <- expression(paste("(", x[i], ", " , alpha, "+", beta, x[i], "+", epsilon[i], ")")) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlim=c(4,8), ylim=c(0,6.2), xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") abline(a=intercept, b=slope) points(Data$logFemaleMass[Hlight], Data$logEggMass[Hlight], col="darkblue", pch=16, cex=1.2) points(Data$logFemaleMass[Hlight], Data$logEggMass[Hlight], col="red", pch=1, cex=6) text(Data$logFemaleMass[Hlight]-0.2, Data$logEggMass[Hlight], Expr2, adj=1, cex=1.9) ``` ## Drawing a best line $$ y_i = \alpha + \beta x_i + \varepsilon_i $$ ```{r PlotLines3} Expected <- intercept + slope*Data$logFemaleMass Expr <- expression(paste("(", x[i], ", " , alpha, "+", beta, x[i], ")")) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlim=c(4,8), ylim=c(0,6.2), xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") abline(a=intercept, b=slope) points(Data$logFemaleMass[Hlight], Data$logEggMass[Hlight], col="darkblue", pch=16, cex=1.2) points(Data$logFemaleMass[Hlight], Expected[Hlight], col="darkblue", pch=16, cex=1.2) text(Data$logFemaleMass[Hlight]-0.4, Expected[Hlight]-0.8, Expr, adj=0, cex=1.9) ``` ## Drawing a best line $$ y_i = \alpha + \beta x_i + \varepsilon_i $$ ```{r PlotLines4} # Expr3 <- expression(paste("(", x[i], ", " , y[i] - alpha, "-", beta, x[i], ")")) Expr3 <- expression(paste("(", y[i] - alpha, "-", beta, x[i], ")")) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlim=c(4,8), ylim=c(0,6.2), xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") abline(a=intercept, b=slope) points(Data$logFemaleMass[Hlight], Data$logEggMass[Hlight], col="darkblue", pch=16, cex=1.2) points(Data$logFemaleMass[Hlight], Expected[Hlight], col="darkblue", pch=16, cex=1.2) arrows(x0 = Data$logFemaleMass[Hlight], y0 = Expected[Hlight], x1 = Data$logFemaleMass[Hlight], y1 = Data$logEggMass[Hlight], code=3) text(Data$logFemaleMass[Hlight]-0.1, (Expected[Hlight] + Data$logEggMass[Hlight])/2, Expr3, adj=1, cex=1.9) ``` ## Drawing a best line We minimise the squares of these distances ```{r PlotLines5} # Expr4 <- expression(paste("(", x[i], ", " , y[i] - alpha, "-", beta, x[i], ")", 2)) Expr4 <- expression(group("(", list(y[i] - alpha - beta*x[i]), ")" )^2 ) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlim=c(4,8), ylim=c(0,6.2), xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen") abline(a=intercept, b=slope) points(Data$logFemaleMass[Hlight], Data$logEggMass[Hlight], col="darkblue", pch=16, cex=1.2) segments(x0 = Data$logFemaleMass, y0 = Expected, x1 = Data$logFemaleMass, y1 = Data$logEggMass, col="grey70") text(Data$logFemaleMass[Hlight]-0.1, (Expected[Hlight] + Data$logEggMass[Hlight])*0.55, Expr4, adj=1, cex=1.9) ``` ## MLEs We could go through the maths, but $$ \begin{aligned} \hat{\alpha} &= \bar{y} - \hat{\beta} \bar{x} \\ \hat{\beta} &= \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y}) }{\sum_{i=1}^{N} \left( x_i - \bar{x} \right)^2} \\ \hat{\sigma^2} &= \frac{1}{N} \sum_{i=1}^{N} \left(y_i - \hat{\alpha} - \hat{\beta}x_i \right)^2 \\ \end{aligned} $$ ## What these Mean: The Intercept The line goes through $(\hat{x}, \hat{y})$, and is extrapolated backwards to $x=0$ $$ \hat{\alpha} = \bar{y} - \hat{\beta} \bar{x} $$ ```{r ExplainIntercept, fig.height=4} mnlnFMass <- mean(Data$logFemaleMass, na.rm = TRUE) mnlnEggMass <- mean(Data$logEggMass, na.rm = TRUE) mod <- lm(logEggMass ~ logFemaleMass, data=Data) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen", xlim=c(0,max(Data$logFemaleMass, na.rm = TRUE))) abline(mod) points(mnlnFMass, mnlnEggMass, pch=4, cex=2) text(mnlnFMass+0.1, mnlnEggMass-0.8, expression(paste("(", bar(x), ", ", bar(y), ")")), adj=0, cex=2) ``` ## What these Mean: The slope $$ \hat{\beta} = \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y}) }{\sum_{i=1}^{N} \left( x_i - \bar{x} \right)^2} $$ This is $Cov(x, y)/Var(x)$ ## What these Mean: the variance $$ \hat{\sigma^2} = \frac{1}{N} \sum_{i=1}^{N} \left(y_i - \hat{\alpha} - \hat{\beta}x_i \right)^2 $$ Set $\hat{\mu_i} = \hat{\alpha} + \hat{\beta}x_i$ and we get $$ \hat{\sigma^2} = \frac{1}{N} \sum_{i=1}^{N} \left(y_i - \hat{\mu_i} \right)^2 $$ So, the same as the ML estimate of the variance, once we have corrected the values to their means ## The Example For this data we have $y_i =$ `r round(coef(mod)[1], 2)` + `r round(coef(mod)[2],2)` $x_i + \varepsilon_i$ $\varepsilon_i \sim N \left( 0,\right.$ `r round(summary(mod)$sigma, 2)` $^2)$ ```{r Example, fig.height=4} par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen", xlim=c(0,max(Data$logFemaleMass, na.rm = TRUE))) abline(mod) points(mnlnFMass, mnlnEggMass, pch=4, cex=2) text(mnlnFMass+0.1, mnlnEggMass-0.8, expression(paste("(", bar(x), ", ", bar(y), ")")), adj=0, cex=2) ``` ## Interpreting The Example For this data we have $y_i =$ `r round(coef(mod)[1], 2)` + `r round(coef(mod)[2],2)` $x_i + \varepsilon_i$ $\varepsilon_i \sim N \left( 0,\right.$ `r round(summary(mod)$sigma, 2)` $^2)$ If log Body Mass increases by 1 - body mass increases by $e^1=$ `r round(exp(1), 2)` times - egg mass increases by `r round(coef(mod)[2],2)` (i.e. `r round(exp(coef(mod)[2]),2)` times) so this is a bit less than proportional: it is about 3/4 ## Mis-Interpreting The Example $y_i =$ `r round(coef(mod)[1], 2)` + `r round(coef(mod)[2],2)` $x_i + \varepsilon_i$ $\varepsilon_i \sim N \left( 0,\right.$ `r round(summary(mod)$sigma, 2)` $^2)$ A mass-less bird would have an egg of negative mass - why does the model still make sense? ## How good is the model? - how good (i.e. precise) are the estimates? - how well does it explain the data? ## How precise are the estimates? The likelihood for $\alpha$ and $\beta$ is correlated ```{R JointLHood, results='hide', cache=TRUE} CalcLhood <- function(dat, alpha, slope, sigma=NULL, X="X", Y="Y") { if(!X%in%names(dat)) stop("X should be a name of dat") if(!Y%in%names(dat)) stop("Y should be a name of dat") mu <- alpha + slope*dat[,X] if(is.null(sigma)) { resid <- dat[,Y] - mu sigma <- sqrt(mean(resid^2)) } lh <- dnorm(dat[,Y], mean = mu, sd = sigma, log=TRUE) sum(lh) } CIs <- confint(mod, level = 0.99) Lhoods <- expand.grid( alpha = seq(CIs["(Intercept)", 1], CIs["(Intercept)", 2], length=501), beta = seq(CIs["logFemaleMass", 1], CIs["logFemaleMass", 2], length=501) # alpha = seq(-10, 9, length=5), beta = seq(-1, 2, length=5) ) MaxLH <- CalcLhood(Data, alpha=coef(mod)["(Intercept)"], slope=coef(mod)["logFemaleMass"], sigma=NULL, X="logFemaleMass", Y="logEggMass") Lhoods$lhood <- apply(Lhoods, 1, function(pp) CalcLhood(Data, alpha=pp["alpha"], slope=pp["beta"], sigma=NULL, X="logFemaleMass", Y="logEggMass")) - MaxLH Lhoods$Explhood <- exp(Lhoods$lhood) library(lattice) levelplot(Explhood ~ alpha + beta, data=Lhoods, xlim=CIs["(Intercept)", ], ylim=CIs["logFemaleMass", ], xlab=list(label=expression(alpha), cex=2), ylab=list(label=expression(beta), cex=2), rot=0) trellis.focus("panel", 1, 1, highlight=FALSE) lpoints(coef(mod)["(Intercept)"], coef(mod)["logFemaleMass"], pch=4, cex=2, col=1) trellis.unfocus() ``` ## Why is there a correlation? The line has to go through $(\bar{x}, \bar{y})$, so if we increase the intercept, we have to decrease the slope ```{r ExampleCorr, fig.height=4} slope2 <- coef(mod)[2]-0.2 intercept2 <- mean(Data$logEggMass) - slope2*mean(Data$logFemaleMass) par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xaxs="i", xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen", xlim=c(0,max(Data$logFemaleMass, na.rm = TRUE))) abline(mod) abline(a=intercept2, b=slope2, col=2) ``` ## Standard Errors The standard errors are not trivial to obtain, especially as they are correlated. We are primarily interested in the standard errors for $\alpha$ and $\beta$ $$ Var \left( \begin{array}{c} \alpha \\ \beta \end{array}\right) = \frac{ \hat{\sigma}^2}{\sum(x_i - \bar{x})^2} \left( \begin{array}{cc} \sum x_i^2 & -\sum x_i \\ -\sum x_i & n \end{array} \right) $$ Note that the covariance is 0 when $\sum x_i = 0$ ## Confidence Intervals From the standard errors we can calculate the asymptotic confidence intervals for the parameters, e.g. $$ \left(\hat{\alpha} - 1.96 s_{\alpha}, \hat{\alpha} + 1.96 s_{\alpha} \right) $$ where $s_{\alpha}$ is the standard error for $\alpha$ (in practice, get this from the computer) e.g the 95% confidence interval for the slope is (`r round(summary(mod)$coefficients[2,1] - 1.96*summary(mod)$coefficients[2,2],2)`, `r round(summary(mod)$coefficients[2,1] + 1.96*summary(mod)$coefficients[2,2],2)`), i.e. we can be sure it is < 1, and is fairly close to 3/4 (with less data, a t-distirbution makes more sense) ## Prediction Our Eclectus, Freyja and Eric, are thinking of breeding (at least Freyja is). How large will their egg be? ![](FrejyaEric.jpeg) ## Prediction Female Eclectus' body size: 390g - 445g (so median of 417.5g) Point estimate: $E(y) = `r round(coef(mod)[1], 2)` + `r round(coef(mod)[2],2)` \times log(417.5) = `r round(coef(mod)[1] + coef(mod)[2]*log(417.5),2)`$ So, a mass of exp(`r round(coef(mod)[1] + coef(mod)[2]*log(417.5),2)`) = `r round(exp(coef(mod)[1] + coef(mod)[2]*log(417.5)),1)`g But this is uncertain... (remember, we have log-log transformed the data) ## Prediction Intervals ```{R PredIntervals} mu.hat <- coef(mod)[1] + coef(mod)[2]*log(417.5) sd <- summary(mod)$sigma LCI <- mu.hat - 1.96*sd UCI <- mu.hat + 1.96*sd ``` Just as we have confidence intervals, we can also have prediction intervals We assume our data is normally distributed (given its mean), so if we knew the prameters it would be $$ y_{pred} \sim N(\alpha + \beta x_{pred}, \sigma^2) $$ e.g. the 95% confidence interval would be $e^{(`r round(mu.hat,2)` - 1.96 \times `r round(sd,2)`, `r round(mu.hat,2)` + 1.96 \times `r round(sd,2)`)}$ = (`r round(exp(LCI), 1)`, `r round(exp(UCI), 1)`) ## Full Prediction Intervals We also take into account the uncertainty in the data (and thus parameter estimates), so the standard deviation of the prediction becomes TZYPOES!!!!! $$ \sigma_{pred} = \sqrt{1 + \frac{1}{n} + \frac{(x_{pred} - \bar{x})^2}{\sum (x_{i} - \bar{x})^2}} $$ We also have to assume a t-ditribution (with n-1 degrees of freedom), to account for the uncertainty in $\hat{\sigma}^2$ $$ y_{pred} \sim t(\alpha + \beta x_{pred}, \sigma^2, n-1) $$ ## Full Prediction Intervals ```{r Preds, fig.height=4} x.pred <- seq(min(Data$logFemaleMass), max(Data$logFemaleMass), length=10) Pred <- predict(mod, newdata = data.frame(logFemaleMass=x.pred), interval = "prediction") par(mar=c(4.1,4.1,1,1), lwd=2) plot(Data$logFemaleMass, Data$logEggMass, xaxs="i", xlab="Log body mass", ylab="Log egg mass", main="", lwd=3, col="lightgreen", xlim=c(0,max(Data$logFemaleMass, na.rm = TRUE))) polygon(c(x.pred, rev(x.pred)), c(Pred[,"lwr"], rev(Pred[,"upr"])), col="lightblue", border=NA) abline(mod) points(Data$logFemaleMass, Data$logEggMass, col="darkblue") ``` ## Next... Friday: Exercises (at last!) https://www.math.ntnu.no/emner/TMA4268/2018v/1Intro/Rbeginner.html for masochists: https://www.math.ntnu.no/emner/TMA4268/2018v/1Intro/Rintermediate.html Next Week: How well does the model actually fit the data? - Grafen & Hails Chapter 2.4 - 2.7