ds <- read.table("http://data.princeton.edu/wws509/datasets/cuse.dat",
header = TRUE)
head(ds)
## age education wantsMore notUsing using
## 1 <25 low yes 53 6
## 2 <25 low no 10 4
## 3 <25 high yes 212 52
## 4 <25 high no 50 10
## 5 25-29 low yes 60 14
## 6 25-29 low no 19 10
Explain fit0
:
fit0 <- glm(cbind(using, notUsing) ~ 1, data = ds, family = "binomial")
fit0 # see that residual deviance is the null deviance
##
## Call: glm(formula = cbind(using, notUsing) ~ 1, family = "binomial",
## data = ds)
##
## Coefficients:
## (Intercept)
## -0.7746
##
## Degrees of Freedom: 15 Total (i.e. Null); 15 Residual
## Null Deviance: 165.8
## Residual Deviance: 165.8 AIC: 239.3
# estimating the probability p using the data can be done directly,
# using the MLE estimate
Estimated coefficient and proportion using contraception: Log-likelihood \[l(\beta)=\beta_0 N_1 - \log(1+\exp(\beta_0))N\] and derivative thereof (score function) \[ s(\beta_0)=\frac{\partial l}{\partial \beta} = N_1-N\frac{\exp(\beta_0)}{1+\exp(\beta_0)}\] and to find the MLE we set \(s(\hat{\beta}_0)=0\), and get
\[ \frac{\exp(\hat{\beta}_0)}{1+\exp(\hat{\beta}_0)}=\frac{N_1}{N}\] Observe that this also means that \[\hat{\pi}=\frac{N_1}{N}\] and further that \[ \hat{\beta}_0=\text{logit}(\frac{N_1}{N})\]
N <- sum(ds$using + ds$notUsing)
N1 <- sum(ds$using)
N2 <- N - N1
qlogis(N1/N) # this is the beta0hat
## [1] -0.7745545
fit0$coefficients
## (Intercept)
## -0.7745545
Estimated variance of \(\hat{\beta}_0\): \(\text{Cov}(\hat{\beta}_0)=F^{-1}(\hat{\beta}_0)\), and since is a scalar we might call it \(\text{Var}(\hat{\beta}_0)\).
\[F(\beta_0) = \sum_{j=1}^G x_jx_j^T n_j \pi_j(1-\pi_j)=\sum_{j=1}^G n_j \pi(1-\pi)=N\pi(1-\pi)\] The inverse is then: \[F^{-1}(\beta_0)=\frac{1}{N\pi (1-\pi)}\] and inserted \(\hat{\pi}=\frac{N_1}{N}\) to give \[\text{Var}(\hat{\beta}_0)=F^{-1}(\hat{\beta}_0)=\frac{1}{N \hat{\pi} (1-\hat{\pi})}=\frac{1}{N \frac{N_1}{N}\frac{N_2}{N}}=\frac{N}{N_1N_2}=\frac{1}{N_1}+\frac{1}{N_2}\]
# Fisher-matrix:
1/N1 + 1/N2
## [1] 0.002881477
# (1x1-matrix)
vcov(fit0) # same as we calculated
## (Intercept)
## (Intercept) 0.002881477
# beta0 is normal with mean fit0$coefficients and variance vcov(fit0)
critval <- qnorm(0.025, lower.tail = FALSE)
ci <- c(fit0$coefficients - critval * sqrt(vcov(fit0)), fit0$coefficients +
critval * sqrt(vcov(fit0)))
confint.default(fit0) # assumes asymptotic gaussian distribution
## 2.5 % 97.5 %
## (Intercept) -0.8797641 -0.6693448
confint(fit0)
## 2.5 % 97.5 %
## -0.8804716 -0.6700014
plogis(ci)
## [1] 0.2932267 0.3386436
# does the same
fit0$family$linkinv(ci)
## [1] 0.2932267 0.3386436
fit1 <- glm(cbind(using, notUsing) ~ wantsMore, data = ds, family = binomial)
summary(fit1) # Note that null-deviance is as fit0, but residual deviance is smaller
##
## Call:
## glm(formula = cbind(using, notUsing) ~ wantsMore, family = binomial,
## data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7091 -1.2756 -0.3467 1.4667 3.5505
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18636 0.07971 -2.338 0.0194 *
## wantsMoreyes -1.04863 0.11067 -9.475 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 74.098 on 14 degrees of freedom
## AIC: 149.61
##
## Number of Fisher Scoring iterations: 4
exp(fit1$coefficients[2])
## wantsMoreyes
## 0.3504178
Parameter interpretation: if you want more kids, you are less probable to use contraceptives (this makes sense). And, when you change from not wanting more kids to wanting more kids the odds for contraceptive will be multiplied by \(\exp(\hat{\beta_1})\) 0.3504178. Remember, when \(\beta<0\) we have a decrease in the odds.
\(H_0\): \(\beta_1 = 0\), \(H_1\): \(\beta_1 \neq 0\)
Wald test: \(W = (\hat{\beta_1}/(\text{SD}(\hat{\beta_1})))^2\)
summary(fit1)
##
## Call:
## glm(formula = cbind(using, notUsing) ~ wantsMore, family = binomial,
## data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7091 -1.2756 -0.3467 1.4667 3.5505
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18636 0.07971 -2.338 0.0194 *
## wantsMoreyes -1.04863 0.11067 -9.475 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 74.098 on 14 degrees of freedom
## AIC: 149.61
##
## Number of Fisher Scoring iterations: 4
# covariance matrix
summary(fit1)$cov.scaled
## (Intercept) wantsMoreyes
## (Intercept) 0.006354067 -0.006354067
## wantsMoreyes -0.006354067 0.012248298
vcov(fit1)
## (Intercept) wantsMoreyes
## (Intercept) 0.006354067 -0.006354067
## wantsMoreyes -0.006354067 0.012248298
# if you want to calculate the Fisher info directly from the formula
# -- but not needed! Can be found by using vcov(fit1)
new_ds <- xtabs(formula = cbind(using, notUsing) ~ wantsMore, data = ds)
fisherlist_1 <- list(x = cbind(c(1, 1), c(0, 1)), n = cbind(new_ds[,
1] + new_ds[, 2]), pi = fit1$family$linkinv(cbind(c(1, 1), c(0, 1)) %*%
fit1$coefficients))
gr1 <- with(fisherlist_1, x[1, ] %*% t(x[1, ]) * n[1, ] * pi[1, ] * (1 -
pi[1, ]))
gr2 <- with(fisherlist_1, x[2, ] %*% t(x[2, ]) * n[2, ] * pi[2, ] * (1 -
pi[2, ]))
fisher_1 <- gr1 + gr2
covmat_1 <- solve(fisher_1)
covmat_1
## [,1] [,2]
## [1,] 0.006354067 -0.006354067
## [2,] -0.006354067 0.012248298
beta_1 <- fit1$coefficients[2]
sd_1 <- sqrt(covmat_1[2, 2])
wald_1 <- (beta_1/sd_1)^2
wald_1 # the squared z-statistic from the summary table
## wantsMoreyes
## 89.77763
pchisq(wald_1, 1, lower.tail = FALSE)
## wantsMoreyes
## 2.664901e-21
# likelihood ratio test
diff_dev <- fit0$deviance - fit1$deviance
diff_df <- 15 - 14
diff_dev
## [1] 91.6744
pchisq(diff_dev, diff_df, lower.tail = FALSE)
## [1] 1.021784e-21
\(LRT = -2(l(SMALL) - l(BIG))\)
loglik <- function(par, args) {
y <- args$y
x <- args$x
n <- args$n
res <- sum(y * x %*% par - n * log(1 + exp(x %*% par)))
return(res)
}
args0 <- list(y = ds$using, n = ds$using + ds$notUsing, x = cbind(rep(1,
nrow(ds))))
args1 <- list(y = ds$using, n = ds$using + ds$notUsing, x = cbind(rep(1,
nrow(ds)), as.numeric(ds$wantsMore) - 1))
betas0 <- fit0$coefficients
betas1 <- fit1$coefficients
ll0 <- loglik(betas0, args0)
# or
loglik(matrix(c(betas0, 0), ncol = 1), args1)
## [1] -1001.847
# and
ll1 <- loglik(betas1, args1)
lrtest_1 <- -2 * (ll0 - ll1)
lrtest_1
## [1] 91.6744
diff_df <- 15 - 14 # 1 parameter in the first model, 2 in the second, and 16 observations in both
pchisq(lrtest_1, diff_df, lower.tail = FALSE)
## [1] 1.021784e-21
The deviance is given by \(-2(l(model) - l(saturated))\). The difference in the deviances for two models A and B is then \(-2(l_A - l_S) - (-2(l_B - l_S)) = -2l_A + 2l_S + 2l_B - 2l_S = 2l_B - 2l_A = -2(l_A - l_B)\). In our case, fit0 is model A and fit1 is model B.
fit0$deviance - fit1$deviance
## [1] 91.6744
They differ, but give the same conclusion as the critical value is 3.841. Both are assumed to be \(\chi^2\)-distributed with 1 df.
No solution given (explain the content of each model, before seeing diagnostics).
See the module pages on deviance for definition. Degrees of freedom (df) is the difference between number of observations (or groups) and the number of parameters \(\beta\) in your model. Deviance is used for assessing model fit, and model choice.
You can compare all nested models, i.e., models where one is a submodel of the other, using the likelihood-ratio test. They have to be nested, since you test whether a model with fewer parameters (null-hypothesis) is better than others. The smaller model will always have larger (or equal) deviance than the larger model.
ds$Y <- cbind(ds$using, ds$notUsing)
models <- list(
null = glm(Y ~ 1, family = binomial, data = ds),
age = glm(Y ~ age, family = binomial, data = ds),
desire = glm(Y ~ wantsMore, family = binomial, data = ds),
additive = glm(Y ~ age + wantsMore, family = binomial, data = ds),
interact = glm(Y ~ age*wantsMore, family = binomial, data = ds)
)
models
## $null
##
## Call: glm(formula = Y ~ 1, family = binomial, data = ds)
##
## Coefficients:
## (Intercept)
## -0.7746
##
## Degrees of Freedom: 15 Total (i.e. Null); 15 Residual
## Null Deviance: 165.8
## Residual Deviance: 165.8 AIC: 239.3
##
## $age
##
## Call: glm(formula = Y ~ age, family = binomial, data = ds)
##
## Coefficients:
## (Intercept) age25-29 age30-39 age40-49
## -1.5072 0.4607 1.0483 1.4246
##
## Degrees of Freedom: 15 Total (i.e. Null); 12 Residual
## Null Deviance: 165.8
## Residual Deviance: 86.58 AIC: 166.1
##
## $desire
##
## Call: glm(formula = Y ~ wantsMore, family = binomial, data = ds)
##
## Coefficients:
## (Intercept) wantsMoreyes
## -0.1864 -1.0486
##
## Degrees of Freedom: 15 Total (i.e. Null); 14 Residual
## Null Deviance: 165.8
## Residual Deviance: 74.1 AIC: 149.6
##
## $additive
##
## Call: glm(formula = Y ~ age + wantsMore, family = binomial, data = ds)
##
## Coefficients:
## (Intercept) age25-29 age30-39 age40-49 wantsMoreyes
## -0.8698 0.3678 0.8078 1.0226 -0.8241
##
## Degrees of Freedom: 15 Total (i.e. Null); 11 Residual
## Null Deviance: 165.8
## Residual Deviance: 36.89 AIC: 118.4
##
## $interact
##
## Call: glm(formula = Y ~ age * wantsMore, family = binomial, data = ds)
##
## Coefficients:
## (Intercept) age25-29 age30-39
## -1.4553 0.6354 1.5411
## age40-49 wantsMoreyes age25-29:wantsMoreyes
## 1.7643 -0.0640 -0.2672
## age30-39:wantsMoreyes age40-49:wantsMoreyes
## -1.0905 -1.3671
##
## Degrees of Freedom: 15 Total (i.e. Null); 8 Residual
## Null Deviance: 165.8
## Residual Deviance: 20.1 AIC: 107.6
lapply(models,summary)
## $null
##
## Call:
## glm(formula = Y ~ 1, family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3264 -2.4619 -0.7162 2.1273 5.4591
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.77455 0.05368 -14.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.77 on 15 degrees of freedom
## Residual deviance: 165.77 on 15 degrees of freedom
## AIC: 239.28
##
## Number of Fisher Scoring iterations: 4
##
##
## $age
##
## Call:
## glm(formula = Y ~ age, family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5141 -1.5019 0.3857 0.9679 3.5907
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5072 0.1303 -11.571 < 2e-16 ***
## age25-29 0.4607 0.1727 2.667 0.00765 **
## age30-39 1.0483 0.1544 6.788 1.14e-11 ***
## age40-49 1.4246 0.1940 7.345 2.06e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 86.581 on 12 degrees of freedom
## AIC: 166.09
##
## Number of Fisher Scoring iterations: 4
##
##
## $desire
##
## Call:
## glm(formula = Y ~ wantsMore, family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7091 -1.2756 -0.3467 1.4667 3.5505
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18636 0.07971 -2.338 0.0194 *
## wantsMoreyes -1.04863 0.11067 -9.475 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 74.098 on 14 degrees of freedom
## AIC: 149.61
##
## Number of Fisher Scoring iterations: 4
##
##
## $additive
##
## Call:
## glm(formula = Y ~ age + wantsMore, family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7870 -1.3208 -0.3417 1.2346 2.4577
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8698 0.1571 -5.536 3.10e-08 ***
## age25-29 0.3678 0.1754 2.097 0.036 *
## age30-39 0.8078 0.1598 5.056 4.27e-07 ***
## age40-49 1.0226 0.2039 5.014 5.32e-07 ***
## wantsMoreyes -0.8241 0.1171 -7.037 1.97e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 36.888 on 11 degrees of freedom
## AIC: 118.4
##
## Number of Fisher Scoring iterations: 4
##
##
## $interact
##
## Call:
## glm(formula = Y ~ age * wantsMore, family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.67001 -0.85288 0.02621 0.72300 2.18925
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4553 0.2968 -4.903 9.43e-07 ***
## age25-29 0.6354 0.3564 1.783 0.07463 .
## age30-39 1.5412 0.3183 4.842 1.29e-06 ***
## age40-49 1.7643 0.3435 5.136 2.80e-07 ***
## wantsMoreyes -0.0640 0.3303 -0.194 0.84637
## age25-29:wantsMoreyes -0.2672 0.4091 -0.653 0.51366
## age30-39:wantsMoreyes -1.0905 0.3733 -2.921 0.00349 **
## age40-49:wantsMoreyes -1.3672 0.4834 -2.828 0.00468 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 20.099 on 8 degrees of freedom
## AIC: 107.61
##
## Number of Fisher Scoring iterations: 4
df <- data.frame(deviance = round(unlist(lapply(models, deviance)), 2),
df = unlist(lapply(models, df.residual)),
aic = round(unlist(lapply(models, AIC))))
# comparing the model with age + wantsMore (small model) and age + wantsMore + age:wantsMore (large model)
# can use deviances instead of log-likelihood, then we have the formula dev(small)-dev(large). we do not multiply with (-2) now
# we test whether the test statistic is chisq with df = difference in number of parameters. This is the same as the difference in degrees of freedom
dev_small <- df$deviance[4]
dev_large <- df$deviance[5]
df_small <- df$df[4]
df_large <- df$df[5]
lrt <- dev_small - dev_large
df_lrt <- df_small - df_large # 3
qchisq(0.95, df_lrt) # critical value at 5 %
## [1] 7.814728
lrt # much larger than critical value
## [1] 16.79
pchisq(lrt, df_lrt, lower.tail = FALSE) # better with interaction
## [1] 0.0007806146
We want low AIC, so again we choose the interaction-model.
fit3add <- glm(cbind(using, notUsing) ~ age + education + wantsMore,
data = ds, family = binomial)
summary(fit3add)
##
## Call:
## glm(formula = cbind(using, notUsing) ~ age + education + wantsMore,
## family = binomial, data = ds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5148 -0.9376 0.2408 0.9822 1.7333
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8082 0.1590 -5.083 3.71e-07 ***
## age25-29 0.3894 0.1759 2.214 0.02681 *
## age30-39 0.9086 0.1646 5.519 3.40e-08 ***
## age40-49 1.1892 0.2144 5.546 2.92e-08 ***
## educationlow -0.3250 0.1240 -2.620 0.00879 **
## wantsMoreyes -0.8330 0.1175 -7.091 1.33e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 165.772 on 15 degrees of freedom
## Residual deviance: 29.917 on 10 degrees of freedom
## AIC: 113.43
##
## Number of Fisher Scoring iterations: 4
Rodriges: “Additive: The deviance of 29.92 on 10 d.f. tells us that this model does not fit the data, so the assumption that logit differences by one variable are the same across categories of the other two is suspect.”
The saturated model is a model where we have all possible interactions:
summary(glm(cbind(using, notUsing) ~ age * education * wantsMore, data = ds,
family = binomial))
##
## Call:
## glm(formula = cbind(using, notUsing) ~ age * education * wantsMore,
## family = binomial, data = ds)
##
## Deviance Residuals:
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6094 0.3464 -4.646 3.38e-06
## age25-29 0.7309 0.4152 1.760 0.07838
## age30-39 1.7466 0.3841 4.547 5.43e-06
## age40-49 2.5585 0.4854 5.271 1.36e-07
## educationlow 0.6931 0.6856 1.011 0.31199
## wantsMoreyes 0.2041 0.3794 0.538 0.59062
## age25-29:educationlow -0.4565 0.8216 -0.556 0.57852
## age30-39:educationlow -0.7921 0.7232 -1.095 0.27338
## age40-49:educationlow -1.5997 0.7926 -2.018 0.04356
## age25-29:wantsMoreyes -0.3800 0.4705 -0.808 0.41928
## age30-39:wantsMoreyes -1.2833 0.4491 -2.858 0.00427
## age40-49:wantsMoreyes -1.1532 0.7138 -1.615 0.10620
## educationlow:wantsMoreyes -1.4663 0.8243 -1.779 0.07526
## age25-29:educationlow:wantsMoreyes 0.8288 0.9988 0.830 0.40666
## age30-39:educationlow:wantsMoreyes 1.2854 0.8955 1.435 0.15119
## age40-49:educationlow:wantsMoreyes 0.6093 1.1326 0.538 0.59063
##
## (Intercept) ***
## age25-29 .
## age30-39 ***
## age40-49 ***
## educationlow
## wantsMoreyes
## age25-29:educationlow
## age30-39:educationlow
## age40-49:educationlow *
## age25-29:wantsMoreyes
## age30-39:wantsMoreyes **
## age40-49:wantsMoreyes
## educationlow:wantsMoreyes .
## age25-29:educationlow:wantsMoreyes
## age30-39:educationlow:wantsMoreyes
## age40-49:educationlow:wantsMoreyes
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.6577e+02 on 15 degrees of freedom
## Residual deviance: 2.8866e-15 on 0 degrees of freedom
## AIC: 103.51
##
## Number of Fisher Scoring iterations: 3
We have no degrees of freedom here! So we have 0 deviance, but no predictive power.
library(ggplot2)
plotdf <- data.frame(dres = fit3add$residuals, fitted = fit3add$fitted.values,
age = ds$age)
ggplot(plotdf, aes(x = fitted, y = dres)) + geom_point() + labs(x = "Fitted values",
y = "Deviance residuals")
Not any particular trend.
# Same code as in the module pages
frac = ds$using/(ds$using + ds$notUsing)
logitdf = data.frame(lfitted = qlogis(fit3add$fitted), lfrac = qlogis(frac),
age = ds$age, wantsMore = ds$wantsMore, education = ds$education)
ggplot(logitdf, aes(x = age)) + geom_point(aes(y = lfrac, colour = "saturated")) +
geom_point(aes(y = lfitted, colour = "fitted")) + facet_wrap(facets = ~wantsMore *
education) + labs(x = "age", y = "")