\[ f(y_i|\pi_i) = \binom{n_i}{y_i} \pi_i^{y_i}(1-\pi_i)^{n_i-y_i}. \]
\[ L(\beta) = \prod_{i=1}^GL_i(\beta) = \prod_{i=1}^Gf_i(y_i|\pi_i) = \prod_{i=1}^G {{n_i}\choose{y_i}} \pi_i^{y_i}(1-\pi_i)^{n_i-y_i}. \]
\[ l(\beta) = \log L(\beta), \]
\[ l_i(\beta) = y_i \log (\pi_i) + (n_i-y_i) \log (1-\pi_i) + \log{{n_i}\choose{y_i}} \] \[ =y_i \log (\pi_i) - y_i \log (1-\pi_i) + n_i \log(1-\pi_i) + \log{{n_i}\choose{y_i}} \] \[ = y_i \log \bigg(\frac{\pi_i}{1-\pi_i}\bigg) + n_i \log(1-\pi_i)+\log{{n_i}\choose{y_i}} \] \[ = y_i \eta_i - n_i \log (1 + e^{\eta_i}) + \log{{n_i}\choose{y_i}}. \]
\[ l(\beta) = \sum_{i=1}^Gl_i(\beta) = \sum_{i=1}^G\bigg (y_i \eta_i - n_i \log (1 + e^{\eta_i})+\log{{n_i}\choose{y_i}}\bigg ). \]
\[ s(\beta)= \frac{\partial}{\partial\beta}l(\beta) = \sum_{i=1}^G \frac{\partial l_i}{\partial\beta} = \sum_{i=1}^G s_i(\beta), \]
\[ s_i(\beta)= \frac{\partial l_i}{\partial\eta_i} \frac{\partial \eta_i}{\partial\beta} = \bigg[ y_i - n_i \frac {\exp (\eta_i)}{1+\exp(\eta_i)}\bigg]x_i = (y_i-n_i\pi_i)x_i = n_i x_i (\bar y_i - \pi_i), \,\,\,\, \text{where}\,\, \bar{y_i} =\frac{y_i}{n_i} \]
\[ s(\beta)=\sum_{i=1}^G s_i(\beta)=\sum_{i=1}^Gn_ix_i(\bar{y}_i-\pi_i). \]
\[ F_i (\beta) = E[s_i(\beta) s_i(\beta)^T]. \]
\[ F_i (\beta) = E[x_i ( y_i - n_i\pi_i)( y_i- n_i \pi_i) x_i^T] = E[x_i x_i^T ( y_i- n_i\pi_i)^2] \]
\[ = x_i x_i^T E[ ( y_i- n_i\pi_i)^2] = x_i x_i^T Var(y_i) = x_i x_i^T n_i \pi_i (1 - \pi_i) \]
\[ F(\beta) = \sum_{i=1}^G F_i(\beta) = \sum_{i=1}^G[x_i x_i^T n_i \pi_i (1-\pi_i)] \]
\[ \beta^{(t+1)}=\beta^{(t)} + F^{-1}(\beta^{(t)}) s(\beta^{(t)}), \,\,\, t= 0,1,2,\ldots \]
## Fit a grouped binomial logistic regression by Fisher scoring.
##
## Arguments:
##   formula - model formula for the linear predictor; the right-hand side is
##             passed to model.matrix(). The response is taken from data$y
##             (successes) and data$n (group sizes), not from the formula.
##   data    - data frame containing columns `y`, `n`, and the covariates.
##   tol     - convergence tolerance on the squared norm of the score vector
##             (default matches the original hard-coded 1e-10).
##   maxit   - maximum number of scoring iterations; guards against an
##             infinite loop when the algorithm does not converge
##             (e.g. with separated data).
##   verbose - print the coefficient estimates after each update?
##
## Returns a list with components:
##   coef       - coefficient estimates at convergence
##   Fisherinfo - Fisher information matrix evaluated at the estimates
##   pi.hat     - fitted per-group success probabilities
myglm <- function(formula, data, tol = 1e-10, maxit = 100, verbose = TRUE) {
  y <- data$y
  n <- data$n
  X <- model.matrix(formula, data)
  beta <- rep(0, ncol(X))
  converged <- FALSE
  for (iter in seq_len(maxit)) {
    eta <- as.vector(X %*% beta)
    pi <- 1 / (1 + exp(-eta))                 # inverse logit
    # Score s(beta) = sum_i (y_i - n_i * pi_i) x_i
    score <- colSums((y - n * pi) * X)
    # Fisher information F(beta) = t(Z) %*% Z with Z = sqrt(n pi (1-pi)) X;
    # crossprod() computes this in one step.
    Z <- sqrt(n * pi * (1 - pi)) * X
    Fisherinfo <- crossprod(Z)
    if (sum(score^2) < tol) {
      converged <- TRUE
      break
    }
    # Scoring update; solve(F, s) avoids forming the explicit inverse.
    beta <- beta + as.vector(solve(Fisherinfo, score))
    if (verbose) {
      cat(round(beta, 3), "\n")
    }
  }
  if (!converged) {
    warning("Fisher scoring did not converge in ", maxit, " iterations",
            call. = FALSE)
  }
  list(coef = beta, Fisherinfo = Fisherinfo, pi.hat = pi)
}
## Fit the model to the beetle dose-mortality data shipped with investr.
library(investr)
beetle <- investr::beetle
mod <- myglm(~ldose, beetle)
## -37.856 21.337
## -53.853 30.384
## -59.965 33.844
## -60.708 34.265
## -60.717 34.27
Coefficients \(\hat{\beta}\) are asymptotically \(N_p(\beta,F^{-1}(\hat{\beta})).\) Standard error estimates are the square roots of the diagonal elements of the inverse Fisher information matrix.
## Invert the Fisher information to get the asymptotic covariance matrix;
## the standard errors are the square roots of its diagonal.
F.inv <- solve(mod$Fisherinfo)
est.se <- sqrt(diag(F.inv))
round(est.se, 3)
## (Intercept) ldose
## 5.181 2.912
## Reference fit with stats::glm for comparison against myglm().
summary(glm(cbind(y, n - y) ~ ldose, family = binomial, data = beetle))
##
## Call:
## glm(formula = cbind(y, n - y) ~ ldose, family = binomial, data = beetle)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5941 -0.3944 0.8329 1.2592 1.5940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -60.717 5.181 -11.72 <2e-16 ***
## ldose 34.270 2.912 11.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 284.202 on 7 degrees of freedom
## Residual deviance: 11.232 on 6 degrees of freedom
## AIC: 41.43
##
## Number of Fisher Scoring iterations: 4
\[ D=-2(\ln L(\text{candidate model})-\ln L(\text{saturated model}))=-2(l(\hat{\pi})-l(\tilde\pi))= -2\sum_{j=1}^G(l_j(\hat{\pi}_j)-l_j(\tilde{\pi}_j)) \] where \[ l(\hat\pi) = \sum_{i=1}^G y_i \log (\hat \pi_i) + (n_i - y_i) \log(1- \hat \pi_i) \]
and
\[ l(\tilde\pi) = \sum_{i=1}^G y_i \log (\frac{y_i}{n_i}) + (n_i - y_i) \log(1- \frac{y_i}{n_i}) \]
\[ D=2\sum_{j=1}^G [y_j\ln(\frac{y_j}{n_j\hat{\pi}_j})+(n_j-y_j)\ln(\frac{n_j-y_j}{n_j-n_j\hat{\pi}_j})] \]
## Deviance: twice the gap in log-likelihood between the saturated model
## (pi_j = y_j / n_j) and the fitted candidate model.
n <- beetle$n
y <- beetle$y
pi.hat <- mod$pi.hat
log.cand <- dbinom(y, n, pi.hat, log = TRUE)  # log-likelihood, candidate model
log.sat <- dbinom(y, n, y / n, log = TRUE)    # log-likelihood, saturated model
(Deviance <- 2 * sum(log.sat - log.cand))
## [1] 11.23223
Deviance statistic is approximately \(\chi^2_{G-p}\) distributed if the sizes of groups \(n_i,\) \(i = 1, \ldots,G\) are sufficiently large. Larger values in the observed statistic indicate lack of fit.
## 95% critical value of the chi-square reference distribution
## with G - p degrees of freedom.
qchisq(p = 0.95, df = length(n) - length(mod$coef))
## [1] 12.59159
Since the observed \(D = 11.23\) is smaller than the critical value \(\chi^2_{0.95}(8-2) = 12.59,\) we do not reject the hypothesis that the model fits the data well.