# ----------------------------------------------------------------------------------------
# Week 1 (lecture 1)
time <- c(31.7, 39.2, 57.5, 65.0, 65.8, 70.0, 75.0, 75.2, 87.5, 88.3, 94.2, 101.7)
dead <- c(1,1,1,0,1,1,0,0,1,0,0,1)
library(survival)
data1 <- Surv(time,dead)
data1
Rhat <- survfit(data1 ~ 1)
plot(Rhat, mark.time = TRUE, ylab="R(t)")
Rhat
print(Rhat, print.rmean = TRUE)
summary(Rhat)
# ----------------------------------------------------------------------------------------
# Week 1 (lecture 2): asymptotic Weibull distribution of the minimum of positive random variables
nsim <- 5000
n <- 1000 # 100, 1000, ...
x <- rlnorm(n*nsim, sdlog=1)
#x <- rexp(n*nsim)
#x <- rgamma(n*nsim, shape=.5)
x <- matrix(x,nsim,n)
xmin <- apply(x,1,min)
# fit a Weibull distribution to see if it fits (writing and maximising the log likelihood function ourselves)
loglik <- function(par,x) -sum(dweibull(x,shape=par[1],scale=par[2],log=TRUE))
weibfit <- optim(c(1,1),loglik,x=xmin)
weibfit
par(mfrow=c(3,1))
# histogram vs fitted f(x)
hist(xmin,prob=TRUE)
curve(dweibull(x, weibfit$par[1], weibfit$par[2]),add=TRUE)
# estimated vs fitted F(x)
plot(ecdf(xmin),col="red")
curve(pweibull(x, weibfit$par[1], weibfit$par[2]),add=TRUE)
# empirical vs theoretical quantiles (QQ-plot)
plot(qweibull(ppoints(xmin), weibfit$par[1], weibfit$par[2]), # theoretical quantiles
     sort(xmin))                                              # empirical quantiles
abline(a=0,b=1)
# hazard function for the lognormal
zlnorm <- function(t,mu,sigma) dlnorm(t,mu,sigma)/(1-plnorm(t,mu,sigma))
curve(zlnorm(t,0,1),xname="t",0,10)
# fitting the lognormal to data via survreg
time <- c(31.7, 39.2, 57.5, 65.0, 65.8, 70.0, 75.0, 75.2, 87.5, 88.3, 94.2, 101.7)
dead <- c(1,1,1,0,1,1,0,0,1,0,0,1)
library(survival)
data1 <- Surv(time,dead)
Rhat <- survfit(data1 ~ 1)
plot(Rhat)
lnormfit <- survreg(data1 ~ 1, dist="lognormal")
summary(lnormfit)
curve(1-plnorm(x, meanlog = lnormfit$coefficients, sdlog=lnormfit$scale),add=TRUE)
# ----------------------------------------------------------------------------------------
# Week 2 (lecture 1)
# Gumbel as asymptotic distribution of the minimum of iid normals
curve(dnorm(x),-10,2,ylim=c(0,2))
for (n in 10^(1:6)) curve(n*(1-pnorm(x))^(n-1)*dnorm(x),add=TRUE,n=1000)
# Gumbel as asymptotic distribution of the maximum of iid exponentials
curve(dexp(x),0,10,ylim=c(0,1))
for (n in 10^(1:6)) curve(n*(pexp(x))^(n-1)*dexp(x),add=TRUE,n=1000)
# Gumbel as asymptotic distribution of the minimum of iid normals (simulation)
nsim <- 5000
n <- 5000 # 100, 1000, ...
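# Note: the next lines draw nsim samples of size n, arrange them as the rows of an
# nsim x n matrix and keep the row-wise minima; with nsim = n = 5000 the matrix holds
# 25 million doubles (roughly 200 MB), so reduce nsim or n if memory is limited.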
x <- rnorm(n*nsim)      # Gumbel holds asymptotically but note the very slow convergence
#x <- -rexp(n*nsim)     # fast convergence
#x <- rgamma(n*nsim, shape=.5)
x <- matrix(x,nsim,n)
xmin <- apply(x,1,min)
# fit a Gumbel (smallest extreme value) distribution to see if it fits, this time using survreg
gumbfit <- survreg(Surv(xmin) ~ 1, dist="extreme")
pgumbel <- function(q,loc=0,scale=1) 1-exp(-exp((q-loc)/scale))
qgumbel <- function(p,loc=0,scale=1) loc + scale * log(-log(1-p))
dgumbel <- function(x,loc=0,scale=1) {x <- (x-loc)/scale; exp(x-exp(x))/scale}
par(mfrow=c(3,1))
hist(xmin,prob=TRUE,breaks=50)
curve(dgumbel(x,gumbfit$coefficients,gumbfit$scale),add=TRUE)
plot(ecdf(xmin),col="red")
curve(pgumbel(x,gumbfit$coefficients,gumbfit$scale),add=TRUE)
plot(sort(xmin), qgumbel(ppoints(xmin), gumbfit$coefficients,gumbfit$scale), cex=.3)
abline(a=0,b=1)
# Fitting the Weibull via the log-location-scale representation used by survreg
time <- rweibull(n=100,shape=3,scale=10) # usual Weibull parameterization
weibfit2 <- survreg(Surv(time) ~ 1, dist="weibull")
summary(weibfit2)
exp(weibfit2$coefficients) # recovered scale parameter from the Gumbel location
1/weibfit2$scale           # and shape parameter from the Gumbel scale

# Week 2, lecture 2
# First simulate some randomly right censored data
n <- 200
alpha <- 2
T <- rweibull(n,shape=alpha,scale=1)
C <- runif(n,0,2*median(T))
# (Y,delta)-representation
time <- pmin(T,C)
delta <- ifelse(time==T,1,0)
treatment <- sample(0:2,replace = TRUE,size=n)
# Kaplan-Meier estimate
Rhat <- survfit(Surv(time,delta)~1)
print(Rhat, print.rmean=TRUE)
plot(Rhat)
# Kaplan-Meier estimate of the survival function R(t) and confidence intervals
# based on assuming normality of Rhat(t), log(Rhat(t)) and log(-log(Rhat(t)))
par(mfrow=c(3,1))
for (i in c("plain","log","log-log")) {
  Rhat <- survfit(Surv(time,delta)~1, conf.type=i)
  plot(Rhat)
  curve(exp(-x^alpha),add=TRUE,col="green") # true survival function
}
# Estimating E(T) = MTTF and median(T) from the incomplete non-parametric estimate of the survival function
abline(a=.5,b=0,col="blue")
print(Rhat, print.rmean=TRUE)
# Nelson-Aalen estimator of Z(t)
# using the number of failures d_i and number at risk n_i computed via survfit
par(mfrow=c(1,1))
Zhat.NA <- cumsum(Rhat$n.event/Rhat$n.risk)
curve(t^alpha, 0, 2, xname="t")   # true cumulative hazard
plot(Rhat$time, Zhat.NA)          # Nelson-Aalen estimate
Zhat.KM <- -log(Rhat$surv)        # Kaplan-Meier based estimate (of Z(t))
points(Rhat$time, Zhat.KM, col="red")
# ----------------------------------------------------------------------------------------
# Week 3, lecture 2
# TTT-plot (see algorithm in slides 6)
# compute the total time on test for all observations
# First sort in increasing order
n <- length(time)
permutation <- order(time)
time <- time[permutation]
delta <- delta[permutation]
Tau <- n*time[1] # total time on test at all censoring and failure times
for (i in 2:n) Tau[i] <- Tau[i-1] + (n-i+1)*(time[i]-time[i-1])
# Compute new Y's containing the TTT at the times of failure.
# Under H0 of exponentiality these are arrival times in a homogeneous Poisson process
Y <- Tau[delta==1]
k <- length(Y)
i <- 1:(k-1)
plot(i/k, Y[i]/Y[k], xlim=c(0,1), ylim=c(0,1))
# hence they behave as order statistics of k-1 iid unif(0,1) variates
# Barlow-Proschan statistic
W <- sum(Y[i]/Y[k])
Z <- (W-(k-1)/2)/sqrt((k-1)/12)
Z
2*pnorm(abs(Z),lower.tail=FALSE)
# ----------------------------------------------------------------------------------------
# Week 4
# log-rank test
library(survival)
library(MASS)
attach(Melanoma)
Melanoma
data <- Surv(time,ifelse(status==1,1,0))
plot(survfit(data~sex),conf.int=TRUE,col=rep(c(2,1),each=3))
legend("topright",c("males","females"),col=c(1,2),lty=1)
survdiff(data ~ sex)
summary(coxph(data ~ sex))
survdiff(Surv(time,delta)~treatment)
# example in Moore, p. 46
tt <- c(6, 7, 10, 15, 19, 25)
delta <- c(1, 0, 1, 1, 0, 1)
trt <- c(0, 0, 1, 0, 1, 1)
survdiff(Surv(tt, delta) ~ trt)
# the same computed "manually"
d0 <- c(1,0,1,0)
e0 <- c(.5,.25,.3333333,0)
v0 <- c(.25,.1875,.222222,0)
(sum(d0-e0))^2/sum(v0)
# difference between the approximation based on the (multi)hypergeometric distribution
# and the alternative test statistic in slides 6
library(survival)
n <- 20
k <- 2
treatment <- sample(1:k,replace = TRUE,size=n)
C <- runif(n,0,5)
alpha <- 1
nsim <- 10000
chisq <- data.frame(hypergeom=double(nsim),multinom=double(nsim))
colnames(chisq) <- c("hypergeom","multinom")
for (i in 1:nsim) {
  T <- rweibull(n,shape=alpha,scale=1)
  time <- pmin(T,C)
  delta <- ifelse(time==T,1,0)
  sdiff <- survdiff(Surv(time,delta)~treatment)
  chisq$hypergeom[i] <- sdiff$chisq
  chisq$multinom[i] <- with(sdiff,sum((obs-exp)^2/exp))
}
par(mfcol=c(3,2))
for (type in names(chisq)) {
  plot(qchisq(ppoints(nsim),df=k-1),sort(chisq[[type]]),pch=".",ylab=type)
  abline(a=0,b=1)
  plot(ecdf(chisq[[type]]))
  curve(pchisq(x,df=k-1),add=TRUE,col="green")
  hist(chisq[[type]],breaks=50,prob=TRUE,xlim=c(0,12),main=type)
  curve(dchisq(x,df=k-1),add=TRUE)
  cat("type I error rate for",type,"approximation:",mean(chisq[[type]]>qchisq(.95,df=k-1)),"\n")
}
# ----------------------------------------------------------------------------------------
# Week 4, lecture 2: maximum likelihood
# Right censored observations from the exponential model
n <- 100
tt <- rexp(n)              # scale parameter theta=1
y <- ifelse(tt>1, 1, tt)   # right censoring at t=1
delta <- ifelse(tt>1, 0, 1)
par(mfrow=c(2,1))
matplot(rbind(0,y),rbind(1:n,1:n),type="l",lty=1,col=1,xlab="time",ylab="unit")
points(y,1:n,pch=ifelse(delta==1,"x","|"))
Rhat <- survfit(Surv(y,delta)~1)
plot(Rhat)
print(Rhat, print.rmean=TRUE)
s <- sum(y)       # total time on test
r <- sum(delta)   # number of failures
thetahat <- s/r   # MLE of theta
se <- thetahat/sqrt(r)
# ----------------------------------------------------------------------------------------
# Week 5, lecture 1: many confidence intervals and one credible interval
# 1) Confidence interval based on asymptotic normality of thetahat
c(thetahat - 1.96*se, thetahat + 1.96*se) # conf. interval for MTTF = ET = theta
# 2) survreg fits the exponential via its log-location-scale representation,
# involving a location parameter mu equal to the log of the scale parameter theta
# of the exponential. The confidence interval for mu = log(theta) is computed
# based on the observed Fisher information and normality of muhat. This can
# then be backtransformed to a confidence interval for theta as follows
exp(confint(survreg(Surv(y,delta) ~ 1, dist="exponential")))
# 3) It is no coincidence that this is exactly the same interval as the one described in
# slides no. 8 (based on asymptotic normality of log(thetahat), with the variance
# obtained from var(thetahat) by the delta method)
thetahat*exp(c(-1,1)*qnorm(.975)/sqrt(r))
# 4) Likelihood based confidence interval (uses 2(l(thetahat)-l(theta)) as pivot)
l <- function(theta) -r*log(theta)-s/theta
ldiff <- function(theta) l(theta) - l(thetahat) + qchisq(.95,df=1)/2
c(uniroot(ldiff,c(0.1,thetahat))$root, uniroot(ldiff,c(thetahat,10))$root)
# 5) Exact interval for type II censoring (the experiment ends at the r'th failure, r prespecified).
# This doesn't really apply here because the censoring time (and not r) is prespecified
2*thetahat*r/qchisq(c(.975,.025),df=2*r)
# 6) Bayesian credible interval obtained via Bayes' theorem using a
# non-informative scale prior on theta (or, equivalently, on the rate lambda).
# This applies regardless of the censoring mechanism, but note the difference in meaning!
2*s/qchisq(c(.975,.025),df=2*r)
# ----------------------------------------------------------------------------------------
# Week 5, lecture 2
# Left censoring, Baboon example, see p. 189 in Moore 2016.
library(ssym)
data("Baboons")
n <- nrow(Baboons)
par(mfrow=c(2,1))
with(Baboons,{
  plot(NA,xlim=c(0,24),ylim=c(1,n))
  matlines(rbind(0,t),rbind(1:n,1:n),col=1,lty=1)
  points(t,1:nrow(Baboons),pch=ifelse(cs==0,"x","<"))
})
Baboons <- within(Baboons,{
  t.right <- t
  t.left <- t
  t.left[cs==1] <- .0001   # left censored observations
  surv <- Surv(t.left,t.right,type="interval2")
})
# First compute a non-parametric estimate (this is outside the scope of the course)
source("https://bioconductor.org/biocLite.R")
biocLite("Icens")
library(interval)
plot(icfit(surv ~ 1, data=Baboons),xlim=c(0,24))
# Next fit a parametric model to the censored data
weibfit <- survreg(surv~1,dist="weibull", Baboons)
curve(1-pweibull(x,scale=exp(weibfit$coefficients),shape=1/weibfit$scale),add=TRUE)
# Left truncated data, example from Moore 2016, section 3.5
tt <- c(7, 6, 6, 5, 2, 4)
delta <- c(0, 1, 0, 0, 1, 1)
backTime <- c(-2, -5, -3, -3, -2, -5)
tm.enter <- -backTime
tm.exit <- tt - backTime
n <- length(tt)
matplot(rbind(tm.enter,tm.exit),rbind(1:n,1:n),type="l",col=1,lty=1,xlim=c(0,max(tm.exit)))
points(tm.exit,1:n,pch=ifelse(delta==1,"x","|"))
points(tm.enter,1:n,pch="(")
# Non-parametric models: Kaplan-Meier estimate of R(t)
Rhat <- survfit(Surv(tm.enter, tm.exit, delta, type="counting") ~ 1)
plot(Rhat)
summary(Rhat)
# Parametric models: survreg in the survival package doesn't handle left truncation
# (nor does MINITAB), but luckily a Swede wrote a package that does.
# This package (eha) uses yet another parameterization of the Weibull
# (the logs of the scale and shape)
library(eha)
weibfit <- aftreg(Surv(tm.enter, tm.exit, delta, type="counting") ~ 1, dist="weibull")
summary(weibfit)
curve(1-pweibull(x,scale=exp(weibfit$coefficients[1]),shape=exp(weibfit$coefficients[2])),add=TRUE)
# ----------------------------------------------------------------------------------------
# Feb 20: Parametric inference for Weibull models with right censored data
gastricXelox <- read.csv("https://www.math.ntnu.no/~jarlet/levetid/gastricXelox.csv")
data <- Surv(gastricXelox$timeWeeks,gastricXelox$delta)
weibfit <- survreg(data ~ 1,dist="weibull")
summary(weibfit)
# our own implementation of the log likelihood function
weib.loglik <- function(par, data) {
  theta <- par[1]
  alpha <- par[2]
  y <- data[,"time"]
  delta <- data[,"status"]
  r <- sum(data[,"status"])
  r*log(alpha) - r*alpha*log(theta) + (alpha-1)*sum(log(y[delta==1])) - sum((y/theta)^alpha)
}
# verify that maximising this yields the same results
opt <- optim(c(1,1),weib.loglik, data=data, control=list(fnscale=-1), hessian=TRUE)
opt$par
c(exp(weibfit$coefficients), 1/weibfit$scale)
# standard errors
sqrt(diag(solve(-opt$hessian)))
# Estimate of ET
EThat <- exp(weibfit$coefficients)*gamma(weibfit$scale + 1)
# Asymptotic estimate of the variance-covariance matrix of the parameters;
# note that this involves yet another parameterization (in terms of the log scale)
vcov(weibfit)
# Delta method
# derivatives of ET with respect to mu and log(scale)
der <- c(EThat, EThat*digamma(weibfit$scale + 1)*weibfit$scale)
varEThat <- t(der) %*% vcov(weibfit) %*% der
seEThat <- sqrt(varEThat) # standard error of EThat
seEThat
print(survfit(data~1),print.rmean=TRUE) # seems reasonable compared to the K-M estimate
# Testing H0: alpha=1 (exponential model) vs H1: alpha != 1
expfit <- survreg(data ~ 1,dist="exponential")
W <- 2*(logLik(weibfit)-logLik(expfit))   # test statistic
W
qchisq(.95, df = 2 - 1)                   # critical value
pchisq(W,df = 1,lower.tail = FALSE)       # p-value
# profile confidence interval for the shape parameter alpha
weib.profile.loglike <- function(alpha, data) {
  y <- data[,"time"]
  delta <- data[,"status"]
  r <- sum(delta)
  theta <- (sum(y^alpha)/r)^(1/alpha)     # mle of theta given alpha
  weib.loglik(c(theta,alpha), data=data)  # log lik. maximised w.r.t. theta (a function of alpha alone)
}
alpha <- seq(.5,1.5,len=100)
ploglik <- numeric(100)
for (i in seq_along(alpha)) {
  ploglik[i] <- weib.profile.loglike(alpha[i], data=data)
}
plot(alpha,ploglik,type="l")
abline(b=0, a=opt$value - qchisq(.95,df=1)/2)
# 2-d (1-alpha) likelihood based confidence set
x <- seq(20, 200, len=100)
y <- seq(.5,2, len=100)
z <- matrix(NA,100,100)
for (i in 1:100) for (j in 1:100) z[i,j] <- weib.loglik(c(x[i],y[j]),data=data)
confset <- contourLines(x, y, z, levels=opt$value - qchisq(0.05, df=2, lower.tail=FALSE)/2)[[1]]
plot(confset$x,confset$y)
# ---------------------------------------------------------------------------------------
# Febr. 23: Ex. cont.
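# (If continuing in a fresh R session, the objects from Feb 20 used below can be recreated:
#  library(survival)
#  gastricXelox <- read.csv("https://www.math.ntnu.no/~jarlet/levetid/gastricXelox.csv")
#  data <- Surv(gastricXelox$timeWeeks, gastricXelox$delta) )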
# Weibull probability plot
weibprobplot <- function(data) {
  Rhat <- survfit(data~1)
  r <- sum(data[,"status"])
  subset <- data[,"status"]==1
  Rhathat <- (Rhat$surv[subset]+c(1,Rhat$surv[subset][-r]))/2
  tfailure <- Rhat$time[subset]
  plot(log(tfailure), log(-log(Rhathat)))
}
# the plot for the gastricXelox data
weibprobplot(data)
# the plot for data that is truly Weibull
time <- rweibull(100,shape=3)
weibprobplot(Surv(time))
survreg.distributions$lognormal
# General probability plot for any derived distribution available in survreg,
# using functions available in survreg.distributions (see its help page for details)
probabilityplot <- function(data,dist="weibull", n=20, level=.05) {
  Rhat <- survfit(data~1)           # compute the K-M estimate of R(t)
  subset <- data[,"status"]==1      # subset of observed failures
  r <- sum(subset)                  # number of observed failures
  Rhathat <- (Rhat$surv[subset]+c(1,Rhat$surv[subset][-r]))/2  # adjusted K-M estimate at each failure time
  tfailure <- Rhat$time[subset]     # times of the observed failures
  parent <- if (is.function(dist)) dist$dist else survreg.distributions[[dist]]$dist
                                    # parent distribution (gumbel, normal or logistic)
  quantileParent <- survreg.distributions[[parent]]$quantile   # quantile function of the parent distribution
  plot(log(tfailure), quantileParent(1-Rhathat), xlab=expression(log(t[(i)])),
       ylab="parent quantile", main=paste("Probability plot for",dist))  # make the plot
  fit <- survreg(data~1, dist=dist)
  a <- -fit$coefficients/fit$scale
  b <- 1/fit$scale
  abline(a,b)
  lntt <- seq(min(log(tfailure)),max(log(tfailure)),len=n)
  yhat <- a + b*lntt
  der <- cbind(-1/fit$scale, -yhat)
  vcov <- vcov(fit)
  if (nrow(vcov)==1) vcov <- matrix(c(as.vector(vcov),0,0,0),2,2)
  se <- sqrt(diag(der %*% vcov %*% t(der)))
  for (sign in c(-1,1)) lines(lntt, yhat+sign*qnorm(level)*se, col="red")
}
par(mfrow=c(3,2))
probabilityplot(data,"loglogistic")
probabilityplot(data,"lognormal")
probabilityplot(data,"weibull")
probabilityplot(data,"exponential")
probabilityplot(data,"rayleigh")
# Threshold Weibull model
# Pike (1966) cancer data
y <- c(143, 163, 188, 188, 190, 192, 206, 209, 213, 216, 220, 227, 230, 234, 246, 265, 304, 216, 244)
delta <- rep(c(1,0),c(17,2))
data <- Surv(y,delta)
data
par(mfrow=c(2,1))
# profile likelihood for gamma (work with the log of min(y)-gamma to avoid numerical underflow)
log.ymin.gamma <- seq(5,-25,length=100)
profilegamma <- numeric(100)
for (i in seq_along(profilegamma)) {
  data.shifted <- Surv(y - min(y) + exp(log.ymin.gamma[i]), delta)
  profilegamma[i] <- logLik(survreg(data.shifted ~ 1))
}
plot(min(y)-exp(log.ymin.gamma),profilegamma,type="l",xlab="gamma",xlim=c(0,min(y)+.1))
abline(v=min(y),lty=3)
plot(log.ymin.gamma,profilegamma,type="l")
# profile log likelihood l(alpha,gamma), maximising over the scale parameter theta
gridres <- 200
alpha <- seq(.1,3,len=gridres)
gamma <- seq(.8*min(y),min(y)-.001,len=gridres)
l <- matrix(NA,gridres,gridres)
for (i in seq_along(gamma)) for (j in seq_along(alpha)) {
  scale <- (sum((y - gamma[i])^alpha[j])/length(y))^(1/alpha[j])  # mle of theta given gamma and alpha
  l[i,j] <- sum(dweibull(y-gamma[i], shape=alpha[j], scale=scale, log=TRUE))
}
contour(gamma,alpha,l,xlab="gamma", ylab="alpha",levels=seq(max(l),by=-1,len=100))
lines(c(min(y),min(y)),c(0,1),col="red")
# Bayesian inference with scale priors on the shape and scale
# and a uniform prior on the location parameter gamma.
# Reparameterize in terms of log(shape), log(scale) and log(y_min - gamma)
library(MCMCpack)
logposterior <- function(par,y) {
  gamma <- min(y) - exp(par[3])
  sum(dweibull(y-gamma,exp(par[1]),exp(par[2]),log=TRUE)) + par[3]
}
y <- rweibull(100,shape=.8,scale=10) + 1
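# Optional sanity check: the log posterior should be finite at the starting value
# used for the Metropolis chains below.
logposterior(rep(0,3), y=y)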
chain0 <- MCMCmetrop1R(logposterior, rep(0,3), y=y, V=.01*diag(3))
chain <- MCMCmetrop1R(logposterior, rep(0,3), y=y, V=var(chain0))
plot(exp(chain))
summary(exp(chain))
plot(chain)
# Survival regression
library(survival)
library(MASS)
attach(Melanoma)
Melanoma
head(Melanoma,15)
data <- Surv(time,ifelse(status==1,1,0))
plot(survfit(data~sex),col=rep(c(2,1),each=1))
legend("topright",c("males","females"),col=c(1,2),lty=1)
survdiff(data ~ sex)
sreg <- survreg(data ~ sex, dist="weibull")
summary(sreg)
exp(9.11)*gamma(.912+1)        # ET for females (sex=0)
exp(9.11 - 0.61)*gamma(.912+1) # ET for males (sex=1)
predict(sreg, newdata=data.frame(sex=0:1))
sreg0 <- survreg(data ~ 1, dist="weibull")
# likelihood ratio test
W <- 2*(logLik(sreg)-logLik(sreg0))
pchisq(W, df=1, lower.tail=FALSE)
drop1(sreg, test="Chisq")
# more covariates
sreg2 <- survreg(data ~ sex + thickness + age, dist="weibull")
summary(sreg2)
drop1(sreg2, test="Chisq")
# Example loosely based on Moore ch. 6.3
library(asaur)
data(pharmacoSmoking)
attach(pharmacoSmoking)
# levels of ageGroup4 and employment
class(employment)
employment
levels(ageGroup4)
# how to change the reference category with relevel
ageGroup4 <- relevel(ageGroup4, ref="65+")
ageGroup4 <- relevel(ageGroup4, ref="21-34")
ageGroup4
# Model A with ttr (time in days until relapse, relapse status) as response and ageGroup4 as covariate
modelA <- survreg(Surv(ttr+.5, relapse) ~ ageGroup4, dist="weibull")
summary(modelA)
# the model matrix (dummy encoding of the four levels of ageGroup4)
head(model.matrix(~ ageGroup4 + employment))
# how to change the contrasts used
contrasts(employment) <- "contr.sum"        # used by Mette in Linear Statistical Methods
contrasts(employment)
contrasts(employment) <- "contr.treatment"  # the default
contrasts(employment)
# Model B obtained by instead including employment (full-time, part-time or other) as the covariate
modelB <- survreg(Surv(ttr+.5,relapse) ~ employment, dist="weibull")
summary(modelB)
# Model C including both ageGroup4 and employment as covariates
modelC <- survreg(Surv(ttr+.5,relapse) ~ ageGroup4 + employment, dist="weibull")
summary(modelC)
# predicted lifetime for a subject in the first age group (21-34) with employment = part-time
# Testing A vs C
pchisq(2*(logLik(modelC) - logLik(modelA)), df=2, lower.tail = FALSE)
# Testing B vs C
pchisq(2*(logLik(modelC) - logLik(modelB)), df=3, lower.tail = FALSE)
# The same with anova
anova(modelA, modelC)
# The same with drop1
drop1(modelC, test="Chisq")
# AIC
extractAIC(modelA)
extractAIC(modelB)
extractAIC(modelC)
# automatic model selection by AIC using a stepwise algorithm (use with caution)
modelAll <- survreg(Surv(ttr+.5, relapse) ~ grp + gender + race + employment + yearsSmoking +
                      levelSmoking + ageGroup4 + ageGroup2 + priorAttempts + longestNoSmoke,
                    dist="weibull")
result.step <- step(modelAll)
# Probability plot version 2. The input x is a fitted survreg object, possibly with covariates
probabilityplot2 <- function(x, n=20, level=.05, ...)
{
  # first compute standardized residuals for all failures and censoring events
  sresid <- (x$y[,1] - x$linear.predictors)/x$scale
  Rhat <- survfit(Surv(sresid, x$y[,2])~1)   # K-M estimate based on the standardized residuals
  subset <- x$y[,2]==1                       # subset of units that failed
  r <- sum(subset)                           # number of observed failures
  Rhathat <- (Rhat$surv[subset]+c(1, Rhat$surv[subset][-r]))/2  # adjusted K-M estimate at each failure time
  u <- Rhat$time[subset]                     # standardized residuals for the observed failures
  parent <- if (is.function(x$dist)) x$dist$dist else survreg.distributions[[x$dist]]$dist
                                             # parent distribution (gumbel, normal or logistic)
  quantileParent <- survreg.distributions[[parent]]$quantile   # quantile function of the parent distribution
  plot(u, quantileParent(1-Rhathat), xlab="standardized residual", ylab="parent quantile",
       main=paste("Probability plot of standardized residuals for",x$dist),...)  # make the plot
  abline(a=0,b=1)
}
par(mfrow=c(2,2))
probabilityplot2(result.step,ylim=c(-4,1),xlim=c(-3,1))
probabilityplot2(modelA,ylim=c(-4,1),xlim=c(-3,1))
probabilityplot2(survreg(Surv(ttr+.5,relapse)~1,dist="weibull"),ylim=c(-4,1),xlim=c(-3,1))

## Monday March 6: Cox proportional hazards model
summary(modelA)
modelA.coxph <- coxph(Surv(ttr,relapse) ~ ageGroup4)
modelA.coxph
# Simple toy example
y <- c(5,10,40,80,120,400,600)
delta <- c(0,1,0,0,1,1,0)
x <- c(12,10,3,5,3,4,1)
# fit the model using coxph in the survival package
# Find the MLE by plotting the log likelihood for -1 < beta < 8
L <- function(beta)
  exp(10*beta)/(exp(10*beta) + exp(3*beta) + exp(5*beta) + exp(3*beta) + exp(4*beta) + exp(beta))*
  exp(3*beta)/(exp(3*beta) + exp(4*beta) + exp(beta))*
  exp(4*beta)/(exp(4*beta) + exp(beta))
l <- function(beta) log(L(beta))
curve(l, -1, 8)
fit <- optim(0, l, control=list(fnscale=-1), hessian=TRUE)
fit$par
# check the standard errors
sqrt(1/(-fit$hessian))
# plot the confidence limits
abline(h = fit$value - qchisq(.95,df=1)/2)
# verify that the regression coefficient is approximately that of the Weibull model times -alpha
confint(coxph(Surv(y,delta) ~ x))

# Thursday, March 9
# Prop. haz. property of the Weibull model
theta <- c(1,2)
alpha <- 1.5
par(mfcol=c(2,2))
curve(dweibull(x,alpha,theta[1]),0,5,ylab="density")
curve(dweibull(x,alpha,theta[2]),add=TRUE,col="red")
curve(dweibull(x,alpha,theta[1])/(1-pweibull(x,alpha,theta[1])),0,5,ylab="hazard")
curve(dweibull(x,alpha,theta[2])/(1-pweibull(x,alpha,theta[2])),add=TRUE,col="red")
# Non-prop. haz. property of the lognormal
sigma <- .2
mu <- c(0,1)
curve(dlnorm(x,mu[1],sigma),0,5,ylab="density")
curve(dlnorm(x,mu[2],sigma),add=TRUE,col="red")
curve(dlnorm(x,mu[1],sigma)/(1-plnorm(x,mu[1],sigma)),0,5,ylab="hazard")
curve(dlnorm(x,mu[2],sigma)/(1-plnorm(x,mu[2],sigma)),add=TRUE,col="red")
# Cox prop. haz. model vs Weibull regression:
# loss of power from the non-parametric part of the Cox model
set.seed(1)
x <- rnorm(10)
y <- rweibull(length(x), scale=exp(.5*x),shape=2) # true prop. haz. regression coefficient = -shape*0.5 = -1
summary(weibreg <- survreg(Surv(y) ~ x, dist="weibull"))
(coxreg <- coxph(Surv(y) ~ x))
betatilde <- -weibreg$coefficients["x"]/weibreg$scale
der <- t(c(0, -1/weibreg$scale, -betatilde))
varbetatilde <- der %*% vcov(weibreg) %*% t(der)
sqrt(varbetatilde)
# about 36% information loss for n=10
varbetatilde/vcov(coxreg)

# Monday, March 13
# Example 2: Comparing two groups: lifetimes of two batches of sodium sulphur batteries
cat("y,delta,x
164,1,0
164,1,0
218,1,0
230,1,0
263,1,0
467,1,0
538,1,0
639,1,0
669,1,0
917,1,0
1148,1,0
1678,0,0
1678,0,0
1678,0,0
76,0,1
82,0,1
210,1,1
315,1,1
385,1,1
412,1,1
491,1,1
504,1,1
522,1,1
646,0,1
678,1,1
775,1,1
884,1,1
1131,1,1
1456,1,1
1824,1,1
1827,1,1
2248,1,1
2385,1,1
3077,1,1",file="/tmp/tmp.dat")
data <- read.table("/tmp/tmp.dat",header=TRUE,sep=",")
attach(data)
par(mfrow=c(2,1))
## Separate K-M estimates
plot(survfit(Surv(y,delta)~x))
## Log-rank test
survdiff(Surv(y,delta)~x)
## Cox PH model
coxreg <- coxph(Surv(y,delta) ~ x, ties="exact") # ties = "exact"/"breslow"
coxreg
## significance of x via z-test
summary(coxreg)
## significance of x via drop1( , test="Chisq") (compare with the log-rank test)
drop1(coxreg, test="Chisq")
# Obtaining an estimate of the baseline hazard given betahat
summary(survfit(coxreg))
plot(survfit(coxreg))
# Obtaining estimates of R(t|x) = R_0(t)^exp(beta*x) for x = 0 and x = 1
plot(survfit(coxreg, newdata=data.frame(x=0:1)))
# Cox-Snell residuals, defined as
#   V_i = Zhat(y_i),
# are not directly available in R but these can be computed via the
# martingale residuals (the default), which are defined as
#   M_i = delta_i - Zhat(y_i)
# where delta_i is the censoring indicator
coxsnellresid <- delta - resid(coxreg,type="martingale")
# generalized Cox-Snell residuals: replace each censored Cox-Snell residual
# by its conditional expectation
coxsnellresid <- coxsnellresid + (1-delta)
hist(coxsnellresid)
# qq-plot (empirical vs theoretical quantiles should fall on a straight line)
plot(sort(coxsnellresid),qexp(ppoints(coxsnellresid)))
abline(a=0,b=1)
# log minus log plot to inspect the proportional hazards assumption
Rhat <- survfit(Surv(y,delta) ~ x) # this contains _two_ survival curves
# These can be subscripted to produce estimates of the cumulative hazard
# for each level of x as follows:
plot(Rhat[1]$time,log(-log(Rhat[1]$surv)), ylim=c(-3,1),xlim=c(0,3000),type="s")
points(Rhat[2]$time,log(-log(Rhat[2]$surv)),col=2,type="s")
# vs log(t) instead of vs t
plot(log(Rhat[1]$time),log(-log(Rhat[1]$surv)), ylim=c(-3,1),type="s")
points(log(Rhat[2]$time),log(-log(Rhat[2]$surv)),col=2,type="s")

# March 14: martingale, deviance, Schoenfeld and Cox-Snell residuals
# # # # # # # # # # # # # # #
# Section 7.1.1 martingale and deviance residuals
# # # # # # # # # # # # # #
library(asaur)
attach(pharmacoSmoking)
priorAttemptsT <- priorAttempts
priorAttemptsT[priorAttempts > 20] <- 20
library(survival)
result.0.coxph <- coxph(Surv(ttr, relapse) ~ 1)
rr.0 <- residuals(result.0.coxph, type="martingale")
# Figure 7.1
# First, we need the "smoothSEcurve" function (also in the Appendix)
smoothSEcurve <- function(yy, xx) {
  # use after a call to "plot"
  # fit a lowess curve and 95% confidence interval curve
  xx.list <- min(xx) + ((0:100)/100)*(max(xx) - min(xx)) # make a list of x values
  # Then fit a loess function through the points (xx, yy) at the listed values
  yy.xx <- predict(loess(yy ~ xx), se=T,
                   newdata=data.frame(xx=xx.list))
  lines(yy.xx$fit ~ xx.list, lwd=2)
  lines(yy.xx$fit - qt(0.975, yy.xx$df)*yy.xx$se.fit ~ xx.list, lty=2)
  lines(yy.xx$fit + qt(0.975, yy.xx$df)*yy.xx$se.fit ~ xx.list, lty=2)
}
# Now the plot
par(mfrow=c(3,2))
plot(rr.0 ~ age)
smoothSEcurve(rr.0, age)
title("Martingale residuals\nversus age")
logAge <- log(age)
plot(rr.0 ~ logAge)
smoothSEcurve(rr.0, logAge)
title("Martingale residuals\nversus log age")
plot(rr.0 ~ priorAttemptsT)
smoothSEcurve(rr.0, priorAttemptsT)
title("Martingale residuals versus\nprior attempts")
logPriorAttemptsT <- log(priorAttemptsT + 1)
plot(rr.0 ~ logPriorAttemptsT)
smoothSEcurve(rr.0, logPriorAttemptsT)
title("Martingale residuals versus\nlog prior attempts")
plot(rr.0 ~ longestNoSmoke)
smoothSEcurve(rr.0, longestNoSmoke)
# Note that "\n" is a "newline" indicator
title("Martingale residuals versus\nlongest period without smoking")
logLongestNoSmoke <- log(longestNoSmoke+1)
plot(rr.0 ~ logLongestNoSmoke)
smoothSEcurve(rr.0, logLongestNoSmoke)
title("Martingale residuals versus\nlog of longest period without smoking")
# # # # #
result.grp.coxph <- coxph(Surv(ttr, relapse) ~ grp)
result.step <- step(result.grp.coxph,
                    scope=list(upper=~ grp + gender + race + employment + yearsSmoking +
                                 levelSmoking + age + priorAttemptsT + logLongestNoSmoke,
                               lower=~grp))
result.step
# Figure 7.2
rr.final <- residuals(result.step, type="martingale")
par(mfrow=c(2,2))
plot(rr.final ~ age)
smoothSEcurve(rr.final, age)
title("Martingale residuals\nversus age")
plot(rr.final ~ grp)
title("Martingale residuals\nversus treatment group")
plot(rr.final ~ employment)
title("Martingale residuals\nversus employment")
par(mfrow=c(2,2))
cox.zph(result.step)
plot(cox.zph(result.step))
# # # # # # # # # # # # # # #
# Section 7.2.1 checking the proportional hazards assumption; log cumulative hazard plots
# # # # # # # # # # # # # #
attach(pancreatic2)
pfs.month <- pfs/30.25
result.surv.LA <- survfit(Surv(pfs.month) ~ stage, subset={stage == "LA"})
time.LA <- result.surv.LA$time
surv.LA <- result.surv.LA$surv
cloglog.LA <- log(-log(surv.LA))
logtime.LA <- log(time.LA)
result.surv.M <- survfit(Surv(pfs.month) ~ stage, subset={stage == "M"})
time.M <- result.surv.M$time
surv.M <- result.surv.M$surv
cloglog.M <- log(-log(surv.M))
logtime.M <- log(time.M)
# Fig 7.4
windows()
plot(cloglog.LA ~ logtime.LA, type="s", col="blue", lwd=2,
     xlab="Log time", ylab="Complementary log-log survival")
lines(cloglog.M ~ logtime.M, col="red", lwd=2, type="s")
legend("bottomright", legend=c("Locally advanced", "Metastatic"),
       col=c("blue","red"), lwd=2)
# # # # # # # # # # # # # # #
# Section 7.2.2 checking the proportional hazards assumption; Schoenfeld residuals
# # # # # # # # # # # # # #
result.coxph <- coxph(Surv(pfs.month) ~ stage)
result.sch.resid <- cox.zph(result.coxph, transform="km")
# Fig 7.5
plot(result.sch.resid)
result.sch.resid
cox.zph(result.coxph, transform="rank")
cox.zph(result.coxph, transform="identity")

# Thursday, March 23
# Accelerated life testing, Insulation data. Deterioration of insulation used for
# electric motors normally running at temperatures of 80 to 100 degrees Centigrade.
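# The Arrhenius model takes the log lifetime to be linear in the inverse absolute
# temperature, which is why 1/(Temp + 273.16) is used as the stressor below:
# in survreg's log-location-scale form, location = beta0 + beta1/(Temp + 273.16).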
library(survival)
data <- read.table("https://www.math.ntnu.no/emner/TMA4275/2017v/datasett/Insulate.txt", header=TRUE)
attach(data)
inverseTemp <- 1/(Temp + 273.16)   # inverse absolute temperature (1/kelvin)
delta <- ifelse(Censor=="F",1,0)
# Fit a Weibull regression with the inverse of the temperature in kelvin as stressor
# (the Arrhenius model)
weibreg <- survreg(Surv(FailureT,delta) ~ inverseTemp, dist="weibull")
summary(weibreg)
vcov(weibreg)
# Then compute predicted values of the median and associated 95% confidence bands
# at different temperatures (Relation Plot in MINITAB / slides 13)
tempRange <- seq(80,170,by=5)
pred <- predict(weibreg, type="uquantile", p=.5,
                newdata=data.frame(inverseTemp=1/(273.16 + tempRange)), se.fit=TRUE)
plot(tempRange,pred$fit,type="l")
lines(tempRange, pred$fit + 1.96*pred$se, lty=2)
lines(tempRange, pred$fit - 1.96*pred$se, lty=2)
# Transform to the scale of the response
plot(tempRange,exp(pred$fit),type="l")
lines(tempRange, exp(pred$fit + 1.96*pred$se), lty=2)
lines(tempRange, exp(pred$fit - 1.96*pred$se), lty=2)
# Compute the deviance "by hand"
thetahat <- exp(weibreg$linear.predictors)
alpha <- 1/weibreg$scale
y <- FailureT
D <- 2*sum((y/thetahat)^alpha - delta*(1+alpha*log(y/thetahat)))
D
# the deviance computed from the deviance residuals
res <- residuals(weibreg, type="deviance")
sum(res^2)
# Assessing the Arrhenius assumption by plotting deviance residuals against the stressor
res <- residuals(weibreg, type="deviance")
plot(res~inverseTemp)
smoothSEcurve(res, inverseTemp)
# Assessing the Weibull and Arrhenius assumptions using probability plots (as in slides 13)
Rhat <- survfit(Surv(FailureT,delta) ~ inverseTemp) # K-M estimates of R(t) for each unique temperature
plot(NA,xlim=log(range(FailureT)), ylim=c(-3,1), xlab="log(time)", ylab="log(-log R(t))")
for (i in 1:length(unique(inverseTemp))) {
  points(log(Rhat[i]$time),log(-log(Rhat[i]$surv)), type="s") # cloglog-transformed K-M vs. log time
  x <- unique(inverseTemp)[i]
  mu <- weibreg$coefficients %*% c(1,x)
  sigma <- weibreg$scale
  theta <- exp(mu)
  alpha <- 1/sigma
  abline(-alpha*log(theta),alpha)
}

# March 30
# Parametric MLE for nonhomogeneous Poisson processes (NHPPs)
# Grampus data, times in days of valve seat replacements in 41 diesel locomotives,
# from p. 119 in
# https://books.google.no/books?id=DQBqbl40EXUC&pg=PA118&lpg=PA118&dq=grampus+data+recurrent+events&source=bl&ots=zNXVcwdZYv&sig=TtQXZYxef4FSb3QZN-cT_uT8xT8&hl=no&sa=X&ved=0ahUKEwjdjcic1PvSAhVCXiwKHR59BwMQ6AEIIDAA#v=onepage&q=grampus%20data%20recurrent%20events&f=false
# s1, s2, ... are the replacement times and tau the service time
cat(" j, s1 ,s2 ,s3 ,s4 ,tau
1 , , , , ,761
2 , , , , ,759
3 ,98 , , , ,667
4 ,326,643,653, ,667
5 , , , , ,665
6 ,84 , , , ,667
7 ,87 , , , ,663
8 ,646, , , ,653
9 , , , , ,651
10,92 , , , ,653
11,258,328,377,621,650
12,61 ,539, , ,648
13,254,276,298,640,644
14,76 ,539, , ,642
15,635, , , ,641
16,349,404,561, ,649
17, , , , ,631
18, , , , ,596
19,120,479, , ,614
20,323,447, , ,582
21,139,139, , ,589
22, , , , ,593
23,573, , , ,589
24,165,408,604, ,606
25,249, , , ,594
26,344,497, , ,613
27,265,586, , ,595
28,166,206,348, ,389
29, , , , ,601
30,410,581, , ,601
31, , , , ,611
32, , , , ,608
33, , , , ,587
34,367, , , ,603
35,202,563,570, ,585
36, , , , ,587
37, , , , ,578
38, , , , ,578
39, , , , ,586
40, , , , ,585
41, , , , ,582
", file="/tmp/temp.dat")
data <- read.table("/tmp/temp.dat",na.strings="- ",header=TRUE,sep=",")
tau <- data[,6]
s <- data[,2:5]
# Plot the data
par(mfrow=c(2,1))
plot(NA,xlim=c(0,max(tau)),ylim=c(0,length(tau)), xlab="time",ylab="unit (locomotive)")
for (j in 1:length(tau)) {
  lines(c(0,tau[j]),c(j,j))
  ss <- s[j,]
  ss <- ss[!is.na(ss)]
  points(ss,rep(j,length(ss)))
}
# List of functions specifying a power law NHPP
powerlaw <- list(
  logw=function(t,par) log(par[2]) - log(par[1]) + (par[2]-1)*log(t/par[1]),
  W=function(t,par) (t/par[1])^par[2]
)
# List of functions specifying a log linear NHPP
loglinear <- list(
  logw=function(t,par) par[1]+par[2]*t,
  W=function(t,par) {
    if (par[2]==0) exp(par[1])*t else exp(par[1])/par[2]*(exp(par[2]*t)-1)
  }
)
# negative log likelihood of data from an NHPP with log intensity function logw and
# mean function W contained in the list 'model'
negloglik <- function(par,model,s,tau) {
  tmp <- -sum(model$W(tau, par)) + sum(model$logw(s, par),na.rm=TRUE)
  -tmp
}
powerlawfit <- optim(c(.01,1), negloglik, model=powerlaw, s=s, tau=tau,
                     method="L-BFGS-B", lower=c(.001,.001), hessian=TRUE)
powerlawfit
se <- sqrt(diag(solve(powerlawfit$hessian)))
cbind(powerlawfit$par,se)
# Wald test of H_0: beta = 1 (HPP)
z <- (powerlawfit$par[2]-1)/se[2]
z
2*pnorm(abs(z),lower.tail=FALSE) # p-value
loglinfit <- optim(c(log(.001),0), negloglik, model=loglinear, s=s, tau=tau, hessian=TRUE)
se <- sqrt(diag(solve(loglinfit$hessian)))
cbind(loglinfit$par,se)
# Wald test of H_0: beta = 0 (HPP)
z <- (loglinfit$par[2])/se[2]
z
2*pnorm(abs(z),lower.tail=FALSE) # p-value
# Plot the two fitted models (intensity functions)
curve(exp(loglinear$logw(x,loglinfit$par)),to=max(tau),ylim=c(0,.005))
curve(exp(powerlaw$logw(x,powerlawfit$par)),to=max(tau),ylim=c(0,.005),add=TRUE)
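# As a rough extra check (using the functions above): a likelihood ratio version of the
# test of H_0: beta = 1 (HPP) within the power law model; the search interval for the
# HPP scale parameter theta is a guess wide enough to contain the optimum.
hppfit <- optimize(function(theta) negloglik(c(theta, 1), model=powerlaw, s=s, tau=tau),
                   interval=c(1, 1e5))
LRT <- 2*(hppfit$objective - powerlawfit$value) # 2*(maximised loglik, power law minus HPP)
LRT
pchisq(LRT, df=1, lower.tail=FALSE)             # p-value, compare with the Wald test above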