## R-code for lda analysis adapted from R-code provided in ## https://stat.ethz.ch/pipermail/r-help/2010-May/239195.html # ========================================================================================= # (1) Generate sample labelled data # ========================================================================================= # needed for rmvnorm library(mvtnorm) set.seed(11) # true covariance matrix sigma <- diag(c(0.5, 0.25)) mu <- matrix(0,nrow=3,ncol=2) # true mean values mu[1,] <- c(-1,1) mu[2,] <- c(1,1) mu[3,] <- c(0,-2) n <- 300 J <- 3 x <- matrix(0, nrow = n, ncol = J) # class labels x[,3] <- rep(1:3, each = n/J) x[1 :100,1:2] <- rmvnorm(n = n/J, mean = mu[1,], sigma = sigma) x[101:200,1:2] <- rmvnorm(n = n/J, mean = mu[2,], sigma = sigma) x[201:300,1:2] <- rmvnorm(n = n/J, mean = mu[3,], sigma = sigma) colnames(x) <- c("x", "y", "class") # ========================================================================================= # (2) Plot the labelled data # ========================================================================================= ## ## Function for plotting the data separated by classes, hacked out of predplot: ## http://stat.ethz.ch/teaching/lectures/FS_2010/CompStat/predplot.R ## plotclasses <- function(x, main = "", len = 200, ...) { xp <- seq(min(x[,1]), max(x[,1]), length=len) yp <- seq(min(x[,2]), max(x[,2]), length=len) grid <- expand.grid(xp, yp) colnames(grid) <- colnames(x)[-3] plot(x[,1],x[,2],col=c(1,2,4)[x[,3]],pch=x[,3],main=main,xlab='x',ylab='y', ...) } #plotclasses(x) # ========================================================================================= # (3) Functions needed: calculate separating hyperplane between two given # classes and converting hyperplanes to line equations for the p=2 case # ========================================================================================= ## Returns the coefficients for the hyperplane that separates one class from another. ## Computes the coefficients according to the formula: ## $x^T\hat{\Sigma}^{-1}(\hat{\mu}_0-\hat{\mu}_1) - \frac{1}{2}(\hat{\mu}_0 + ## \hat{\mu}_1)^T\hat{\Sigma}^{-1}(\hat{\mu}_0-\hat{\mu}_1)+\log(\frac{p_0}{p_1})$ ## ## sigmainv(DxD) - precalculated sigma (covariance matrix) inverse ## mu1(1xD) - precalculated mu mean for class 1 ## mu2(1xD) - precalculated mu mean for class 2 ## prior1 - precalculated prior probability for class 1 ## prior2 - precalculated prior probability for class 2 ownldahyperplane <- function(sigmainv,mu1,mu2,prior1,prior2) { J <- nrow(mu) b <- sigmainv%*%(mu1 - mu2) c <- -(1/2)*t(mu1 + mu2)%*%sigmainv%*%(mu1 - mu2) + log(prior1/prior2) return(list(b=b,c=c)) } ## Returns linear betas (intersect and slopes) for the given hyperplane structure. ## The structure is a list that matches the output of the function defined above. ownlinearize <- function(sephyp) { # line slope and intersect return(list(beta0=-sephyp$c/sephyp$b[2], beta1=-sephyp$b[1]/sephyp$b[2])) } # ========================================================================================= # (4) Run lda # ========================================================================================= # needed for lda/qda library(MASS) fit <- lda(x=x[,1:2],grouping=x[,3]) # extract A matrix A <- fit$scaling # calculate sigma hat inverse sigmahatinv <- A%*%t(A) # get prior hat probabilities priorhat <- fit$prior # get mu hat muhat <- fit$means # ========================================================================================= # (5) Calculate the separating hyperplanes which can be added using abline # or create the class boundaries using lines by fixing six points. # Run the abline one at the time after running step 2 anew # ========================================================================================= # calculate dec. boundary 1-2 sephyp12 <- ownldahyperplane(sigmahatinv,muhat[1,],muhat[2,], priorhat[1],priorhat[2]) # get line for 1-2 line12 <- ownlinearize(sephyp12) sephyp23 <- ownldahyperplane(sigmahatinv,muhat[2,],muhat[3,], priorhat[2],priorhat[3]) line23 <- ownlinearize(sephyp23) sephyp13 <- ownldahyperplane(sigmahatinv,muhat[1,],muhat[3,], priorhat[1],priorhat[3]) line13 <- ownlinearize(sephyp13) # ======================================================================================== # (6) Run this snippet to see the class regions # ========================================================================================= A= cbind(c(line13$beta1, line23$beta1), c(1,1)) b= c(line13$beta0, line23$beta0) is12 = solve(A, b) plotclasses(x, xlim=c(-4,4), ylim=c(-4,4)) library(plotrix) # abline(a=line13$beta0, b=line13$beta1) # abline(a=line12$beta0, b=line12$beta1) # abline(a=line23$beta0, b=line23$beta1) ablineclip(a=line13$beta0, b=line13$beta1, x1=-4, x2=is12[1], y1=-4, y2=is12[2]) ablineclip(a=line12$beta0, b=line12$beta1, x1=-4, x2=4, y1=is12[2], y2=4) ablineclip(a=line23$beta0, b=line23$beta1, x1=is12[1], x2=4, y1=-4, y2=is12[2]) # ======================================================================================== # Predictions of new observations # ========================================================================================= topred <- as.data.frame(rbind(c(-2,-2), c(-2,1), c(0,0), c(2,2))) colnames(topred) <- c("x", "y") points(x=topred[,1], y=topred[,2], col="green", pch="*",cex=3) # use the predict function to classify the new points predict(fit, topred)$posterior predict(fit, topred)$class