##################################################################### ##################################################################### # # Discriminant Analysis # ##################################################################### ##################################################################### # We can read the SAT scores (with graduation information) data from the Internet: # Note: These data are based on the old (pre-2016) SAT scoring system, which was out of 2400 total points. satgradu <- read.table("http://www.stat.sc.edu/~hitchcock/satgradu.txt", header=T) # If not connected to the Internet, we could save the file and read it # similarly to the following: # satgradu<-read.table("Z:/stat_530/satgradu.txt", header=T) # The first three columns are the SAT scores on the 3 tests, # the last column (called gradu) is an indicator of whether the student successfully graduated # (1 = graduated, 0 = did not graduate) attach(satgradu) # Using the built-in lda function in the MASS package # for linear discriminant analysis: library(MASS) # assuming equal prior probabilities of graduating or not: dis <- lda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.5, 0.5)) # a1, a2, a3 are given as "Coefficients of linear discriminants". # Let's predict whether a new applicant with # SAT scores of: math = 550, reading = 610, writing = 480 # will graduate: newobs <- rbind( c(550,610,480) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(dis,newdata=newobs)$class # Posterior probabilities of this applicant being in each group: predict(dis,newdata=newobs)$posterior # Making predictions for several new individuals at once: newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(dis,newdata=newobs) # assuming prior probability of graduating is about twice as large # as probability of not graduating: # dis <- lda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.33, 0.67)) # If we do not specify any prior probabilities, it will by default use the proportions # of the sampled individuals that are in each group as the prior probabilities. # Sometimes this is the best approach to take, unless you have some prior idea about the group probabilities! dis2 <- lda(gradu ~ math + reading + writing, data=satgradu) dis2 # Using the default approach to predict whether a new applicant with # SAT scores of: math = 550, reading = 610, writing = 480 # will graduate: newobs <- rbind( c(550,610,480) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(dis2,newdata=newobs)$class # Posterior probabilities of this applicant being in each group: predict(dis2,newdata=newobs)$posterior # Making predictions for several new individuals at once with the default approach: newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(dis2,newdata=newobs) ##### Misclassification rate of LDA rule: # Simple plug-in misclassification rate: group<-predict(dis, satgradu, method='plug-in')$class table(group,gradu) # The plug-in misclassification rate for LDA here is (11+4)/40 = 0.375. # cross-validation rate of LDA rule: ######## correct<-rep(0,times=nrow(satgradu) ) for (j in 1:nrow(satgradu) ) { mydis<-lda(grouping=gradu[-j], x=satgradu[-j,1:3], prior=c(0.5,0.5)) mypred<-predict(mydis,newdata=satgradu[j,1:3])$class correct[j] <- (mypred==gradu[j]) } cv.misclass <- 1-mean(correct) cv.misclass ######### # The cross-validation misclassification rate for LDA here is 0.425. ### Compare these to the approaches using the default prior probabilities! #################################################################################### #################################################################################### ####### Quadratic discriminant analysis can be implemented with the qda function: disquad <- qda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.5, 0.5)) ##### Misclassification rate of QDA rule: # Simple plug-in misclassification rate: group<-predict(disquad, satgradu, method='plug-in')$class table(group,gradu) # The plug-in misclassification rate for QDA here is (10+2)/40 = 0.3. # cross-validation rate of QDA rule: ######## correct<-rep(0,times=nrow(satgradu) ) for (j in 1:nrow(satgradu) ) { mydisquad<-qda(grouping=gradu[-j], x=satgradu[-j,1:3], prior=c(0.5,0.5)) mypred<-predict(mydisquad,newdata=satgradu[j,1:3])$class correct[j] <- (mypred==gradu[j]) } cv.misclass <- 1-mean(correct) cv.misclass ######### # The cross-validation misclassification rate for QDA here is still 0.425. ### Compare these to the approaches using the default prior probabilities! #################################################################################### #################################################################################### ### Discriminant Analysis with More than 2 groups: # With the Egyptian skulls data from Table 5.8: skulls <- read.table("http://www.stat.sc.edu/~hitchcock/skullschap7.txt", header=T) attach(skulls) # Note that in the training sample, there are actually 30 skulls from EACH epoch: skull.lda <- lda(EPOCH~MB+BH+BL+NH, prior=c(0.2,0.2,0.2,0.2,0.2) ) skull.lda # Let's predict the epoch of a new skull with # MB = 135, BH = 144, BL = 97, NH = 53: newobs <- rbind( c(135,144,97,53) ) dimnames(newobs) <- list(NULL,c('MB','BH', 'BL', 'NH')) newobs <- data.frame(newobs) predict(skull.lda,newdata=newobs) #################################################################################### #################################################################################### # # Classification Using Logistic Regression, for the SAT scores example: # # We can read the SAT scores (with graduation information) data from the Internet: satgradu <- read.table("http://www.stat.sc.edu/~hitchcock/satgradu.txt", header=T) # If not connected to the Internet, we could save the file and read it # similarly to the following: # satgradu<-read.table("Z:/stat_530/satgradu.txt", header=T) # The first three columns are the SAT scores on the 3 tests, # the last column (called gradu) is an indicator of whether the student successfully graduated # (1 = graduated, 0 = did not graduate) attach(satgradu) glm.fit.sat <- glm(gradu ~ math + reading + writing, data=satgradu, family=binomial) summary(glm.fit.sat) # The z-tests tell us which explanatory variables are most important in predicting whether a student graduates. # We see the reading score is the most important explanatory variable is predicting graduation. # Let's predict whether a new applicant with # SAT scores of: math = 550, reading = 610, writing = 480 # will graduate: newobs <- rbind( c(550,610,480) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(glm.fit.sat,newdata=newobs, type='response') # Making predictions for several new individuals at once: newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) ) dimnames(newobs) <- list(NULL,c('math','reading', 'writing')) newobs <- data.frame(newobs) predict(glm.fit.sat,newdata=newobs, type='response') ##### Misclassification rate of logistic regression rule: # Simple plug-in misclassification rate: group.probs<-predict(glm.fit.sat, satgradu, type='response') pred.group <- rep(0,times=nrow(satgradu)) pred.group[group.probs > 0.5] <- 1 table(pred.group,gradu) # The plug-in misclassification rate for logistic regression here is 10/40 = 0.25, # but all the students have been predicted to graduate! Seems a little strange ... # If we change the cutoff to 0.6, then two people are predicted to not graduate, and # the misclassification rate is still (1+9)/40 = 0.25: group.probs<-predict(glm.fit.sat, satgradu, type='response') pred.group <- rep(0,times=nrow(satgradu)) pred.group[group.probs > 0.6] <- 1 table(pred.group,gradu) # cross-validation rate of logistic regression rule: ######## correct<-rep(0,times=nrow(satgradu) ) for (j in 1:nrow(satgradu) ) { glm.fit.no.j<-glm(gradu ~ math + reading + writing, data=satgradu, family=binomial, subset=-j) mypred<-(predict(glm.fit.no.j,newdata=satgradu[j,1:3],typ='response') > 0.5) correct[j] <- (mypred==gradu[j]) } cv.misclass <- 1-mean(correct) cv.misclass ######### # The cross-validation misclassification rate for logistic regression here is 0.275.