#####################################################################
#####################################################################
#  
#  Discriminant Analysis
#
#####################################################################
#####################################################################

# We can read the SAT scores (with graduation information) data from the Internet:

# Note:  These data are based on the old (pre-2016) SAT scoring system, which was out of 2400 total points.

satgradu <- read.table("http://www.stat.sc.edu/~hitchcock/satgradu.txt", header=T)

# If not connected to the Internet, we could save the file and read it 
# similarly to the following:

# satgradu<-read.table("Z:/stat_530/satgradu.txt", header=T)


# The first three columns are the SAT scores on the 3 tests,
# the last column (called gradu) is an indicator of whether the student successfully graduated
# (1 = graduated, 0 = did not graduate)

attach(satgradu)

# Using the built-in lda function in the MASS package
# for linear discriminant analysis:

library(MASS)

# assuming equal prior probabilities of graduating or not:

dis <- lda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.5, 0.5))

# a1, a2, a3 are given as "Coefficients of linear discriminants".

# Let's predict whether a new applicant with 
# SAT scores of: math = 550, reading = 610, writing = 480
# will graduate:

newobs <- rbind( c(550,610,480) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(dis,newdata=newobs)$class

# Posterior probabilities of this applicant being in each group:

predict(dis,newdata=newobs)$posterior

# Making predictions for several new individuals at once:

newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(dis,newdata=newobs)


# assuming prior probability of graduating is about twice as large 
# as probability of not graduating:

# dis <- lda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.33, 0.67))

# If we do not specify any prior probabilities, it will by default use the proportions
# of the sampled individuals that are in each group as the prior probabilities.
# Sometimes this is the best approach to take, unless you have some prior idea about the group probabilities!

dis2 <- lda(gradu ~ math + reading + writing, data=satgradu)

dis2

# Using the default approach to predict whether a new applicant with 
# SAT scores of: math = 550, reading = 610, writing = 480
# will graduate:

newobs <- rbind( c(550,610,480) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(dis2,newdata=newobs)$class

# Posterior probabilities of this applicant being in each group:
predict(dis2,newdata=newobs)$posterior

# Making predictions for several new individuals at once with the default approach:

newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(dis2,newdata=newobs)


##### Misclassification rate of LDA rule:

# Simple plug-in misclassification rate:

group<-predict(dis, satgradu, method='plug-in')$class
table(group,gradu)

# The plug-in misclassification rate for LDA here is (11+4)/40 = 0.375.

# cross-validation rate of LDA rule:

########
correct<-rep(0,times=nrow(satgradu) )
for (j in 1:nrow(satgradu) ) {
mydis<-lda(grouping=gradu[-j], x=satgradu[-j,1:3], prior=c(0.5,0.5))
mypred<-predict(mydis,newdata=satgradu[j,1:3])$class
correct[j] <- (mypred==gradu[j])
}
cv.misclass <- 1-mean(correct)
cv.misclass
#########

# The cross-validation misclassification rate for LDA here is 0.425.

### Compare these to the approaches using the default prior probabilities!

####################################################################################
####################################################################################

####### Quadratic discriminant analysis can be implemented with the qda function:

disquad <- qda(gradu ~ math + reading + writing, data=satgradu, prior=c(0.5, 0.5))

##### Misclassification rate of QDA rule:

# Simple plug-in misclassification rate:

group<-predict(disquad, satgradu, method='plug-in')$class
table(group,gradu)

# The plug-in misclassification rate for QDA here is (10+2)/40 = 0.3.

# cross-validation rate of QDA rule:

########
correct<-rep(0,times=nrow(satgradu) )
for (j in 1:nrow(satgradu) ) {
mydisquad<-qda(grouping=gradu[-j], x=satgradu[-j,1:3], prior=c(0.5,0.5))
mypred<-predict(mydisquad,newdata=satgradu[j,1:3])$class
correct[j] <- (mypred==gradu[j])
}
cv.misclass <- 1-mean(correct)
cv.misclass
#########

# The cross-validation misclassification rate for QDA here is still 0.425.

### Compare these to the approaches using the default prior probabilities!

####################################################################################
####################################################################################

### Discriminant Analysis with More than 2 groups:

# With the Egyptian skulls data from Table 5.8:

skulls <- read.table("http://www.stat.sc.edu/~hitchcock/skullschap7.txt", header=T)

attach(skulls)

# Note that in the training sample, there are actually 30 skulls from EACH epoch:

skull.lda <- lda(EPOCH~MB+BH+BL+NH, prior=c(0.2,0.2,0.2,0.2,0.2) )
skull.lda 

# Let's predict the epoch of a new skull with 
# MB = 135, BH = 144, BL = 97, NH = 53:

newobs <- rbind( c(135,144,97,53) )
dimnames(newobs) <- list(NULL,c('MB','BH', 'BL', 'NH'))
newobs <- data.frame(newobs)
predict(skull.lda,newdata=newobs)


####################################################################################
####################################################################################
#
# Classification Using Logistic Regression, for the SAT scores example:
#

# We can read the SAT scores (with graduation information) data from the Internet:

satgradu <- read.table("http://www.stat.sc.edu/~hitchcock/satgradu.txt", header=T)

# If not connected to the Internet, we could save the file and read it 
# similarly to the following:

# satgradu<-read.table("Z:/stat_530/satgradu.txt", header=T)


# The first three columns are the SAT scores on the 3 tests,
# the last column (called gradu) is an indicator of whether the student successfully graduated
# (1 = graduated, 0 = did not graduate)

attach(satgradu)

glm.fit.sat <- glm(gradu ~ math + reading + writing, data=satgradu, family=binomial)
summary(glm.fit.sat)

# The z-tests tell us which explanatory variables are most important in predicting whether a student graduates.
# We see the reading score is the most important explanatory variable is predicting graduation.

# Let's predict whether a new applicant with 
# SAT scores of: math = 550, reading = 610, writing = 480
# will graduate:

newobs <- rbind( c(550,610,480) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(glm.fit.sat,newdata=newobs, type='response')


# Making predictions for several new individuals at once:

newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) )
dimnames(newobs) <- list(NULL,c('math','reading', 'writing'))
newobs <- data.frame(newobs)
predict(glm.fit.sat,newdata=newobs, type='response')


##### Misclassification rate of logistic regression rule:

# Simple plug-in misclassification rate:

group.probs<-predict(glm.fit.sat, satgradu, type='response')
pred.group <- rep(0,times=nrow(satgradu))
pred.group[group.probs > 0.5] <- 1 
table(pred.group,gradu)

# The plug-in misclassification rate for logistic regression here is 10/40 = 0.25, 
# but all the students have been predicted to graduate!  Seems a little strange ...

#  If we change the cutoff to 0.6, then two people are predicted to not graduate, and
# the misclassification rate is still (1+9)/40 = 0.25:

group.probs<-predict(glm.fit.sat, satgradu, type='response')
pred.group <- rep(0,times=nrow(satgradu))
pred.group[group.probs > 0.6] <- 1 
table(pred.group,gradu)



# cross-validation rate of logistic regression rule:

########
correct<-rep(0,times=nrow(satgradu) )
for (j in 1:nrow(satgradu) ) {
glm.fit.no.j<-glm(gradu ~ math + reading + writing, data=satgradu, family=binomial, subset=-j)
mypred<-(predict(glm.fit.no.j,newdata=satgradu[j,1:3],typ='response') > 0.5)
correct[j] <- (mypred==gradu[j])
}
cv.misclass <- 1-mean(correct)
cv.misclass
#########

# The cross-validation misclassification rate for logistic regression here is 0.275.

##########################################################

install.packages("pROC")
library(pROC)

# The ROC curve:  A way to judge the performance of the logistic regression model as a classifier, across a large grid of thresholds/cutoffs:

myroc <- roc(gradu,predict(glm.fit.sat, satgradu, type = "response")) 
plot(myroc)

# Sensitivity is the probability of predicting Y=1 for an individual who truly has Y=1
# Specificity is the probability of predicting Y=0 for an individual who truly has Y=0
# The bold curve plots (Specificity, Sensitivity) for a grid of cutoff thresholds going from 0 to 1.
# We want both Specificity and Sensitivity to be large, so we want a large Area Under Curve (AUC) when looking at the ROC plot.
# It's just so-so for this example ...