#################################
#
## k-nearest neighbors
#
#################################

# load the 'class' package:

library(class)

# load the 'ISLR' package:

library(ISLR)

# Alternatively, reading in the 'Caravan' data set from the web:

Caravan <- read.table("http://people.stat.sc.edu/hitchcock/Caravan_data.txt", header=T)

attach(Caravan)

# This data set has 5822 observations and 86 variables!
# We will try to predict whether a person will purchase insurance, based on the 85 other variables.

# We will treat the first 1000 rows as the "test" data and the other 4822 rows as the training data.

# Standardizing using the 'scale' function:

Caravan.X <- Caravan[,-86]   # removing the categorical response variable, "Purchase"
Caravan.X.std <- scale(Caravan.X)

# Defining the training data:

Purchase.training <- Purchase[-(1:1000)]
X.training <- Caravan.X.std[-(1:1000),]

# Defining the test data:

X.test <- Caravan.X.std[1:1000,]

# Using KNN with K=1 to predict 'Purchase' on the test data:

knn.pred <- knn(train = X.training, test = X.test, cl = Purchase.training, k=1)

# Seeing the classifications for the 1000 test customers:

print(knn.pred)

# Since we actually know which of the first 1000 customers purchased insurance, we can see how well we did:

Purchase.test <- Purchase[1:1000]

# misclassification rate for the 1000 test customers:

mean(Purchase.test != knn.pred)

# It is around 12%.

### On the SAT/graduation data set:

# We can read the SAT scores (with graduation information) data from the Internet:

satgradu <- read.table("http://people.stat.sc.edu/hitchcock/satgradu.txt", header=T)

# If not connected to the Internet, we could save the file and read it
# similarly to the following:
# satgradu <- read.table("Z:/stat_530/satgradu.txt", header=T)

# The first three columns are the SAT scores on the 3 tests;
# the last column (called gradu) is an indicator of whether the student successfully graduated
# (1 = graduated, 0 = did not graduate).

attach(satgradu)

X.train.sat <- satgradu[,c('math','reading','writing')]

# Making predictions for several new individuals at once:

newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) )
dimnames(newobs) <- list(NULL, c('math','reading','writing'))
newobs <- data.frame(newobs)

# KNN with various values of K:

dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=1, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=4, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=10, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=20, prob=TRUE)
print(dis.knn)

##### Misclassification rate of the KNN classification rule:

# Simple plug-in misclassification rate (for KNN with K=4):

group <- knn(train = X.train.sat, test = X.train.sat, cl = gradu, k=4)
table(group, gradu)

# The plug-in misclassification rate for KNN here is (2+8)/40 = 0.25.
# The plug-in misclassification rate for LDA here was (11+4)/40 = 0.375.
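# The plug-in rates above reuse the training data for evaluation, so they tend to be
# optimistic (each training point counts among its own nearest neighbors).  As a
# minimal leave-one-out check, one could use knn.cv() from the 'class' package --
# a sketch, reusing the X.train.sat and gradu objects defined above:

group.cv <- knn.cv(train = X.train.sat, cl = gradu, k=4)

# leave-one-out misclassification rate for KNN with K=4:

mean(group.cv != gradu)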
#################################
#
## Regression Trees
#
#################################

# load the 'rpart' package:

library(rpart)

# load the 'MASS' package:

library(MASS)

# loading the 'Boston' dataset:

data(Boston)

# Sample of 506 tracts in suburban Boston

# attaching the 'Boston' dataset:
# attach(Boston)

# examining a scatterplot matrix:
# pairs(Boston, pch=".")

# Goal: Create a regression tree that will predict the median value of a home (medv, measured in thousands of dollars)
# based on 13 possible explanatory variables:

## The variables:
# crim: per capita crime rate by town.
# zn: proportion of residential land zoned for lots over 25,000 sq.ft.
# indus: proportion of non-retail business acres per town.
# chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox: nitrogen oxides concentration (parts per 10 million).
# rm: average number of rooms per dwelling.
# age: proportion of owner-occupied units built prior to 1940.
# dis: weighted mean of distances to five Boston employment centres.
# rad: index of accessibility to radial highways.
# tax: full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# lstat: lower status of the population (percent).
# medv: median value of owner-occupied homes in $1000s.

# fitting a regression tree, with default settings:

bos.tree <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# basic summary output:

print(bos.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):

plot(bos.tree); text(bos.tree)

## At each split: If the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):

plot(bos.tree, compress=T, uniform=T, branch=0.4); text(bos.tree)

## Diagnostics based on residuals:

plot(predict(bos.tree), residuals(bos.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(bos.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predict the median home value for an observation that is "typical" in all explanatory variables:

x.med <- apply(Boston[,-14], 2, median)
predict(bos.tree, data.frame(t(x.med)))

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:

x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.tree, data.frame(t(x.val)))

#################################################
##
##  Tree Pruning (Selection)
##
#################################################

# cp = the ratio of lambda to the RSS of the no-branch (root) tree
# A larger cp encourages a less complex tree
# A smaller cp allows a tree with lots of splits and branches

# Listing all trees with cp of at least 0.001:

bos.tree.cp <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston, cp=0.001)
printcp(bos.tree.cp)

# Selecting the cp that minimizes the 'xerror':

bos.tree.best <- prune.rpart(bos.tree.cp, cp=0.0011373)

# Plotting the 'best' tree:

plot(bos.tree.best); text(bos.tree.best)

## This is a more complex solution than before (too complex?).

##### A simpler tree (with higher cp):

bos.tree.simp <- prune.rpart(bos.tree.cp, cp=0.0361643)

# Plotting this tree:

plot(bos.tree.simp); text(bos.tree.simp)

##### Relative Error vs. cp plot:

plotcp(bos.tree.best)

## Where is the "elbow" in the plot?
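# Rather than copying the minimizing cp value from the printcp() output by hand,
# it can be pulled from the fitted object's 'cptable' component -- a sketch
# (the 'xerror' column comes from cross-validation, so the selected cp may
# vary from run to run):

cp.min <- bos.tree.cp$cptable[ which.min(bos.tree.cp$cptable[,"xerror"]), "CP" ]
bos.tree.auto <- prune.rpart(bos.tree.cp, cp=cp.min)
plot(bos.tree.auto); text(bos.tree.auto)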
#
# Predictions can then be done as above, based on whatever tree we select.
#

##### A medium-sized tree (with cp around the elbow):

bos.tree.medium <- prune.rpart(bos.tree.cp, cp=0.0158512)

# Plotting this tree:

plot(bos.tree.medium); text(bos.tree.medium)

# Basically looks like the one R originally gave us using default settings.

#################################
#
## Classification Trees
#
#################################

# load the 'rpart' package:

library(rpart)

### Example:

# Reading in the hsb data:

hsb <- read.table("http://people.stat.sc.edu/hitchcock/hsbdata.txt", header=T)

attach(hsb)

# A classification tree to classify the students into one of three program types (academic, vocational, and general)
# based on numerous predictor variables (gender, race, ses, schtyp, read, write, math, science, socst).

tree.prog <- rpart(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb, method="class")

# Summary of object:

tree.prog

# Plotting this tree:

plot(tree.prog); text(tree.prog)

# Predicting the program type for a student with
# gender='female', race='white', ses='low', schtyp='public', read=55, write=51, math=44, science=49, socst=56:

# Since some predictors have "character" values and some predictors have numeric values,
# we must put the values directly into a data frame rather than creating a vector first:

predict(tree.prog, data.frame(gender='female', race='white', ses='low', schtyp='public', read=55, write=51, math=44, science=49, socst=56))

#
# Similar tree pruning/selection as with regression trees could be done, if desired.
#

##### Misclassification rates of the LDA rule vs. the classification tree:

# Simple plug-in misclassification rate of the LDA rule here:

library(MASS)

# assuming equal prior probabilities for the three program types:

dis <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb, prior=c(1/3, 1/3, 1/3))
group <- predict(dis, hsb, method='plug-in')$class
tab.lda.eq <- table(group, prog)

# The plug-in misclassification rate for LDA here with equal prior probabilities is:

1 - ( sum(diag(tab.lda.eq)) / sum(tab.lda.eq) )
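# Since plug-in rates evaluate the rule on the same data used to fit it, a less
# optimistic estimate comes from leave-one-out cross-validation, available in
# lda() via CV=TRUE -- a sketch, reusing the formula and equal priors from above:

dis.cv <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst,
              data=hsb, prior=c(1/3, 1/3, 1/3), CV=TRUE)

# leave-one-out misclassification rate for the equal-priors LDA rule:

mean(dis.cv$class != prog)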
# If we do not specify any prior probabilities, lda() will by default use the proportions
# of the sampled individuals that are in each group as the prior probabilities.

dis2 <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb)
group <- predict(dis2, hsb, method='plug-in')$class
tab.lda.prop <- table(group, prog)

# The plug-in misclassification rate for LDA here with proportional prior probabilities is:

1 - ( sum(diag(tab.lda.prop)) / sum(tab.lda.prop) )

group <- predict(tree.prog, type="class")
tab.tree <- table(group, prog)

# The plug-in misclassification rate for the classification tree is:

1 - ( sum(diag(tab.tree)) / sum(tab.tree) )

## The 'rpart' function will also accept user-specified prior class probabilities
## (by default it uses the proportions of individuals in each group as the prior probabilities):

tree.prog.eq <- rpart(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb,
                      method="class", parms = list(prior = c(1/3, 1/3, 1/3)) )
group <- predict(tree.prog.eq, type="class")
tab.tree.eq <- table(group, prog)

# The plug-in misclassification rate for the classification tree with equal prior probabilities is:

1 - ( sum(diag(tab.tree.eq)) / sum(tab.tree.eq) )

#################################
#
## Random Forests
#
#################################

# loading the 'randomForest' package:

library(randomForest)

##
## Regression with random forest:
##

library(MASS)

# loading the 'Boston' dataset:

data(Boston)

bos.rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# The default 'ntree' is 500.
# The default 'mtry' is p/3 for regression and sqrt(p) for classification, where 'p' is the number of predictors.
# These can be adjusted if desired:
# bos.rf2 <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston, ntree=515, mtry=5)

print(bos.rf)

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:

x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.rf, data.frame(t(x.val)))

# Seeing how important each explanatory variable was:
# (the higher the number, the more important)

bos.rf$importance
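# The 'mtry' value could also be chosen based on out-of-bag (OOB) error using
# tuneRF() from the 'randomForest' package -- a sketch (OOB error is random,
# so the selected 'mtry' may vary from run to run; the seed is assumed here
# only for reproducibility):

set.seed(1)
tuneRF(Boston[,-14], Boston$medv, ntreeTry=500, stepFactor=1.5, improve=0.01)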
##
## Classification with random forest:
##

# Classifying the students into one of three program types (academic, vocational, and general)
# based on numerous predictor variables (gender, race, ses, schtyp, read, write, math, science, socst).

# Note: randomForest() has no 'method' argument; it fits a classification forest
# automatically because the response is coerced to a factor:

prog.rf <- randomForest(as.factor(prog) ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb)

print(prog.rf)

# Seeing how important each explanatory variable was in the classification:
# (the higher the number, the more important)

prog.rf$importance

# With the Egyptian skulls data from Table 5.8:

skulls <- read.table("http://www.stat.sc.edu/~hitchcock/skullschap7.txt", header=T)

attach(skulls)

skull.rf <- randomForest(as.factor(EPOCH) ~ MB+BH+BL+NH)
skull.rf

# Let's predict the epoch of a new skull with
# MB = 135, BH = 144, BL = 97, NH = 53:

newobs <- rbind( c(135,144,97,53) )
dimnames(newobs) <- list(NULL, c('MB','BH','BL','NH'))
newobs <- data.frame(newobs)

predict(skull.rf, newdata=newobs, type='response')
predict(skull.rf, newdata=newobs, type='prob')

# Seeing how important each explanatory variable was in the classification:
# (the higher the number, the more important)

skull.rf$importance

## Specifying prior class probabilities via 'classwt':

skull.rf.eq <- randomForest(as.factor(EPOCH) ~ MB+BH+BL+NH, classwt=c(0.2,0.2,0.2,0.2,0.2))
skull.rf.eq

# Let's predict the epoch of a new skull with
# MB = 135, BH = 144, BL = 97, NH = 53:

newobs <- rbind( c(135,144,97,53) )
dimnames(newobs) <- list(NULL, c('MB','BH','BL','NH'))
newobs <- data.frame(newobs)

predict(skull.rf.eq, newdata=newobs, type='response')
predict(skull.rf.eq, newdata=newobs, type='prob')

# Recall:
# skull.lda <- lda(as.factor(EPOCH) ~ MB+BH+BL+NH, prior=c(0.2,0.2,0.2,0.2,0.2) )
# predict(skull.lda, newdata=newobs)
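# As a less optimistic counterpart to the plug-in rates seen earlier, each fitted
# classification forest stores an out-of-bag (OOB) confusion matrix -- a sketch
# comparing the default forest with the equal-classwt forest:

skull.rf$confusion
skull.rf.eq$confusion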