#################################
#
## Regression Trees
#
#################################

# load the 'rpart' package:
library(rpart)

# load the 'MASS' package:
library(MASS)

# loading the 'Boston' dataset:
data(Boston)
# Sample of 506 tracts in suburban Boston

# attaching the 'Boston' dataset:
# attach(Boston)

# examining a scatterplot matrix:
# pairs(Boston, pch=".")

# Goal: Create a regression tree that predicts the median value of a home
# (medv, measured in thousands of dollars) based on 13 possible explanatory variables.

## The variables:
# crim: per capita crime rate by town.
# zn: proportion of residential land zoned for lots over 25,000 sq. ft.
# indus: proportion of non-retail business acres per town.
# chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox: nitrogen oxides concentration (parts per 10 million).
# rm: average number of rooms per dwelling.
# age: proportion of owner-occupied units built prior to 1940.
# dis: weighted mean of distances to five Boston employment centres.
# rad: index of accessibility to radial highways.
# tax: full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town.
# lstat: lower status of the population (percent).
# medv: median value of owner-occupied homes in $1000s.

# fitting a regression tree, with default settings:
bos.tree <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# basic summary output:
print(bos.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):
plot(bos.tree); text(bos.tree)
## At each split: if the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):
plot(bos.tree, compress=T, uniform=T, branch=0.4); text(bos.tree)

## Diagnostics based on residuals:
plot(predict(bos.tree), residuals(bos.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(bos.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predicting the median home value for an observation that is "typical"
# (at the median) in all explanatory variables:
x.med <- apply(Boston[,-14], 2, median)
predict(bos.tree, data.frame(t(x.med)))

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
# dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:
x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
           dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.tree, data.frame(t(x.val)))

#################################################
##
## Tree Pruning (Selection)
##
#################################################

# cp is the complexity parameter: the ratio of the penalty lambda to the RSS
# of the root (no-split) tree.
# A larger cp encourages a less complex tree;
# a smaller cp allows a tree with many splits and branches.

# Listing all trees with cp of at least 0.001:
bos.tree.cp <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                     data=Boston, cp=0.001)
printcp(bos.tree.cp)

# Selecting the cp that minimizes the cross-validated error 'xerror':
bos.tree.best <- prune.rpart(bos.tree.cp, cp=0.0022459)

# Plotting the 'best' tree:
plot(bos.tree.best); text(bos.tree.best)
## This is a more complex solution than before (too complex?).

##### A simpler tree (with a higher cp):
bos.tree.simp <- prune.rpart(bos.tree.cp, cp=0.0361643)

# Plotting this tree:
plot(bos.tree.simp); text(bos.tree.simp)

##### Relative Error vs. cp plot:
plotcp(bos.tree.best)
## Where is the "elbow" in the plot?
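## A small added sketch (not in the original script): selecting cp programmatically.
## The minimizing cp above (0.0022459) was read off the printcp() table by hand;
## it can instead be pulled from the fitted object's cptable. Note that 'xerror'
## comes from randomized cross-validation, so the selected cp can vary from run
## to run unless a seed is set before fitting.
cp.tab <- bos.tree.cp$cptable
best.cp <- cp.tab[which.min(cp.tab[, "xerror"]), "CP"]
bos.tree.auto <- prune.rpart(bos.tree.cp, cp=best.cp)

## A common alternative is the "one-SE rule": choose the simplest tree whose
## xerror is within one standard error (the 'xstd' column) of the minimum.
min.row <- which.min(cp.tab[, "xerror"])
thresh <- cp.tab[min.row, "xerror"] + cp.tab[min.row, "xstd"]
cp.1se <- cp.tab[min(which(cp.tab[, "xerror"] <= thresh)), "CP"]  # cptable rows run from simplest to most complex
bos.tree.1se <- prune.rpart(bos.tree.cp, cp=cp.1se)
plot(bos.tree.1se); text(bos.tree.1se)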
# Predictions can then be done as above, based on whatever tree we select.

################################################
##
## University admissions data
##
################################################

# load the 'rpart' package:
library(rpart)

# load the 'MASS' package:
library(MASS)

# reading in the 'univadmiss' dataset from Appendix C.4 of the book:
univadmiss <- read.table(file = "http://people.stat.sc.edu/Hitchcock/universityadmissions.txt",
                         header=FALSE, col.names = c('student', 'GPA', 'HSrank', 'ACT', 'Year'))
# Sample of 705 university freshmen

# attaching the 'univadmiss' dataset:
# attach(univadmiss)

# examining a scatterplot matrix:
# pairs(univadmiss, pch=".")

## The variables:
# student: student identification number.
# GPA: grade point average at the end of the freshman year.
# HSrank: high school class rank (as a percentile).
# ACT: ACT entrance examination score.
# Year: year the student entered the university.

# fitting a regression tree, with default settings:
univ.tree <- rpart(GPA ~ ACT+HSrank, data=univadmiss)

# basic summary output:
print(univ.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):
plot(univ.tree); text(univ.tree)
## At each split: if the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):
plot(univ.tree, compress=T, uniform=T, branch=0.4); text(univ.tree)

## Diagnostics based on residuals:
plot(predict(univ.tree), residuals(univ.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(univ.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predicting the freshman GPA for an observation with ACT=26, HSrank=90:
x.val <- c(ACT=26, HSrank=90)
predict(univ.tree, data.frame(t(x.val)))

### A bootstrap interval for this estimated mean GPA value with ACT=26, HSrank=90:
### Random-X resampling:
B <- 1000
yhat.star.vec <- rep(0, times=B)
x.val <- c(ACT=26, HSrank=90)
for (i in 1:B) {
  # resample the rows with replacement, refit the tree, and save the prediction:
  boot.samp <- univadmiss[sample(1:nrow(univadmiss), replace=T), ]
  univ.tree.boot <- rpart(GPA ~ ACT+HSrank, data=boot.samp)
  yhat.star.vec[i] <- predict(univ.tree.boot, data.frame(t(x.val)))
}
hist(yhat.star.vec)

alpha <- 0.05
# Percentile-method 95% CI for E(Y_h):
L.p <- quantile(yhat.star.vec, alpha/2)
U.p <- quantile(yhat.star.vec, 1-alpha/2)
print(paste("Percentile-method 95% CI for E(Y):", round(L.p,4), round(U.p,4)))

#################################################
##
## Tree Pruning (Selection)
##
#################################################

# Listing all trees with cp of at least 0.001:
univ.tree.cp <- rpart(GPA ~ ACT+HSrank, data=univadmiss, cp=0.001)
printcp(univ.tree.cp)

# Selecting the cp that minimizes the cross-validated error 'xerror':
univ.tree.best <- prune.rpart(univ.tree.cp, cp=0.0041288)

# Plotting the 'best' tree:
plot(univ.tree.best); text(univ.tree.best)
## This is a more complex solution than before (too complex?).

##### A simpler tree (with a higher cp):
univ.tree.simp <- prune.rpart(univ.tree.cp, cp=0.0068851)

# Plotting this tree:
plot(univ.tree.simp); text(univ.tree.simp)

# The 'minsplit' argument sets "the minimum number of observations that must
# exist in a node in order for a split to be attempted".
# The default minsplit is 20, but this can be adjusted:
univ.tree.cp.ms <- rpart(GPA ~ ACT+HSrank, data=univadmiss, minsplit=10)
plot(univ.tree.cp.ms); text(univ.tree.cp.ms)
# The tree doesn't change in this case, but for a small-sample problem, it could make a difference.

# A silly 'minsplit' rule:
univ.tree.cp.ms <- rpart(GPA ~ ACT+HSrank, data=univadmiss, minsplit=300)
plot(univ.tree.cp.ms); text(univ.tree.cp.ms)

##### Relative Error vs. cp plot:
plotcp(univ.tree.best)
## Where is the "elbow" in the plot?

# Predictions can then be done as above, based on whatever tree we select.
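## A small added sketch (not in the original script): the percentile limits above
## can be complemented by the "basic" (reflected) bootstrap interval, which
## re-centers the bootstrap quantiles around the full-data estimate:
## [2*yhat - q(1-alpha/2), 2*yhat - q(alpha/2)].
## This reuses the yhat.star.vec draws from the bootstrap loop above.
x.val <- c(ACT=26, HSrank=90)
alpha <- 0.05
yhat.hat <- predict(univ.tree, data.frame(t(x.val)))  # full-data point estimate
L.b <- 2*yhat.hat - quantile(yhat.star.vec, 1-alpha/2)
U.b <- 2*yhat.hat - quantile(yhat.star.vec, alpha/2)
print(paste("Basic-method 95% CI for E(Y):", round(L.b,4), round(U.b,4)))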
#
#################################
#
## Random Forests
#
#################################

# loading the 'randomForest' package:
library(randomForest)

##
## Regression with a random forest:
##

library(MASS)

# loading the 'Boston' dataset:
data(Boston)

bos.rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                       data=Boston)
# The default 'ntree' is 500.
# The default 'mtry' is p/3 for regression and sqrt(p) for classification,
# where 'p' is the number of predictors.
# These can be adjusted if desired:
# bos.rf2 <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
#                         data=Boston, ntree=515, mtry=5)

print(bos.rf)

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
# dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:
x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
           dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.rf, data.frame(t(x.val)))

# Seeing how important each explanatory variable was
# (the higher the number, the more important):
bos.rf$importance

#### The University Admissions regression with a random forest:

univadmiss <- read.table(file = "http://people.stat.sc.edu/Hitchcock/universityadmissions.txt",
                         header=FALSE, col.names = c('student', 'GPA', 'HSrank', 'ACT', 'Year'))

univ.rf <- randomForest(GPA ~ ACT+HSrank, data=univadmiss)

# Predicting the freshman GPA for an observation with ACT=26, HSrank=90:
x.val <- c(ACT=26, HSrank=90)
predict(univ.rf, data.frame(t(x.val)))
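## A small added sketch (not in the original script): tuning 'mtry' via the
## out-of-bag error. For regression forests, the fitted object's 'mse' component
## stores the OOB mean squared error after each successive tree, so a few
## candidate 'mtry' values can be compared without a separate test set.
## The candidates 2:8 and the seed are arbitrary choices for illustration.
set.seed(202)  # arbitrary seed, for repeatability
mtry.vals <- 2:8
oob.mse <- sapply(mtry.vals, function(m) {
  rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                     data=Boston, mtry=m)
  rf$mse[rf$ntree]  # OOB MSE after the final tree
})
plot(mtry.vals, oob.mse, type="b", xlab="mtry", ylab="OOB MSE")

## The importance values printed above also have a built-in dotchart:
varImpPlot(bos.rf)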