#################################
#
## Regression Trees
#
#################################

# load the 'rpart' package:
library(rpart)

# load the 'MASS' package:
library(MASS)

# loading the 'Boston' dataset:
data(Boston)
# Sample of 506 tracts in suburban Boston

# attaching the 'Boston' dataset:
# attach(Boston)

# examining a scatterplot matrix:
# pairs(Boston, pch=".")

# Goal: Create a regression tree that predicts the median value of a home
# (medv, measured in thousands of dollars) based on 13 possible explanatory variables.

## The variables:
# crim: per capita crime rate by town.
# zn: proportion of residential land zoned for lots over 25,000 sq. ft.
# indus: proportion of non-retail business acres per town.
# chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox: nitrogen oxides concentration (parts per 10 million).
# rm: average number of rooms per dwelling.
# age: proportion of owner-occupied units built prior to 1940.
# dis: weighted mean of distances to five Boston employment centres.
# rad: index of accessibility to radial highways.
# tax: full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town.
# lstat: lower status of the population (percent).
# medv: median value of owner-occupied homes in $1000s.

# fitting a regression tree, with default settings:
bos.tree <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# basic summary output:
print(bos.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):
plot(bos.tree); text(bos.tree)
## At each split: if the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):
plot(bos.tree, compress=T, uniform=T, branch=0.4); text(bos.tree)

## Diagnostics based on residuals:
plot(predict(bos.tree), residuals(bos.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(bos.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predicting the median home value for an observation that is "typical"
# (at the median) in all explanatory variables:
x.med <- apply(Boston[,-14], 2, median)
predict(bos.tree, data.frame(t(x.med)))

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
# dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:
x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
           dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.tree, data.frame(t(x.val)))

#################################################
##
## Tree Pruning (Selection)
##
#################################################

# cp is the complexity parameter: the ratio of the penalty lambda to the RSS
# of the root (no-split) tree.
# A larger cp encourages a less complex tree;
# a smaller cp allows a tree with many splits and branches.

# Listing all trees with cp of at least 0.001:
bos.tree.cp <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                     data=Boston, cp=0.001)
printcp(bos.tree.cp)

# Selecting the cp that minimizes the cross-validated error 'xerror':
bos.tree.best <- prune.rpart(bos.tree.cp, cp=0.0022459)

# Plotting the 'best' tree:
plot(bos.tree.best); text(bos.tree.best)
## This is a more complex solution than before (too complex?).

##### A simpler tree (with a higher cp):
bos.tree.simp <- prune.rpart(bos.tree.cp, cp=0.0361643)

# Plotting this tree:
plot(bos.tree.simp); text(bos.tree.simp)

##### Relative Error vs. cp plot:
plotcp(bos.tree.best)
## Where is the "elbow" in the plot?
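## A small added sketch (not in the original script): selecting cp programmatically.
## The minimizing cp above (0.0022459) was read off the printcp() table by hand;
## it can instead be pulled from the fitted object's cptable. Note that 'xerror'
## comes from randomized cross-validation, so the selected cp can vary from run
## to run unless a seed is set before fitting.
cp.tab <- bos.tree.cp$cptable
best.cp <- cp.tab[which.min(cp.tab[, "xerror"]), "CP"]
bos.tree.auto <- prune.rpart(bos.tree.cp, cp=best.cp)

## A common alternative is the "one-SE rule": choose the simplest tree whose
## xerror is within one standard error (the 'xstd' column) of the minimum.
min.row <- which.min(cp.tab[, "xerror"])
thresh <- cp.tab[min.row, "xerror"] + cp.tab[min.row, "xstd"]
cp.1se <- cp.tab[min(which(cp.tab[, "xerror"] <= thresh)), "CP"]  # cptable rows run from simplest to most complex
bos.tree.1se <- prune.rpart(bos.tree.cp, cp=cp.1se)
plot(bos.tree.1se); text(bos.tree.1se)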
# Predictions can then be done as above, based on whatever tree we select.

################################################
##
## University admissions data
##
################################################

# load the 'rpart' package:
library(rpart)

# load the 'MASS' package:
library(MASS)

# reading in the 'univadmiss' dataset from Appendix C.4 of the book:
univadmiss <- read.table(file = "http://people.stat.sc.edu/Hitchcock/universityadmissions.txt",
                         header=FALSE, col.names = c('student', 'GPA', 'HSrank', 'ACT', 'Year'))
# Sample of 705 university freshmen

# attaching the 'univadmiss' dataset:
# attach(univadmiss)

# examining a scatterplot matrix:
# pairs(univadmiss, pch=".")

## The variables:
# student: student identification number.
# GPA: grade point average at the end of the freshman year.
# HSrank: high school class rank (as a percentile).
# ACT: ACT entrance examination score.
# Year: year the student entered the university.

# fitting a regression tree, with default settings:
univ.tree <- rpart(GPA ~ ACT+HSrank, data=univadmiss)

# basic summary output:
print(univ.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):
plot(univ.tree); text(univ.tree)
## At each split: if the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):
plot(univ.tree, compress=T, uniform=T, branch=0.4); text(univ.tree)

## Diagnostics based on residuals:
plot(predict(univ.tree), residuals(univ.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(univ.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predicting the freshman GPA for an observation with ACT=26, HSrank=90:
x.val <- c(ACT=26, HSrank=90)
predict(univ.tree, data.frame(t(x.val)))

### A bootstrap interval for this estimated mean GPA value with ACT=26, HSrank=90:
### Random-X resampling:
B <- 1000
yhat.star.vec <- rep(0, times=B)
x.val <- c(ACT=26, HSrank=90)
for (i in 1:B) {
  # resample the rows with replacement, refit the tree, and save the prediction:
  boot.samp <- univadmiss[sample(1:nrow(univadmiss), replace=T), ]
  univ.tree.boot <- rpart(GPA ~ ACT+HSrank, data=boot.samp)
  yhat.star.vec[i] <- predict(univ.tree.boot, data.frame(t(x.val)))
}
hist(yhat.star.vec)

alpha <- 0.05
# Percentile-method 95% CI for E(Y_h):
L.p <- quantile(yhat.star.vec, alpha/2)
U.p <- quantile(yhat.star.vec, 1-alpha/2)
print(paste("Percentile-method 95% CI for E(Y):", round(L.p,4), round(U.p,4)))

#################################################
##
## Tree Pruning (Selection)
##
#################################################

# Listing all trees with cp of at least 0.001:
univ.tree.cp <- rpart(GPA ~ ACT+HSrank, data=univadmiss, cp=0.001)
printcp(univ.tree.cp)

# Selecting the cp that minimizes the cross-validated error 'xerror':
univ.tree.best <- prune.rpart(univ.tree.cp, cp=0.0041288)

# Plotting the 'best' tree:
plot(univ.tree.best); text(univ.tree.best)
## This is a more complex solution than before (too complex?).

##### A simpler tree (with a higher cp):
univ.tree.simp <- prune.rpart(univ.tree.cp, cp=0.0068851)

# Plotting this tree:
plot(univ.tree.simp); text(univ.tree.simp)

# The 'minsplit' argument sets "the minimum number of observations that must
# exist in a node in order for a split to be attempted".
# The default minsplit is 20, but this can be adjusted:
univ.tree.cp.ms <- rpart(GPA ~ ACT+HSrank, data=univadmiss, minsplit=10)
plot(univ.tree.cp.ms); text(univ.tree.cp.ms)
# The tree doesn't change in this case, but for a small-sample problem, it could make a difference.

# A silly 'minsplit' rule:
univ.tree.cp.ms <- rpart(GPA ~ ACT+HSrank, data=univadmiss, minsplit=300)
plot(univ.tree.cp.ms); text(univ.tree.cp.ms)

##### Relative Error vs. cp plot:
plotcp(univ.tree.best)
## Where is the "elbow" in the plot?

# Predictions can then be done as above, based on whatever tree we select.
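## A small added sketch (not in the original script): the percentile limits above
## can be complemented by the "basic" (reflected) bootstrap interval, which
## re-centers the bootstrap quantiles around the full-data estimate:
## [2*yhat - q(1-alpha/2), 2*yhat - q(alpha/2)].
## This reuses the yhat.star.vec draws from the bootstrap loop above.
x.val <- c(ACT=26, HSrank=90)
alpha <- 0.05
yhat.hat <- predict(univ.tree, data.frame(t(x.val)))  # full-data point estimate
L.b <- 2*yhat.hat - quantile(yhat.star.vec, 1-alpha/2)
U.b <- 2*yhat.hat - quantile(yhat.star.vec, alpha/2)
print(paste("Basic-method 95% CI for E(Y):", round(L.b,4), round(U.b,4)))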
#
#################################
#
## Random Forests
#
#################################

# loading the 'randomForest' package:
library(randomForest)

##
## Regression with a random forest:
##

library(MASS)

# loading the 'Boston' dataset:
data(Boston)

bos.rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                       data=Boston)
# The default 'ntree' is 500.
# The default 'mtry' is p/3 for regression and sqrt(p) for classification,
# where 'p' is the number of predictors.
# These can be adjusted if desired:
# bos.rf2 <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
#                         data=Boston, ntree=515, mtry=5)

print(bos.rf)

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
# dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:
x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3,
           dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.rf, data.frame(t(x.val)))

# Seeing how important each explanatory variable was
# (the higher the number, the more important):
bos.rf$importance

#### The University Admissions regression with a random forest:

univadmiss <- read.table(file = "http://people.stat.sc.edu/Hitchcock/universityadmissions.txt",
                         header=FALSE, col.names = c('student', 'GPA', 'HSrank', 'ACT', 'Year'))

univ.rf <- randomForest(GPA ~ ACT+HSrank, data=univadmiss)

# Predicting the freshman GPA for an observation with ACT=26, HSrank=90:
x.val <- c(ACT=26, HSrank=90)
predict(univ.rf, data.frame(t(x.val)))
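## A small added sketch (not in the original script): tuning 'mtry' via the
## out-of-bag error. For regression forests, the fitted object's 'mse' component
## stores the OOB mean squared error after each successive tree, so a few
## candidate 'mtry' values can be compared without a separate test set.
## The candidates 2:8 and the seed are arbitrary choices for illustration.
set.seed(202)  # arbitrary seed, for repeatability
mtry.vals <- 2:8
oob.mse <- sapply(mtry.vals, function(m) {
  rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat,
                     data=Boston, mtry=m)
  rf$mse[rf$ntree]  # OOB MSE after the final tree
})
plot(mtry.vals, oob.mse, type="b", xlab="mtry", ylab="OOB MSE")

## The importance values printed above also have a built-in dotchart:
varImpPlot(bos.rf)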