#################################
#
## k-nearest neighbors
#
#################################

# load the 'class' package:

library(class)

# load the 'ISLR' package:

library(ISLR)

# Alternatively, reading in the 'Caravan' data set from the web:

Caravan <- read.table("http://people.stat.sc.edu/hitchcock/Caravan_data.txt", header=T)

attach(Caravan)

# This data set has 5822 observations and 86 variables!
# We will try to predict whether a person will purchase insurance, based on the 85 other variables.

# We will treat the first 1000 rows as the "test" data and the other 4822 rows as the training data.

# Standardizing using the 'scale' function:

Caravan.X <- Caravan[,-86]   # removing the categorical response variable, "Purchase"
Caravan.X.std <- scale(Caravan.X)

# Defining the training data:

Purchase.training <- Purchase[-(1:1000)]
X.training <- Caravan.X.std[-(1:1000),]

# Defining the test data:

X.test <- Caravan.X.std[1:1000,]

# Using KNN with K=1 to predict 'Purchase' on the test data:

knn.pred <- knn(train = X.training, test = X.test, cl = Purchase.training, k=1)

# Seeing the classifications for the 1000 test customers:

print(knn.pred)

# Since we actually know which of the first 1000 customers purchased insurance, we can see how well we did:

Purchase.test <- Purchase[1:1000]

# misclassification rate for the 1000 test customers:

mean(Purchase.test != knn.pred)

# It is around 12%.

### On the SAT/graduation data set:

# We can read the SAT scores (with graduation information) data from the Internet:

satgradu <- read.table("http://people.stat.sc.edu/hitchcock/satgradu.txt", header=T)

# If not connected to the Internet, we could save the file and read it
# similarly to the following:
# satgradu <- read.table("Z:/stat_530/satgradu.txt", header=T)

# The first three columns are the SAT scores on the 3 tests;
# the last column (called gradu) is an indicator of whether the student successfully graduated
# (1 = graduated, 0 = did not graduate).

attach(satgradu)

X.train.sat <- satgradu[,c('math','reading','writing')]

# Making predictions for several new individuals at once:

newobs <- rbind( c(300,420,280), c(510,480,470), c(780,760,710) )
dimnames(newobs) <- list(NULL, c('math','reading','writing'))
newobs <- data.frame(newobs)

# KNN with various values of K:

dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=1, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=4, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=10, prob=TRUE)
print(dis.knn)
dis.knn <- knn(train = X.train.sat, test = newobs, cl = gradu, k=20, prob=TRUE)
print(dis.knn)

##### Misclassification rate of the KNN classification rule:

# Simple plug-in misclassification rate (for KNN with K=4):

group <- knn(train = X.train.sat, test = X.train.sat, cl = gradu, k=4)
table(group, gradu)

# The plug-in misclassification rate for KNN here is (2+8)/40 = 0.25.
# The plug-in misclassification rate for LDA here was (11+4)/40 = 0.375.
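# The plug-in rates above reuse the training data for evaluation, so they tend to be
# optimistic (each training point counts among its own nearest neighbors).  As a
# minimal leave-one-out check, one could use knn.cv() from the 'class' package --
# a sketch, reusing the X.train.sat and gradu objects defined above:

group.cv <- knn.cv(train = X.train.sat, cl = gradu, k=4)

# leave-one-out misclassification rate for KNN with K=4:

mean(group.cv != gradu)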
#################################
#
## Regression Trees
#
#################################

# load the 'rpart' package:

library(rpart)

# load the 'MASS' package:

library(MASS)

# loading the 'Boston' dataset:

data(Boston)

# Sample of 506 tracts in suburban Boston

# attaching the 'Boston' dataset:
# attach(Boston)

# examining a scatterplot matrix:
# pairs(Boston, pch=".")

# Goal: Create a regression tree that will predict the median value of a home (medv, measured in thousands of dollars)
# based on 13 possible explanatory variables:

## The variables:
# crim: per capita crime rate by town.
# zn: proportion of residential land zoned for lots over 25,000 sq.ft.
# indus: proportion of non-retail business acres per town.
# chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox: nitrogen oxides concentration (parts per 10 million).
# rm: average number of rooms per dwelling.
# age: proportion of owner-occupied units built prior to 1940.
# dis: weighted mean of distances to five Boston employment centres.
# rad: index of accessibility to radial highways.
# tax: full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# lstat: lower status of the population (percent).
# medv: median value of owner-occupied homes in $1000s.

# fitting a regression tree, with default settings:

bos.tree <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# basic summary output:

print(bos.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):

plot(bos.tree); text(bos.tree)

## At each split: If the condition is TRUE, proceed to the LEFT branch.

# plotting the graph of the tree (with uniform spacing):

plot(bos.tree, compress=T, uniform=T, branch=0.4); text(bos.tree)

## Diagnostics based on residuals:

plot(predict(bos.tree), residuals(bos.tree), xlab='Predicted', ylab='Residuals')
qqnorm(residuals(bos.tree), main='Normal Q-Q plot of Residuals')

#### Predictions:

# Predict the median home value for an observation that is "typical" in all explanatory variables:

x.med <- apply(Boston[,-14], 2, median)
predict(bos.tree, data.frame(t(x.med)))

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:

x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.tree, data.frame(t(x.val)))

#################################################
##
##  Tree Pruning (Selection)
##
#################################################

# cp = the ratio of lambda to the RSS of the no-branch (root) tree
# A larger cp encourages a less complex tree
# A smaller cp allows a tree with lots of splits and branches

# Listing all trees with cp of at least 0.001:

bos.tree.cp <- rpart(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston, cp=0.001)
printcp(bos.tree.cp)

# Selecting the cp that minimizes the 'xerror':

bos.tree.best <- prune.rpart(bos.tree.cp, cp=0.0011373)

# Plotting the 'best' tree:

plot(bos.tree.best); text(bos.tree.best)

## This is a more complex solution than before (too complex?).

##### A simpler tree (with higher cp):

bos.tree.simp <- prune.rpart(bos.tree.cp, cp=0.0361643)

# Plotting this tree:

plot(bos.tree.simp); text(bos.tree.simp)

##### Relative Error vs. cp plot:

plotcp(bos.tree.best)

## Where is the "elbow" in the plot?
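# Rather than copying the minimizing cp value from the printcp() output by hand,
# it can be pulled from the fitted object's 'cptable' component -- a sketch
# (the 'xerror' column comes from cross-validation, so the selected cp may
# vary from run to run):

cp.min <- bos.tree.cp$cptable[ which.min(bos.tree.cp$cptable[,"xerror"]), "CP" ]
bos.tree.auto <- prune.rpart(bos.tree.cp, cp=cp.min)
plot(bos.tree.auto); text(bos.tree.auto)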
#
# Predictions can then be done as above, based on whatever tree we select.
#

##### A medium-sized tree (with cp around the elbow):

bos.tree.medium <- prune.rpart(bos.tree.cp, cp=0.0158512)

# Plotting this tree:

plot(bos.tree.medium); text(bos.tree.medium)

# Basically looks like the one R originally gave us using default settings.

#################################
#
## Classification Trees
#
#################################

# load the 'rpart' package:

library(rpart)

### Example:

# Reading in the hsb data:

hsb <- read.table("http://people.stat.sc.edu/hitchcock/hsbdata.txt", header=T)

attach(hsb)

# A classification tree to classify the students into one of three program types (academic, vocational, and general)
# based on numerous predictor variables (gender, race, ses, schtyp, read, write, math, science, socst).

tree.prog <- rpart(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb, method="class")

# Summary of object:

tree.prog

# Plotting this tree:

plot(tree.prog); text(tree.prog)

# Predicting the program type for a student with
# gender='female', race='white', ses='low', schtyp='public', read=55, write=51, math=44, science=49, socst=56:

# Since some predictors have "character" values and some predictors have numeric values,
# we must put the values directly into a data frame rather than creating a vector first:

predict(tree.prog, data.frame(gender='female', race='white', ses='low', schtyp='public', read=55, write=51, math=44, science=49, socst=56))

#
# Similar tree pruning/selection as with regression trees could be done, if desired.
#

##### Misclassification rates of the LDA rule vs. the classification tree:

# Simple plug-in misclassification rate of the LDA rule here:

library(MASS)

# assuming equal prior probabilities for the three program types:

dis <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb, prior=c(1/3, 1/3, 1/3))
group <- predict(dis, hsb, method='plug-in')$class
tab.lda.eq <- table(group, prog)

# The plug-in misclassification rate for LDA here with equal prior probabilities is:

1 - ( sum(diag(tab.lda.eq)) / sum(tab.lda.eq) )
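# Since plug-in rates evaluate the rule on the same data used to fit it, a less
# optimistic estimate comes from leave-one-out cross-validation, available in
# lda() via CV=TRUE -- a sketch, reusing the formula and equal priors from above:

dis.cv <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst,
              data=hsb, prior=c(1/3, 1/3, 1/3), CV=TRUE)

# leave-one-out misclassification rate for the equal-priors LDA rule:

mean(dis.cv$class != prog)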
# If we do not specify any prior probabilities, lda() will by default use the proportions
# of the sampled individuals that are in each group as the prior probabilities.

dis2 <- lda(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb)
group <- predict(dis2, hsb, method='plug-in')$class
tab.lda.prop <- table(group, prog)

# The plug-in misclassification rate for LDA here with proportional prior probabilities is:

1 - ( sum(diag(tab.lda.prop)) / sum(tab.lda.prop) )

group <- predict(tree.prog, type="class")
tab.tree <- table(group, prog)

# The plug-in misclassification rate for the classification tree is:

1 - ( sum(diag(tab.tree)) / sum(tab.tree) )

## The 'rpart' function will also accept user-specified prior class probabilities
## (by default it uses the proportions of individuals in each group as the prior probabilities):

tree.prog.eq <- rpart(prog ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb,
                      method="class", parms = list(prior = c(1/3, 1/3, 1/3)) )
group <- predict(tree.prog.eq, type="class")
tab.tree.eq <- table(group, prog)

# The plug-in misclassification rate for the classification tree with equal prior probabilities is:

1 - ( sum(diag(tab.tree.eq)) / sum(tab.tree.eq) )

#################################
#
## Random Forests
#
#################################

# loading the 'randomForest' package:

library(randomForest)

##
## Regression with random forest:
##

library(MASS)

# loading the 'Boston' dataset:

data(Boston)

bos.rf <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston)

# The default 'ntree' is 500.
# The default 'mtry' is p/3 for regression and sqrt(p) for classification, where 'p' is the number of predictors.
# These can be adjusted if desired:
# bos.rf2 <- randomForest(medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, data=Boston, ntree=515, mtry=5)

print(bos.rf)

# Predicting the median home value for an observation with
# crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5:

x.val <- c(crim=2, zn=12.5, indus=13, chas=0, nox=0.5, rm=5.2, age=66.3, dis=4.2, rad=4, tax=300, ptratio=18, black=390, lstat=15.5)
predict(bos.rf, data.frame(t(x.val)))

# Seeing how important each explanatory variable was:
# (the higher the number, the more important)

bos.rf$importance
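# The 'mtry' value could also be chosen based on out-of-bag (OOB) error using
# tuneRF() from the 'randomForest' package -- a sketch (OOB error is random,
# so the selected 'mtry' may vary from run to run; the seed is assumed here
# only for reproducibility):

set.seed(1)
tuneRF(Boston[,-14], Boston$medv, ntreeTry=500, stepFactor=1.5, improve=0.01)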
##
## Classification with random forest:
##

# Classifying the students into one of three program types (academic, vocational, and general)
# based on numerous predictor variables (gender, race, ses, schtyp, read, write, math, science, socst).

# Note: randomForest() has no 'method' argument; it fits a classification forest
# automatically because the response is coerced to a factor:

prog.rf <- randomForest(as.factor(prog) ~ gender+race+ses+schtyp+read+write+math+science+socst, data=hsb)

print(prog.rf)

# Seeing how important each explanatory variable was in the classification:
# (the higher the number, the more important)

prog.rf$importance

# With the Egyptian skulls data from Table 5.8:

skulls <- read.table("http://www.stat.sc.edu/~hitchcock/skullschap7.txt", header=T)

attach(skulls)

skull.rf <- randomForest(as.factor(EPOCH) ~ MB+BH+BL+NH)
skull.rf

# Let's predict the epoch of a new skull with
# MB = 135, BH = 144, BL = 97, NH = 53:

newobs <- rbind( c(135,144,97,53) )
dimnames(newobs) <- list(NULL, c('MB','BH','BL','NH'))
newobs <- data.frame(newobs)

predict(skull.rf, newdata=newobs, type='response')
predict(skull.rf, newdata=newobs, type='prob')

# Seeing how important each explanatory variable was in the classification:
# (the higher the number, the more important)

skull.rf$importance

## Specifying prior class probabilities via 'classwt':

skull.rf.eq <- randomForest(as.factor(EPOCH) ~ MB+BH+BL+NH, classwt=c(0.2,0.2,0.2,0.2,0.2))
skull.rf.eq

# Let's predict the epoch of a new skull with
# MB = 135, BH = 144, BL = 97, NH = 53:

newobs <- rbind( c(135,144,97,53) )
dimnames(newobs) <- list(NULL, c('MB','BH','BL','NH'))
newobs <- data.frame(newobs)

predict(skull.rf.eq, newdata=newobs, type='response')
predict(skull.rf.eq, newdata=newobs, type='prob')

# Recall:
# skull.lda <- lda(as.factor(EPOCH) ~ MB+BH+BL+NH, prior=c(0.2,0.2,0.2,0.2,0.2) )
# predict(skull.lda, newdata=newobs)
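# As a less optimistic counterpart to the plug-in rates seen earlier, each fitted
# classification forest stores an out-of-bag (OOB) confusion matrix -- a sketch
# comparing the default forest with the equal-classwt forest:

skull.rf$confusion
skull.rf.eq$confusion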