#### HW 6 solutions, Fall 2022 # Problem 1 (a): bumpbird <- read.table("http://www.stat.sc.edu/~hitchcock/bumpusbird.txt", header=T) names(bumpbird) <- c("ID", "tot.length", "alar.length", "beak.head.length", "humerus.length", "keel.stern.length") attach(bumpbird) bumpbird.numeric <- bumpbird[,-1] bumpbird.IDs <- bumpbird[,1] survival.indicator <- as.factor(c(rep("survived",times=21),rep("died",length=28))) # Using the built-in lda function in the MASS package # for linear discriminant analysis: library(MASS) # assuming equal prior probabilities of surviving or not: dis <- lda(survival.indicator ~ tot.length+alar.length+beak.head.length+humerus.length+keel.stern.length, data=bumpbird, prior=c(0.5, 0.5)) # Let's predict whether a new bird with # measurements of: tot.length=156, alar.length=242, beak.head.length=31.4, humerus.length=18.1, keel.stern.length=19.4 # will survive: newobs <- rbind( c(155, 238, 31.2, 17.9, 19.3) ) dimnames(newobs) <- list(NULL,c("tot.length", "alar.length", "beak.head.length", "humerus.length", "keel.stern.length")) newobs <- data.frame(newobs) predict(dis,newdata=newobs)$class # Posterior probabilities of this bird being in each group: predict(dis,newdata=newobs)$posterior ## problem 1(b) ##### Misclassification rate of LDA rule: # Simple plug-in misclassification rate: group<-predict(dis, bumpbird, method='plug-in')$class table(group,survival.indicator) # The plug-in misclassification rate for LDA here is 17/49 = 0.347. # cross-validation rate of LDA rule: ######## correct<-rep(0,times=nrow(bumpbird.numeric) ) for (j in 1:nrow(bumpbird.numeric) ) { mydis<-lda(grouping=survival.indicator[-j], x=bumpbird.numeric[-j,1:5], prior=c(0.5,0.5)) mypred<-predict(mydis,newdata=bumpbird.numeric[j,1:5])$class correct[j] <- (mypred==survival.indicator[j]) } cv.misclass <- 1-mean(correct) cv.misclass ######### # The CV rate is 0.551. # assuming default prior probabilities: dis2 <- lda(survival.indicator ~ tot.length+alar.length+beak.head.length+humerus.length+keel.stern.length, data=bumpbird) # Let's predict whether a new bird with # measurements of: tot.length=156, alar.length=242, beak.head.length=31.4, humerus.length=18.1, keel.stern.length=19.4 # will survive: newobs <- rbind( c(155, 238, 31.2, 17.9, 19.3) ) dimnames(newobs) <- list(NULL,c("tot.length", "alar.length", "beak.head.length", "humerus.length", "keel.stern.length")) newobs <- data.frame(newobs) predict(dis2,newdata=newobs)$class # Posterior probabilities of this bird being in each group: predict(dis2,newdata=newobs)$posterior ## problem 1(d) ##### Misclassification rate of LDA rule: # Simple plug-in misclassification rate: group<-predict(dis2, bumpbird, method='plug-in')$class table(group,survival.indicator) # The plug-in misclassification rate for LDA here is 18/49 = 0.367. # cross-validation rate of LDA rule: ######## correct<-rep(0,times=nrow(bumpbird.numeric) ) for (j in 1:nrow(bumpbird.numeric) ) { mydis<-lda(grouping=survival.indicator[-j], x=bumpbird.numeric[-j,1:5]) mypred<-predict(mydis,newdata=bumpbird.numeric[j,1:5])$class correct[j] <- (mypred==survival.indicator[j]) } cv.misclass <- 1-mean(correct) cv.misclass ######### # The CV rate is 0.469. ########################################################################################### ### problem 2: skulls <- read.table("http://people.stat.sc.edu/hitchcock/skullschap7.txt", header=T) attach(skulls) library(rpart) tree.sk.eq <- rpart(EPOCH ~ MB+BH+BL+NH, data=skulls, method="class", parms = list(prior = c(1/5, 1/5, 1/5, 1/5, 1/5)) ) # Plotting this tree: plot(tree.sk.eq); text(tree.sk.eq) group<-predict(tree.sk.eq, type="class") tab.tree.eq <- table(group,EPOCH) # The plug-in misclassification rate for the classification tree is: 1-( sum(diag(tab.tree.eq)) / sum(tab.tree.eq) ) # Let's predict the epoch of a new skull with # MB = 133, BH = 130, BL = 95, NH = 50: predict(tree.sk.eq, data.frame(MB = 133, BH = 130, BL = 95, NH = 50)) ## Part (b) # loading the 'randomForest' package: library(randomForest) ## Specifying prior probabilities: skull.rf.eq <- randomForest(as.factor(EPOCH)~MB+BH+BL+NH, method="class", classwt=c(0.2,0.2,0.2,0.2,0.2) ) skull.rf.eq # Let's predict the epoch of a new skull with # MB = 133.0, BH = 130.0, BL = 95.0, NH = 50.0 newobs <- rbind( c(133,130,95,50) ) dimnames(newobs) <- list(NULL,c('MB','BH', 'BL', 'NH')) newobs <- data.frame(newobs) predict(skull.rf.eq,newdata=newobs, type='response') predict(skull.rf.eq,newdata=newobs, type='prob') # Seeing how important each explanatory variable was in the classification: # (the higher the number, the more important) skull.rf.eq$importance # K-nearest neighbors: newobs <- rbind( c(133,130,95,50) ) dimnames(newobs) <- list(NULL,c('MB','BH', 'BL', 'NH')) newobs <- data.frame(newobs) library(class) knn.pred = knn(train = skulls[,-1], test = newobs, cl = skulls[,1], k=3) # Change k to 3, 5, 9 print(knn.pred) ######################################################################## ### Problem 3 USairpol.full <- read.table("http://people.stat.sc.edu/hitchcock/usair.txt", header=T) city.names <- as.character(USairpol.full[,1]) USairpol.data <- USairpol.full[,-1] USairpol.data$Temp <- (-USairpol.data$Temp) attach(USairpol.data) # load the 'rpart' package: library(rpart) # load the 'MASS' package: library(MASS) # fitting a regression tree, with default settings: usairpol.tree <- rpart(SO2 ~ Temp+Manuf+Pop+Wind+Precip+Days, data=USairpol.data) # basic summary output: print(usairpol.tree) # plotting the graph of the tree (spacing proportional to improvement in fit): plot(usairpol.tree); text(usairpol.tree) x.val <- c(Temp=60, Manuf=390, Pop=500, Wind=8.5, Precip=45, Days=110) predict(usairpol.tree, data.frame(t(x.val))) ## Random Forest: library(randomForest) ## ## Regression with random forest: ## library(MASS) usairpol.rf <- randomForest(SO2 ~ Temp+Manuf+Pop+Wind+Precip+Days, data=USairpol.data) x.val <- c(Temp=60, Manuf=390, Pop=500, Wind=8.5, Precip=45, Days=110) predict(usairpol.rf, data.frame(t(x.val))) # Seeing how important each explanatory variable was: # (the higher the number, the more important) usairpol.rf$importance library(caret) my.k <- 3 # Change this to 5 and 10 also # KNN Regression with multiple predictors: usairpol.reg.knn <- caret::knnreg(SO2 ~ Temp+Manuf+Pop+Wind+Precip+Days, data=USairpol.data, k=my.k) x.val <- c(Temp=60, Manuf=390, Pop=500, Wind=8.5, Precip=45, Days=110) predict(usairpol.reg.knn, data.frame(t(x.val))) # To check model fit: Plot predicted y-values against actual y-values: plot(SO2, predict(usairpol.reg.knn), xlab="y", ylab=expression(hat(y))) abline(a=0,b=1) #adds a 45-degree line to the plot ######################################################################## ### Problem 4 sidsdata <- read.table("http://www.stat.sc.edu/~hitchcock/SIDSdata.txt", header=T) attach(sidsdata) library(MASS) # load the 'class' package: library(class) # load the 'ISLR' package: library(ISLR) library(e1071) X.train <- cbind(HR,BW,Factor68,Gesage) y.train <- Group # Creating the SVC: dat=data.frame(x=X.train, y=as.factor(y.train)) tune.out <- tune(svm, y ~ ., data=dat, kernel='radial', scale=TRUE, probability=TRUE, ranges=list(cost=c(.001,.01,.1,1,5,10,100,1000), gamma=c(0.5,1,2,3,4) )) summary(tune.out) # cost=0.001 and gamma=5 works well. svmfit <- svm(y ~ ., data=dat, kernel='radial', scale=TRUE, probability=TRUE, cost=.001, gamma=5) # The points plotted as "x" are the support vectors. The other points are plotted as "o". # HR = 100, BW = 3000, Factor68 = 0.3, Gesage = 40 newobs <- rbind( c(100,3000,0.3,40) ) dimnames(newobs) <- list(NULL,c('HR','BW','Factor68','Gesage')) testdat <- data.frame(x=newobs) predict(svmfit,testdat,probability=TRUE)