# Example R code: Support Vector Classifier (SVC) and Support Vector Machine
# (SVM).
#
# A baseball example that uses ONLY two dimensions (OPS and WHIP) to predict
# winning status.

library(e1071)

# Read the 2015 team batting and pitching tables and join them on the team
# identifier column "Tm".
bat2015 <- read.csv(
  "http://www.stat.sc.edu/~hitchcock/baseball2015batting.txt",
  header = TRUE
)
pitch2015 <- read.csv(
  "http://www.stat.sc.edu/~hitchcock/baseball2015pitching.txt",
  header = TRUE
)
baseball2015 <- merge(bat2015, pitch2015, by = "Tm")

# OPS2015 and WHIP2015 are numerical measures of batting and pitching
# performance, respectively.
# winning is an indicator of whether the team had a winning record that year
# (WLP above 0.5).
winning <- (baseball2015$WLP > 0.5)
team <- baseball2015$Tm
OPS2015 <- baseball2015$OPS
WHIP2015 <- baseball2015$WHIP
baseball2015class <- data.frame(team, winning, OPS2015, WHIP2015)

# No attach() is needed here: winning, OPS2015 and WHIP2015 are already
# ordinary variables, so the predictor matrix can be built from them directly.
X.train <- cbind(OPS2015, WHIP2015)
y.train <- winning

# The 2:1 is just to switch which variable goes on which axis.
# 3 - y.train gives colour index 2 for winning teams and 3 for the others.
plot(X.train[, 2:1], col = (3 - y.train))

# Creating the SVC:
# Passing the named matrix as x yields the predictor columns x.OPS2015 and
# x.WHIP2015; the response is stored as a factor so svm() does classification.
dat <- data.frame(x = X.train, y = as.factor(y.train))

# Picking the cost parameter by cross-validation:
# NOTE(review): tune() partitions the data at random, so the ranking of the
# cost values can change between runs; consider set.seed() beforehand if a
# reproducible summary is needed.
tune.out <- tune(
  svm, y ~ .,
  data = dat,
  kernel = 'linear',
  scale = TRUE,
  probability = TRUE,
  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100, 1000))
)
summary(tune.out)

# The cost=5, 10, 100, or 1000 perform well.

# scale=TRUE tells the function to scale the variables to have mean 0 and
# variance 1.
svcfit <- svm(
  y ~ .,
  data = dat,
  kernel = 'linear',
  cost = 10,
  scale = TRUE,
  probability = TRUE
)

plot(svcfit, dat)

# The points plotted as "x" are the support vectors. The other points are
# plotted as "o".
# The points in the light tan region will be assigned to Y=0 (not winning).
# The points in the maroon region will be assigned to Y=1 (winning).
# We see three points in the training data are misclassified
# (the black point in the maroon region and the red points in the tan region).
# Making predictions for several new individuals at once (linear SVC):
# Each row is one new team, given as (OPS, WHIP).
newobs <- rbind(
  c(.760, 1.30),
  c(.700, 1.40),
  c(.760, 1.25)
)
# The column names must match the training predictors so that
# data.frame(x = newobs) produces the columns x.OPS2015 and x.WHIP2015.
colnames(newobs) <- c('OPS2015', 'WHIP2015')
testdat <- data.frame(x = newobs)

# probability=TRUE asks predict() to also report class probabilities
# (the model was fitted with probability=TRUE).
predict(svcfit, testdat, probability = TRUE)

############################################
## Now using an SVM with a radial kernel:
#############################################

# Picking the cost and gamma parameters by cross-validation
# (every cost/gamma combination in the grid below is evaluated):
tune.out <- tune(
  svm,
  y ~ .,
  data = dat,
  kernel = 'radial',
  scale = TRUE,
  probability = TRUE,
  ranges = list(
    cost = c(.001, .01, .1, 1, 5, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)
summary(tune.out)

# cost=10 and gamma=1 works well.
svmfit <- svm(
  y ~ .,
  data = dat,
  kernel = 'radial',
  scale = TRUE,
  probability = TRUE,
  cost = 10,
  gamma = 1
)

plot(svmfit, dat)

# The points plotted as "x" are the support vectors. The other points are
# plotted as "o".
# The points in the light tan region will be assigned to Y=0 (not winning).
# The points in the maroon region will be assigned to Y=1 (winning).
# We see three points in the training data are misclassified
# (the black point in the maroon region and the red points in the tan region).
# NOTE(review): this description is worded identically to the one given for
# the linear fit; confirm it against the radial-kernel plot.
# Making predictions for several new individuals at once (radial SVM):
# Each row is one new team, given as (OPS, WHIP).
newobs <- rbind(
  c(.760, 1.30),
  c(.700, 1.40),
  c(.760, 1.25)
)
# Column names must match the training predictors so that
# data.frame(x = newobs) produces the columns x.OPS2015 and x.WHIP2015.
dimnames(newobs) <- list(NULL, c('OPS2015', 'WHIP2015'))
testdat <- data.frame(x = newobs)
predict(svmfit, testdat, probability = TRUE)

####################################
# SVM with more than 2 groups:
####################################

# With the Egyptian skulls data from Table 5.8:
skulls <- read.table(
  "http://www.stat.sc.edu/~hitchcock/skullschap7.txt",
  header = TRUE
)

# The skull measurements are taken from the data frame explicitly (with() and
# $) rather than through attach(), so nothing is added to the search path.
# cbind() on the bare column names keeps MB, BH, BL, NH as the matrix column
# names, which become x.MB, x.BH, x.BL, x.NH in dat.
X.train <- with(skulls, cbind(MB, BH, BL, NH))
y.train <- skulls$EPOCH
dat <- data.frame(x = X.train, y = as.factor(y.train))

# Picking the cost and gamma parameters by cross-validation:
# NOTE(review): tune() partitions the data at random, so the selected
# parameters can change between runs; consider set.seed() beforehand if a
# reproducible summary is needed.
tune.out <- tune(
  svm, y ~ .,
  data = dat,
  kernel = 'radial',
  scale = TRUE,
  probability = TRUE,
  ranges = list(
    cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)
summary(tune.out)
summary(tune.out$best.model)

# cost=100 and gamma=0.5 works best.
svmfit.skull <- svm(
  y ~ .,
  data = dat,
  kernel = 'radial',
  scale = TRUE,
  probability = TRUE,
  cost = 100,
  gamma = 0.5
)

# Let's predict the epoch of a new skull with
# MB = 135, BH = 144, BL = 97, NH = 53:
newobs <- rbind(c(135, 144, 97, 53))
dimnames(newobs) <- list(NULL, c('MB', 'BH', 'BL', 'NH'))
testdat <- data.frame(x = newobs)
predict(svmfit.skull, testdat, probability = TRUE)