# R code to analyze the body fat data # using ridge regression # Save the data file into a directory and # use the full path name: bodyfat.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/bodyfatdata.txt", header=FALSE, col.names = c('triceps', 'thigh', 'midarm', 'bodyfat')) # attaching the data frame: attach(bodyfat.data) # must load the MASS package first: library(MASS) # Using R's automatic selection methods to select the biasing constant: # R calls this constant "lambda" select(lm.ridge(bodyfat ~ triceps + thigh + midarm, lambda = seq(0,1,0.001))) # The generalized cross-validation (GCV) criterion says # the optimal biasing constant is .019 bodyfat.ridge.reg <- lm.ridge(bodyfat ~ triceps + thigh + midarm, lambda = .019) # Printing the ridge-regression coefficient estimates for this problem: bodyfat.ridge.reg ########################### # Comparing the ridge-regression fit to the original least-squares fit: # # The X matrix for this problem: X.matrix <- cbind(rep(1,length=length(bodyfat)),triceps, thigh, midarm) # Getting the fitted values for the ridge-regression fit: fitted.vals <- X.matrix %*% c(43.840113, 2.117493, -0.959731, -1.018061) # Getting the SSE for the ridge-regression fit: sse.ridge <- sum((bodyfat-fitted.vals)^2); sse.ridge # The original least-squares fit: bodyfat.reg <- lm(bodyfat ~ triceps + thigh + midarm) summary(bodyfat.reg) # Getting the SSE for the original least-squares fit: sum(resid(bodyfat.reg)^2) # The SSE for the ridge-regression fit is not much higher, which is good. ### Comparing VIFs: #################################### ## # vif <- function(object, ...) UseMethod("vif") vif.default <- function(object, ...) stop("No default method for vif. Sorry.") vif.lm <- function(object, ...) { V <- summary(object)$cov.unscaled Vi <- crossprod(model.matrix(object)) nam <- names(coef(object)) if(k <- match("(Intercept)", nam, nomatch = F)) { v1 <- diag(V)[-k] v2 <- (diag(Vi)[-k] - Vi[k, -k]^2/Vi[k,k]) nam <- nam[-k] } else { v1 <- diag(V) v2 <- diag(Vi) warning("No intercept term detected. Results may surprise.") } structure(v1*v2, names = nam) } # ## ##################################### # VIFs for original estimated coefficients: vif(bodyfat.reg) # VIFs for ridge coefficients (see formula on p. 436 of book): rXX <- cor(cbind(triceps,thigh,midarm)) my.c <- 0.019 I.mat <- diag(rep(1,times=nrow(rXX))) VIF.ridge <- diag(solve(rXX+my.c*I.mat) %*% rXX %*% solve(rXX+my.c*I.mat) ) print(VIF.ridge) ########################################################## ## NOTE: The 'ridge' package uses a newer function that provides inferences (p-values) # for the coefficients. Estimates may be quite different from those obtained by lm.ridge library(ridge) bodyfat.ridge.new <- linearRidge(bodyfat ~ triceps + thigh + midarm) #automatic choice of lambda summary(bodyfat.ridge.new) ############################################################### # LASSO regression: my.X.mat <- model.matrix(~ triceps + thigh + midarm, data=bodyfat.data) library(glmnet) # Choosing "lambda", i.e., "c" by cross-validation: crossval <- cv.glmnet(x=my.X.mat,y=bodyfat) plot(crossval) penalty <- crossval$lambda.min #optimal lambda penalty bodyfat.reg.lasso <-glmnet(x=my.X.mat,y=bodyfat, alpha = 1, lambda = penalty ) #estimate the model coef(bodyfat.reg.lasso) fitted.lasso = predict(bodyfat.reg.lasso,my.X.mat,type="response") sse.lasso <- sum((bodyfat-fitted.lasso)^2); sse.lasso #################################################################### # Comparing a few predicted values from each of the fits: Xs.at.Q1 <- c(1,quantile(triceps,prob=0.25),quantile(thigh,prob=0.25),quantile(midarm,prob=0.25)) Xs.at.median <- c(1,median(triceps),median(thigh),median(midarm)) Xs.at.Q3 <- c(1,quantile(triceps,prob=0.75),quantile(thigh,prob=0.75),quantile(midarm,prob=0.75)) # When the X variables are at their 25th percentile: print(paste("Prediction with OLS: ", round(as.numeric(coef(bodyfat.reg) %*% Xs.at.Q1),2))) print(paste("Prediction with ridge: ", round(as.numeric(coef(bodyfat.ridge.reg) %*% Xs.at.Q1),2))) print(paste("Prediction with new ridge: ", round(as.numeric(coef(bodyfat.ridge.new) %*% Xs.at.Q1),2))) print(paste("Prediction with LASSO: ", round(as.numeric(coef(bodyfat.reg.lasso) %*% Xs.at.Q1),2))) # When the X variables are at their median: print(paste("Prediction with OLS: ", round(as.numeric(coef(bodyfat.reg) %*% Xs.at.median),2))) print(paste("Prediction with ridge: ", round(as.numeric(coef(bodyfat.ridge.reg) %*% Xs.at.median),2))) print(paste("Prediction with new ridge: ", round(as.numeric(coef(bodyfat.ridge.new) %*% Xs.at.median),2))) print(paste("Prediction with LASSO: ", round(as.numeric(coef(bodyfat.reg.lasso) %*% Xs.at.median),2))) # When the X variables are at their 75th percentile: print(paste("Prediction with OLS: ", round(as.numeric(coef(bodyfat.reg) %*% Xs.at.Q3),2))) print(paste("Prediction with ridge: ", round(as.numeric(coef(bodyfat.ridge.reg) %*% Xs.at.Q3),2))) print(paste("Prediction with new ridge: ", round(as.numeric(coef(bodyfat.ridge.new) %*% Xs.at.Q3),2))) print(paste("Prediction with LASSO: ", round(as.numeric(coef(bodyfat.reg.lasso) %*% Xs.at.Q3),2)))