# Multiple Regression example # California rain data # I am calling the data set "calirain". # The variables are: number city precip altitude latitude distance. ########## ## # Reading the data into R: my.datafile <- tempfile() cat(file=my.datafile, " 1 Eureka 39.57 43 40.8 1 2 RedBluff 23.27 341 40.2 97 3 Thermal 18.20 4152 33.8 70 4 FortBragg 37.48 74 39.4 1 5 SodaSprings 49.26 6752 39.3 150 6 SanFrancisco 21.82 52 37.8 5 7 Sacramento 18.07 25 38.5 80 8 SanJose 14.17 95 37.4 28 9 GiantForest 42.63 6360 36.6 145 10 Salinas 13.85 74 36.7 12 11 Fresno 9.44 331 36.7 114 12 PtPiedras 19.33 57 35.7 1 13 PasaRobles 15.67 740 35.7 31 14 Bakersfield 6.00 489 35.4 75 15 Bishop 5.73 4108 37.3 198 16 Mineral 47.82 4850 40.4 142 17 SantaBarbara 17.95 120 34.4 1 18 Susanville 18.20 4152 40.3 198 19 TuleLake 10.03 4036 41.9 140 20 Needles 4.63 913 34.8 192 21 Burbank 14.74 699 34.2 47 22 LosAngeles 15.02 312 34.1 16 23 LongBeach 12.36 50 33.8 12 24 LosBanos 8.26 125 37.8 74 25 Blythe 4.05 268 33.6 155 26 SanDiego 9.94 19 32.7 5 27 Daggett 4.25 2105 34.1 85 28 DeathValley 1.66 -178 36.5 194 29 CrescentCity 74.87 35 41.7 1 30 Colusa 15.95 60 39.2 91 ", sep=" ") options(scipen=999) # suppressing scientific notation calirain <- read.table(my.datafile, header=FALSE, col.names=c("number", "city", "precip", "altitude", "latitude", "distance")) # Note we could also save the data columns into a file and use a command such as: # calirain <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("number", "city", "precip", "altitude", "latitude", "distance")) attach(calirain) # The data frame called calirain is now created, # with six variables (four of which we will use in the analysis). ## ######### # The lm() function gives us a basic regression output: rain.reg <- lm(precip ~ altitude + latitude + distance) summary(rain.reg) anova(rain.reg) #################################################################################** # Testing about sets of coefficients in R: # To test in R whether beta_1=beta_3=0, given that X2 is in the model # we must specify the full model (with all variables included) and # the reduced model (with ONLY latitude (X2) included) rain.full.model <- lm(precip ~ altitude + latitude + distance) rain.reduced.model <- lm(precip ~ latitude) # This will give us the F-statistic and P-value for this test: anova(rain.reduced.model, rain.full.model) #################################################################################** # INFERENCES ABOUT THE RESPONSE VARIABLE # We want to (1) estimate the mean precipitation for cities of altitude 100 feet, # latitude 40 degrees, and 70 miles from the coast. # and (2) predict the precipitation of a new city of altitude 100 feet, # latitude 40 degrees, and 70 miles from the coast. # The predict() function will give us CIs for the mean of Y and PIs for Y, # for the values of X1, X2, X3 in the data set. x.values <- data.frame(cbind(altitude = 100, latitude = 40, distance = 70)) # getting the 90% confidence interval for the mean at X1=100, X2=40 and X3=70: predict(rain.reg, x.values, interval="confidence", level=0.90) # getting the 90% prediction interval for a new response at X1=100, X2=40 and X3=70: predict(rain.reg, x.values, interval="prediction", level=0.90) #################################################################################** ### The following code will produce residual plots for this multiple regression *** # residual plot: plot(fitted(rain.reg), resid(rain.reg)); abline(h=0) # Q-Q plot of the residuals: qqnorm(resid(rain.reg)) #####################################################################################** # Getting Variance Inflation Factors is easy in R: # just add some options to the MODEL statement: # To get VIF's, first copy this function (by Bill Venables) into R: #################################### ## # vif <- function(object, ...) UseMethod("vif") vif.default <- function(object, ...) stop("No default method for vif. Sorry.") vif.lm <- function(object, ...) { V <- summary(object)$cov.unscaled Vi <- crossprod(model.matrix(object)) nam <- names(coef(object)) if(k <- match("(Intercept)", nam, nomatch = F)) { v1 <- diag(V)[-k] v2 <- (diag(Vi)[-k] - Vi[k, -k]^2/Vi[k,k]) nam <- nam[-k] } else { v1 <- diag(V) v2 <- diag(Vi) warning("No intercept term detected. Results may surprise.") } structure(v1*v2, names = nam) } # ## ##################################### # Then use it as follows: vif(rain.reg) ##################### # Getting Influence Statistics is easy In R: # The studentized residuals we discussed in class # (which we compare in absolute value to 2.5) are the # internally studentized residuals. # rstandard gives the INTERNALLY studentized residuals # (what SAS calls "Student Residual") rstandard(rain.reg) # getting the measures of influence: # Gives DFFITS, Cook's Distance, Hat diagonal elements, and some others. influence.measures(rain.reg) #####################################################################################** # R can help with automated variable selection guides: # Using the C(p) and Adjusted R^2 Criteria to find the best model(s): # To install the "leaps" package: # Go to "Packages" menu and choose # "Install package(s) from CRAN" # scroll to the "leaps" package and choose it # An Internet connection is needed to install it. # Then you can proceed with the code... # Load the "leaps" package: library(leaps) # The leaps function provides the all-subsets analysis: # Printing the 2 best models of each size, using the Cp criterion: leaps(x = cbind(altitude, latitude, distance), y = precip, nbest=2, method="Cp") # Printing the 2 best models of each size, using the adjusted R^2 criterion: leaps(x = cbind(altitude, latitude, distance), y = precip, nbest=2, method="adjr2") # The TRUE and FALSE values in the output refer to which variables are in each # candidate model and which variables are out. #####################################################################################**