# Multiple Regression example  
# California rain data 

# I am calling the data set "calirain".  
# The variables are: number city precip altitude latitude distance.    

##########
##
# Reading the data into R:

my.datafile <- tempfile()
cat(file=my.datafile, "
 1 Eureka         39.57   43 40.8   1  
 2 RedBluff       23.27  341 40.2  97  
 3 Thermal        18.20 4152 33.8  70  
 4 FortBragg      37.48   74 39.4   1  
 5 SodaSprings    49.26 6752 39.3 150  
 6 SanFrancisco   21.82   52 37.8   5  
 7 Sacramento     18.07   25 38.5  80  
 8 SanJose        14.17   95 37.4  28  
 9 GiantForest    42.63 6360 36.6 145  
10 Salinas        13.85   74 36.7  12  
11 Fresno          9.44  331 36.7 114  
12 PtPiedras      19.33   57 35.7   1  
13 PasaRobles     15.67  740 35.7  31  
14 Bakersfield     6.00  489 35.4  75  
15 Bishop          5.73 4108 37.3 198  
16 Mineral        47.82 4850 40.4 142  
17 SantaBarbara   17.95  120 34.4   1  
18 Susanville     18.20 4152 40.3 198   
19 TuleLake       10.03 4036 41.9 140   
20 Needles         4.63  913 34.8 192  
21 Burbank        14.74  699 34.2  47  
22 LosAngeles     15.02  312 34.1  16  
23 LongBeach      12.36   50 33.8  12  
24 LosBanos        8.26  125 37.8  74  
25 Blythe          4.05  268 33.6 155  
26 SanDiego        9.94   19 32.7   5  
27 Daggett         4.25 2105 34.1  85  
28 DeathValley     1.66 -178 36.5 194  
29 CrescentCity   74.87   35 41.7   1  
30 Colusa         15.95   60 39.2  91  
", sep=" ")

options(scipen=999) # suppressing scientific notation

calirain <- read.table(my.datafile, header=FALSE, col.names=c("number", "city", "precip", "altitude", "latitude", "distance")) 
  
# Note we could also save the data columns into a file and use a command such as:
# calirain <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("number", "city", "precip", "altitude", "latitude", "distance"))

attach(calirain)

# The data frame called calirain is now created, 
# with six variables (four of which we will use in the analysis).
##
#########


# The lm() function gives us a basic regression output:  

rain.reg <- lm(precip ~ altitude + latitude + distance)
summary(rain.reg)

anova(rain.reg)

#################################################################################** 


# Testing about sets of coefficients in R:  


# To test in R whether beta_1=beta_3=0, given that X2 is in the model  
# we must specify the full model (with all variables included) and 
# the reduced model (with ONLY latitude (X2) included)

rain.full.model <- lm(precip ~ altitude + latitude + distance)

rain.reduced.model <- lm(precip ~ latitude)

# This will give us the F-statistic and P-value for this test:

anova(rain.reduced.model, rain.full.model)



#################################################################################** 

# INFERENCES ABOUT THE RESPONSE VARIABLE  

# We want to (1) estimate the mean precipitation for cities of altitude 100 feet,  
# latitude 40 degrees, and 70 miles from the coast.                        
# and (2) predict the precipitation of a new city of altitude 100 feet,    
# latitude 40 degrees, and 70 miles from the coast.                        


# The predict() function will give us CIs for the mean of Y and PIs for Y,  
# for the values of X1, X2, X3 in the data set.  

x.values <- data.frame(cbind(altitude = 100, latitude = 40, distance = 70))                            

# getting the 90% confidence interval for the mean at X1=100, X2=40 and X3=70:

predict(rain.reg, x.values, interval="confidence", level=0.90)

# getting the 90% prediction interval for a new response at X1=100, X2=40 and X3=70:

predict(rain.reg, x.values, interval="prediction", level=0.90)


#################################################################################** 

### The following code will produce residual plots for this multiple regression *** 

# residual plot:

plot(fitted(rain.reg), resid(rain.reg)); abline(h=0)

# Q-Q plot of the residuals:

qqnorm(resid(rain.reg))

#####################################################################################** 

# Getting Variance Inflation Factors is easy in R:  
# just add some options to the MODEL statement:                         

# To get VIF's, first copy this function (by Bill Venables) into R:

####################################
##
#
vif <- function(object, ...)
UseMethod("vif")

vif.default <- function(object, ...)
stop("No default method for vif. Sorry.")

vif.lm <- function(object, ...) {
  V <- summary(object)$cov.unscaled
  Vi <- crossprod(model.matrix(object))
        nam <- names(coef(object))
  if(k <- match("(Intercept)", nam, nomatch = F)) {
                v1 <- diag(V)[-k]
                v2 <- (diag(Vi)[-k] - Vi[k, -k]^2/Vi[k,k])
                nam <- nam[-k]
        } else {
                v1 <- diag(V)
                v2 <- diag(Vi)
                warning("No intercept term detected. Results may surprise.")
        }
        structure(v1*v2, names = nam)
} 
#
##
#####################################

# Then use it as follows:

vif(rain.reg)

#####################

# Getting Influence Statistics is easy In R:
                   
# The studentized residuals we discussed in class           
# (which we compare in absolute value to 2.5) are the       
# internally studentized residuals.      

# rstandard gives the INTERNALLY studentized residuals 
# (what SAS calls "Student Residual")

rstandard(rain.reg)

# getting the measures of influence:
# Gives DFFITS, Cook's Distance, Hat diagonal elements, and some others.

influence.measures(rain.reg)               

#####################################################################################** 

# R can help with automated variable selection guides:    

# Using the C(p) and Adjusted R^2 Criteria to find the best model(s):  


# To install the "leaps" package:
# Go to "Packages" menu and choose
# "Install package(s) from CRAN"
# scroll to the "leaps" package and choose it
# An Internet connection is needed to install it.

# Then you can proceed with the code...

# Load the "leaps" package:

library(leaps)

# The leaps function provides the all-subsets analysis:

# Printing the 2 best models of each size, using the Cp criterion:

leaps(x = cbind(altitude, latitude, distance), y = precip, nbest=2, method="Cp")

# Printing the 2 best models of each size, using the adjusted R^2 criterion:

leaps(x = cbind(altitude, latitude, distance), y = precip, nbest=2, method="adjr2")

# The TRUE and FALSE values in the output refer to which variables are in each
# candidate model and which variables are out.

#####################################################################################**