## Some additional example code that may be helpful for the project.

# PLEASE READ Sec. 11.15 in the text, which has a brief discussion
# of some of these issues of model fit and checking model assumptions.

# The data from Example 1, Sec. 11.10 notes:

y <- c(39.1,38.9,53.2,89.3,31.3,31.5,89.6,54.5,28.8,46.9,67.9,49.5,41,49.7,55.3,84.5,46.5,54.1,57.9,91.2,29.8,89.1)
x1 <- c(1065,950,1090,1350,930,985,1390,1170,950,990,1035,845,1000,1065,1065,1325,1035,1005,918,1370,970,1375)
x2 <- c(9.482,13.149,9.418,26.969,8.489,8.329,29.605,13.154,10.887,6.046,14.889,11.694,9.911,9.371,14.051,18.420,13.302,8.098,12.999,35.393,5.518,35.669)

# Fitting the model:

my.reg.model <- lm(y ~ x1 + x2)

# A summary of estimated coefficients, F-test and t-tests, etc.:

summary(my.reg.model)

#### Plots of residuals to check model assumptions:

# Plot of residuals against fitted values:

plot(fitted(my.reg.model), residuals(my.reg.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)

# We want to see random scatter here, with no particular pattern.
# If the spread of the residuals varies from the left side of the plot
# to the right, the error variance may not be constant.
# A curved pattern in this plot indicates the linear relationship
# between Y and the X's may NOT be appropriate.

# Normal Q-Q plot of the residuals:

qqnorm(residuals(my.reg.model))

# In the Q-Q plot, we want to see the points forming a roughly
# straight-line pattern. Serious deviations from a straight line
# indicate the normality assumption for the errors may not be
# appropriate.

# In any of these cases, if there are problems, transforming Y
# and refitting the model could help. Examples: use
#   sqrty <- sqrt(y)   and treat sqrty as the response variable; or use
#   lny   <- log(y)    and treat lny as the response variable; or use
#   invy  <- 1/y       and treat invy as the response variable; or use
#   ysq   <- y^2       and treat ysq as the response variable.
#
# In any case, a lot of trial and error is called for: trying transformations,
# refitting the model, and examining the residual plots for the altered model.
# If no transformation substantially improves the fit, it may not be worth using
# transformed variables, since transformations make the estimated model more
# complicated to interpret.
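# For instance, a minimal sketch of the log-transformation route (the model
# object name 'my.log.model' is just illustrative):

lny <- log(y)                       # natural-log transform of the response
my.log.model <- lm(lny ~ x1 + x2)   # re-fit with the transformed response
summary(my.log.model)

# Then re-examine the same residual plots for the re-fitted model:
plot(fitted(my.log.model), residuals(my.log.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)
qqnorm(residuals(my.log.model))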
## More diagnostic plots:

# Plots of residuals against each independent variable separately:

plot(x1, residuals(my.reg.model), ylab='Residuals'); abline(h=0)
dev.new()   # opens a new plotting window while keeping the other window up
plot(x2, residuals(my.reg.model), ylab='Residuals'); abline(h=0)

# Again in these plots, we hope to see random scatter.
# A serious curved pattern could indicate that the X-variable in question
# should enter the model in a non-linear manner.

#########################################
#########################################

# Checking for multicollinearity:

# To get VIFs, first copy this function (by Bill Venables) into R:

####################################
##
vif <- function(object, ...) UseMethod("vif")

vif.default <- function(object, ...) stop("No default method for vif. Sorry.")

vif.lm <- function(object, ...) {
  V <- summary(object)$cov.unscaled
  Vi <- crossprod(model.matrix(object))
  nam <- names(coef(object))
  if (k <- match("(Intercept)", nam, nomatch = F)) {
    v1 <- diag(V)[-k]
    v2 <- diag(Vi)[-k] - Vi[k, -k]^2/Vi[k, k]
    nam <- nam[-k]
  } else {
    v1 <- diag(V)
    v2 <- diag(Vi)
    warning("No intercept term detected. Results may surprise.")
  }
  structure(v1*v2, names = nam)
}
##
####################################

# Then use it as follows:

vif(my.reg.model)

# Any VIF greater than 5 indicates potential multicollinearity problems.
# Any VIF greater than 10 indicates serious multicollinearity problems.
# A possible remedy is to remove one or more of the affected independent
# variables and re-fit the model.

#########################################
#########################################

# An exploratory regression analysis may involve numerous candidate explanatory
# variables. These could be completely separate variables measured on each
# observation, or they could be functions of other explanatory variables, as
# with the quadratic regression example we saw, or with cross-product model
# terms representing the interaction between two variables.
# The analyst may want to do VARIABLE SELECTION (a.k.a. model selection) to
# determine which independent variables truly have an effect on the response
# variable. Specific models can be compared using the formal t-tests and/or
# F-tests we have studied.

# In a truly exploratory analysis, a common approach is to fit numerous models
# (even all the models that represent every possible subset of the candidate
# predictors) and compare them using a MODEL SELECTION CRITERION. Commonly used
# criteria include the Bayesian Information Criterion (BIC) and the
# Adjusted R^2 (adjr2).

# The R function 'regsubsets' in the 'leaps' package will calculate the BIC and
# Adjusted R^2 for models based on all possible subsets of the candidate
# explanatory variables.
# Models with SMALL values of BIC are considered better.
# Models with LARGE values of Adjusted R^2 are considered better.

# An example:
# Just making up some random data values
# (clearly, by construction, these are unlikely to be important variables in the model):

set.seed(1213)
x3 <- runif(n=length(y), min=50, max=70)
set.seed(1989)
x4 <- rnorm(n=length(y), mean=100, sd=15)
set.seed(12131989)
x5 <- runif(n=length(y), min=100, max=200)

mydata <- data.frame(y,x1,x2,x3,x4,x5)   # creating a data frame containing my variables

# install.packages("leaps")   # installs the 'leaps' package; needs an Internet connection
library(leaps)                # loads the 'leaps' package

myresults <- regsubsets(y ~ x1+x2+x3+x4+x5, data=mydata, nvmax=NULL, nbest=1)

summary(myresults)   # prints the best (lowest-SSE) model for each fixed model size
                     # ("*" means that variable is included)
summary(myresults)$adjr2
summary(myresults)$bic
which.max(summary(myresults)$adjr2)
which.min(summary(myresults)$bic)

# Caution: Doing inference (like tests and CIs about the betas) on a model chosen
# based on a model selection criterion will typically produce biased results,
# because the sample is used twice (once to choose the model's variables and
# again to do the inference). A way to get around this is to split the data
# randomly into two subsets and use one subset for variable selection and the
# other subset for inference, as in the sketch below.
# Obviously this works best in practice if the original number of observations is large.
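# A minimal sketch of that split-sample idea (the seed, the 50/50 split, and
# the final model choice below are all just illustrative):

set.seed(2024)                                  # an arbitrary seed, for reproducibility
n <- nrow(mydata)
select.rows <- sample(1:n, size=floor(n/2))     # a random half of the observations
select.data <- mydata[select.rows, ]            # half used to choose the variables
infer.data  <- mydata[-select.rows, ]           # half reserved for inference

# Variable selection on the first half:
split.results <- regsubsets(y ~ x1+x2+x3+x4+x5, data=select.data, nvmax=NULL, nbest=1)
which.min(summary(split.results)$bic)

# Suppose the criterion ends up picking, say, x1 and x2; then the tests and CIs
# come from re-fitting that model on the held-out half:
summary(lm(y ~ x1 + x2, data=infer.data))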
# Checking for outliers:

rstudent(my.reg.model)

# An observation with an ABSOLUTE value of "rstudent" greater than 3
# is very unusual relative to the overall regression relationship.

# To further investigate whether any observations are "high-influence points":

dffits(my.reg.model)

# If any observation has an ABSOLUTE value of "dffits" much greater than the
# following cutoff:

cutoff <- 2*sqrt(length(coef(my.reg.model))/length(y)); print(cutoff)

# then that observation may have an unusual influence on the regression model.
# You could remove the dependent and independent variable values associated
# with that observation and re-fit the model. If your overall conclusions
# change substantially (and if your sample size is not too small), you could
# consider reporting the regression analysis with the high-influence point
# deleted.

#########################################
#########################################

# IMPORTANT: For your project, these are practical modeling issues to consider.
# Often there is no single right way to build the regression model.
# The important thing is to clearly explain and justify your modeling decisions.
# Interpreting the model and answering the relevant research questions is the
# primary goal!

#########################################
#########################################

# If your data set is relatively large or already in a spreadsheet, you probably
# don't want to type it all into individual vectors in R. Here's how to read it
# in as an entire data frame.
# First, save the spreadsheet as a .csv file in some folder.
# Then change the working directory in R, navigating to that folder.
# In Windows, this is done via File --> Change dir...
# Then use the read.csv function to read in the data as a data frame:

mydata <- read.csv("ExampleProjectData.csv", header=T)

# This assumes the first row of the spreadsheet is a header with the variable
# names. Otherwise, you could change it to header=F, but then you'd have to
# supply the column names with something like:
# read.csv("ExampleProjectDataNoHeader.csv", header=F, col.names=c("y","x1","x2","x3","x4","x5"))

# After the data frame mydata is created, you could then
attach(mydata)
# and do the analysis as above.
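# For instance, a minimal sketch of "the analysis as above", assuming the .csv
# contains columns named y, x1, and x2 like the example data:

my.reg.model <- lm(y ~ x1 + x2)   # finds y, x1, x2 via attach(mydata)
summary(my.reg.model)
plot(fitted(my.reg.model), residuals(my.reg.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)

# Alternatively, you can skip attach() and point lm() at the data frame directly:
# lm(y ~ x1 + x2, data=mydata)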