## Some additional example code that may be helpful for the project.

# PLEASE READ Sec. 11.15 in the text, which has a brief discussion
# of some of these issues of model fit and checking model assumptions.

# The data from Example 1, Sec. 11.10 notes:

y <- c(39.1,38.9,53.2,89.3,31.3,31.5,89.6,54.5,28.8,46.9,67.9,49.5,41,49.7,55.3,84.5,46.5,54.1,57.9,91.2,29.8,89.1)
x1 <- c(1065,950,1090,1350,930,985,1390,1170,950,990,1035,845,1000,1065,1065,1325,1035,1005,918,1370,970,1375)
x2 <- c(9.482,13.149,9.418,26.969,8.489,8.329,29.605,13.154,10.887,6.046,14.889,11.694,9.911,9.371,14.051,18.420,13.302,8.098,12.999,35.393,5.518,35.669)

# Fitting the model:

my.reg.model <- lm(y ~ x1 + x2)

# A summary of estimated coefficients, F-test and t-tests, etc.:

summary(my.reg.model)

#### Plots of residuals to check model assumptions:

# Plot of residuals against fitted values:

plot(fitted(my.reg.model), residuals(my.reg.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)

# We want to see random scatter here, with no particular pattern.
# If the spread of the residuals varies from the left side of the plot
# to the right, the error variance may not be constant.
# A curved pattern in this plot indicates the linear relationship
# between Y and the X's may NOT be appropriate.

# Normal Q-Q plot of the residuals:

qqnorm(residuals(my.reg.model))

# In the Q-Q plot, we want to see the points forming a roughly
# straight-line pattern. Serious deviations from a straight line
# indicate the normality assumption for the errors may not be
# appropriate.

# In any of these cases, if there are problems, transforming Y
# and refitting the model could help. Examples: use
#   sqrty <- sqrt(y)   and treat sqrty as the response variable; or use
#   lny   <- log(y)    and treat lny as the response variable; or use
#   invy  <- 1/y       and treat invy as the response variable; or use
#   ysq   <- y^2       and treat ysq as the response variable.
#
# In any case, a lot of trial and error is called for: trying transformations,
# refitting the model, and examining the residual plots for the altered model.
# If no transformation substantially improves the fit, it may not be worth using
# transformed variables, since transformations make the estimated model more
# complicated to interpret.
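# For instance, a minimal sketch of the log-transformation route (the model
# object name 'my.log.model' is just illustrative):

lny <- log(y)                       # natural-log transform of the response
my.log.model <- lm(lny ~ x1 + x2)   # re-fit with the transformed response
summary(my.log.model)

# Then re-examine the same residual plots for the re-fitted model:
plot(fitted(my.log.model), residuals(my.log.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)
qqnorm(residuals(my.log.model))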
## More diagnostic plots:

# Plots of residuals against each independent variable separately:

plot(x1, residuals(my.reg.model), ylab='Residuals'); abline(h=0)
dev.new()   # opens a new plotting window while keeping the other window up
plot(x2, residuals(my.reg.model), ylab='Residuals'); abline(h=0)

# Again in these plots, we hope to see random scatter.
# A serious curved pattern could indicate that the X-variable in question
# should enter the model in a non-linear manner.

#########################################
#########################################

# Checking for multicollinearity:

# To get VIFs, first copy this function (by Bill Venables) into R:

####################################
##
vif <- function(object, ...) UseMethod("vif")

vif.default <- function(object, ...) stop("No default method for vif. Sorry.")

vif.lm <- function(object, ...) {
  V <- summary(object)$cov.unscaled
  Vi <- crossprod(model.matrix(object))
  nam <- names(coef(object))
  if (k <- match("(Intercept)", nam, nomatch = F)) {
    v1 <- diag(V)[-k]
    v2 <- diag(Vi)[-k] - Vi[k, -k]^2/Vi[k, k]
    nam <- nam[-k]
  } else {
    v1 <- diag(V)
    v2 <- diag(Vi)
    warning("No intercept term detected. Results may surprise.")
  }
  structure(v1*v2, names = nam)
}
##
####################################

# Then use it as follows:

vif(my.reg.model)

# Any VIF greater than 5 indicates potential multicollinearity problems.
# Any VIF greater than 10 indicates serious multicollinearity problems.
# A possible remedy is to remove one or more of the affected independent
# variables and re-fit the model.

#########################################
#########################################

# An exploratory regression analysis may involve numerous candidate explanatory
# variables. These could be completely separate variables measured on each
# observation, or they could be functions of other explanatory variables, as
# with the quadratic regression example we saw, or with cross-product model
# terms representing the interaction between two variables.
# The analyst may want to do VARIABLE SELECTION (a.k.a. model selection) to
# determine which independent variables truly have an effect on the response
# variable. Specific models can be compared using the formal t-tests and/or
# F-tests we have studied.

# In a truly exploratory analysis, a common approach is to fit numerous models
# (even all the models that represent every possible subset of the candidate
# predictors) and compare them using a MODEL SELECTION CRITERION. Commonly used
# criteria include the Bayesian Information Criterion (BIC) and the
# Adjusted R^2 (adjr2).

# The R function 'regsubsets' in the 'leaps' package will calculate the BIC and
# Adjusted R^2 for models based on all possible subsets of the candidate
# explanatory variables.
# Models with SMALL values of BIC are considered better.
# Models with LARGE values of Adjusted R^2 are considered better.

# An example:
# Just making up some random data values
# (clearly, by construction, these are unlikely to be important variables in the model):

set.seed(1213)
x3 <- runif(n=length(y), min=50, max=70)
set.seed(1989)
x4 <- rnorm(n=length(y), mean=100, sd=15)
set.seed(12131989)
x5 <- runif(n=length(y), min=100, max=200)

mydata <- data.frame(y,x1,x2,x3,x4,x5)   # creating a data frame containing my variables

# install.packages("leaps")   # installs the 'leaps' package; needs an Internet connection
library(leaps)                # loads the 'leaps' package

myresults <- regsubsets(y ~ x1+x2+x3+x4+x5, data=mydata, nvmax=NULL, nbest=1)

summary(myresults)   # prints the best (lowest-SSE) model for each fixed model size
                     # ("*" means that variable is included)
summary(myresults)$adjr2
summary(myresults)$bic
which.max(summary(myresults)$adjr2)
which.min(summary(myresults)$bic)

# Caution: Doing inference (like tests and CIs about the betas) on a model chosen
# based on a model selection criterion will typically produce biased results,
# because the sample is used twice (once to choose the model's variables and
# again to do the inference). A way to get around this is to split the data
# randomly into two subsets and use one subset for variable selection and the
# other subset for inference, as in the sketch below.
# Obviously this works best in practice if the original number of observations is large.
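# A minimal sketch of that split-sample idea (the seed, the 50/50 split, and
# the final model choice below are all just illustrative):

set.seed(2024)                                  # an arbitrary seed, for reproducibility
n <- nrow(mydata)
select.rows <- sample(1:n, size=floor(n/2))     # a random half of the observations
select.data <- mydata[select.rows, ]            # half used to choose the variables
infer.data  <- mydata[-select.rows, ]           # half reserved for inference

# Variable selection on the first half:
split.results <- regsubsets(y ~ x1+x2+x3+x4+x5, data=select.data, nvmax=NULL, nbest=1)
which.min(summary(split.results)$bic)

# Suppose the criterion ends up picking, say, x1 and x2; then the tests and CIs
# come from re-fitting that model on the held-out half:
summary(lm(y ~ x1 + x2, data=infer.data))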
# Checking for outliers:

rstudent(my.reg.model)

# An observation with an ABSOLUTE value of "rstudent" greater than 3
# is very unusual relative to the overall regression relationship.

# To further investigate whether any observations are "high-influence points":

dffits(my.reg.model)

# If any observation has an ABSOLUTE value of "dffits" much greater than the
# following cutoff:

cutoff <- 2*sqrt(length(coef(my.reg.model))/length(y)); print(cutoff)

# then that observation may have an unusual influence on the regression model.
# You could remove the dependent and independent variable values associated
# with that observation and re-fit the model. If your overall conclusions
# change substantially (and if your sample size is not too small), you could
# consider reporting the regression analysis with the high-influence point
# deleted.

#########################################
#########################################

# IMPORTANT: For your project, these are practical modeling issues to consider.
# Often there is no single right way to build the regression model.
# The important thing is to clearly explain and justify your modeling decisions.
# Interpreting the model and answering the relevant research questions is the
# primary goal!

#########################################
#########################################

# If your data set is relatively large or already in a spreadsheet, you probably
# don't want to type it all into individual vectors in R. Here's how to read it
# in as an entire data frame.
# First, save the spreadsheet as a .csv file in some folder.
# Then change the working directory in R, navigating to that folder.
# In Windows, this is done via File --> Change dir...
# Then use the read.csv function to read in the data as a data frame:

mydata <- read.csv("ExampleProjectData.csv", header=T)

# This assumes the first row of the spreadsheet is a header with the variable
# names. Otherwise, you could change it to header=F, but then you'd have to
# supply the column names with something like:
# read.csv("ExampleProjectDataNoHeader.csv", header=F, col.names=c("y","x1","x2","x3","x4","x5"))

# After the data frame mydata is created, you could then
attach(mydata)
# and do the analysis as above.
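# For instance, a minimal sketch of "the analysis as above", assuming the .csv
# contains columns named y, x1, and x2 like the example data:

my.reg.model <- lm(y ~ x1 + x2)   # finds y, x1, x2 via attach(mydata)
summary(my.reg.model)
plot(fitted(my.reg.model), residuals(my.reg.model),
     ylab='Residuals', xlab='Fitted Values'); abline(h=0)

# Alternatively, you can skip attach() and point lm() at the data frame directly:
# lm(y ~ x1 + x2, data=mydata)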