#########################
#                       #
# STAT 599 Spring 2013  #
#                       #
# Example R code        #
#                       #
# Chapter 3             #
#                       #
#########################

### Installing the add-on packages needed for this course:

# If you haven't already done so, 
# type at the command line (without the preceding # ):

# install.packages(c('faraway','mgcv','acepack','mda','brlr','sm','wavethresh','SuppDists','lme4'))

# and choose a U.S. mirror site when the window pops up.
# This will install the needed packages for you.

# load the 'faraway' package:

library(faraway)

# loading the 'gala' dataset:

data(gala)

# attaching the 'gala' dataset:

attach(gala)

# Fitting a normal linear regression model:

model.1 <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, data=gala)

# Residuals plotted against fitted values:

plot(predict(model.1), residuals(model.1), xlab='Fitted', ylab='Residuals')

# Fitting a normal linear regression model with a square-root transformed response:

model.2 <- lm(sqrt(Species) ~ Area + Elevation + Nearest + Scruz + Adjacent, data=gala)

# Residuals plotted against fitted values:

plot(predict(model.2), residuals(model.2), xlab='Fitted', ylab='Residuals')

######

# Fitting a Poisson GLM:

model.p <- glm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, family=poisson, data=gala)

summary(model.p)

# Investigating the variance function:

# Plotting (y - mu-hat)^2 against mu-hat (on the scale of the response):

plot(fitted(model.p), (Species-fitted(model.p))^2, xlab=expression(hat(mu)), ylab=expression((y-hat(mu))^2))
abline(0,1)

# Taking logs will plot the (y - mu-hat)^2 against mu-hat on the scale of the linear predictor:

plot(log(fitted(model.p)), log((Species-fitted(model.p))^2), xlab=expression(hat(mu)), ylab=expression((y-hat(mu))^2))
abline(0,1)

# Estimating a dispersion parameter:

my.disp <- sum(residuals(model.p, type='pearson')^2)/model.p$df.res
print(my.disp)

# Fitting the model with the overdispersion accounted for will change the standard errors:

summary(model.p, dispersion=my.disp)

# With the overdispersion in the model, need to use 
# F-test to test significance of individual predictors:

drop1(model.p, test="F")

# A couple of the predictors are not significant.
# Should we fit a simpler model without them?

model.p.simp1 <- glm(Species ~ Area + Elevation + Adjacent, family=poisson, data=gala)
model.p.simp2 <- glm(Species ~ Area + Elevation + Scruz + Adjacent, family=poisson, data=gala)

AIC(model.p.simp1, model.p.simp2, model.p)

# Use your best judgment...


#####################
#
# 3.2 Rate Models
#
#####################

# loading the 'faraway' package:

library(faraway)

# loading the 'dicentric' data set:

data(dicentric)

# attaching the 'dicentric' data frame:

attach(dicentric)

# An interaction plot, treating rate=ca/cells as the response:

with(dicentric,interaction.plot(doserate, doseamt, ca/cells))

# There appears to be some interaction.

# A normal linear model, with rate=ca/cells as the response:
# Note the '*' between the predictors will tell R 
# to include the interaction(s):

norm.lmod <- lm(ca/cells ~ doserate*factor(doseamt), data=dicentric)
summary(norm.lmod)
# Log-transforming the doserate predictor:
norm.lmod.2 <- lm(ca/cells ~ log(doserate)*factor(doseamt), data=dicentric)
summary(norm.lmod.2)

# The second model has a higher adjusted R2...

# Checking residual diagnostics:

plot(residuals(norm.lmod.2) ~ fitted(norm.lmod.2), xlab='Fitted', ylab='Residuals')
abline(h=0)

# Some definite nonconstant error variance...

### A rate model:

dosef <- factor(doseamt)

# Note the default log link for the Poisson GLM:
pois.ratemod <- glm(ca ~ log(cells) + log(doserate)*dosef, family=poisson, data=dicentric)
summary(pois.ratemod)

# The book suggests setting the coefficient of
# log(cells) to 1 with the offset function.
# This allows us to model the rate=ca/cells
# with a Poisson GLM with log link.

pois.ratemod2 <- glm(ca ~ offset(log(cells)) + log(doserate)*dosef, family=poisson, data=dicentric)
summary(pois.ratemod2)

# The deviance indicates a good fit.

drop1(pois.ratemod2, test="Chi")

# The interaction is highly significant, so we stick with the 
# interaction model.

# The plot of page 62 (top) indicates the effect of doseamt 
# may be quadratic, but this is difficult to determine 
# unless we collect data at more dose amounts.

# Plot of deviance residuals against linear predictor values:
plot(residuals(pois.ratemod2) ~ predict(pois.ratemod2, type='link'), xlab=expression(hat(eta)), ylab='Deviance residuals')

# No apparent problem.

##################################
#
# 3.3 Negative Binomial Models
#
##################################


# loading the 'faraway' package:

library(faraway)

# loading the 'solder' data set:

data(solder)

# attaching the 'solder' data frame:

attach(solder)

# Trying a Poisson model:

mod.pois <- glm(skips ~ Opening + Solder + Mask + PadType + Panel, family=poisson, data=solder)
summary(mod.pois)

# Not a good fit based on the deviance.

# Trying a Poisson model with all two-way interactions included:

mod.pois2 <- glm(skips ~ (Opening + Solder + Mask + PadType + Panel)^2, family=poisson, data=solder)
summary(mod.pois2)

# Still not a great fit.

# Plotting estimated variance versus the mean:

plot(fitted(mod.pois2), (skips-fitted(mod.pois2))^2, xlab=expression(hat(mu)), ylab=expression((y-hat(mu))^2))
abline(0,1)

# Maybe some overdispersion?

#### A negative binomial model (with k=1)

library(MASS)  # loading the 'MASS' package
mod.neg.bin1 <- glm(skips ~ Opening + Solder + Mask + PadType + Panel, family=negative.binomial(1), data=solder)
summary(mod.neg.bin1)

#### A negative binomial model (with k=3)

mod.neg.bin3 <- glm(skips ~ Opening + Solder + Mask + PadType + Panel, family=negative.binomial(3), data=solder)
summary(mod.neg.bin3)

# AIC is a little better...

#### A negative binomial model (with k estimated via ML)

mod.neg.bin <- glm.nb(skips ~ Opening + Solder + Mask + PadType + Panel, data=solder)
summary(mod.neg.bin)

# The AIC is good:

drop1(mod.neg.bin, test="Chi")

# Looks like none of the predictors should be dropped.