#########################
#                       #
# STAT 599 Spring 2013  #
#                       #
# Example R code        #
#                       #
# Chapter 8             #
#                       #
#########################

### Installing the add-on packages needed for this course:

# If you haven't already done so, 
# type at the command line (without the preceding # ):

# install.packages(c('faraway','mgcv','acepack','mda','brlr','sm','wavethresh','SuppDists','lme4'))

# and choose a U.S. mirror site when the window pops up.
# This will install the needed packages for you.

# load the 'faraway' package:

library(faraway)

# loading the 'pulp' dataset:

data(pulp)

# attaching the 'pulp' dataset:

attach(pulp)

## Section 8.1:

### Fitting a basic fixed-effects one-way ANOVA:

op <- options(contrasts=c("contr.sum", "contr.poly"))

# Using the 'aov' function to fit the one-way ANOVA:
lmod <- aov(bright ~ operator, data=pulp)
summary(lmod)

coef(lmod)

options(op)

# estimate of sigma^2_alpha:

(anova(lmod)["operator", "Mean Sq"] - anova(lmod)["Residuals", "Mean Sq"]) / 5

# Note 5 = number of observations per factor level.

### Fitting a random-effects ANOVA:

# load the 'lme4' package:

library(lme4)

mmod <- lmer(bright ~1 + (1|operator), data=pulp)
summary(mmod)

# The first part indictes the fixed effects:  
# the '1' indicates the intercept is fixed.

# The term(s) in parentheses describe the random effects:
# The (1|operator) indicates the random levels are based on 'operator'
# and the '1' indicates the random effect is constant within each group.

# The default above is to compute REML estimates.

# To get ML estimates, use: method='ML':

smod <- lmer(bright ~1 + (1|operator), data=pulp, method='ML')
summary(smod)

#### Section 8.2:  Inference for the random effects model:

# If we treat operators as fixed levels, our F-test for an operator effect is 
# given in the summary output:

summary(lmod)

# If we treat operators as random levels, we could use a likelihood ratio 
# approach to test for significant variation among the random effects:

# Our reduced model here is just a null model with no operator effects:

nullmod <- lm(bright ~ 1, data=pulp)

# Our full model is the model above that includes random operator effects.
# Comparing the log likelihoods:

LR.test.stat <- as.numeric(2*(logLik(smod) - logLik(nullmod)))
print(LR.test.stat)

# Assuming a chi-square null distribution:

pchisq(LR.test.stat, 1, lower=F)

# This p-value is probably not trustworthy here;
# the null distribution is probably not chi-square.

# The parametric bootstrap will give a better P-value:

########
####
#

num.reps <- 100
lrstat <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps'
for(i in 1:num.reps){
 y <- unlist(simulate(nullmod)) # Simulate data under the null model
 bnull <- lm(y ~ 1) # fit null model
 balt <- lmer(y ~ 1 + (1|operator), data=pulp, method='ML') # fit alternative (full) model
 lrstat[i] <- as.numeric(2*(logLik(balt)-logLik(bnull))) # calculate LR statistic
}

## Bootstrap p-value:

boot.p <- mean(lrstat > LR.test.stat)
print(boot.p)

# Monte Carlo standard error of this estimate:

sqrt(boot.p*(1-boot.p)/num.reps)

##### Section 8.3:  Predicting Random Effects:

## Predicting each random effect using an empirical Bayes approach:

ranef(mmod)$operator

# Compare these to the fixed effects:

model.tables(lmod)

# Each fixed effect estimate is exactly 1.3121 times larger than 
# the corresponding random effect estimate.


## Getting the Best Linear Unbiased Predictors (BLUPs) 
## for observations in any of the groups:

fixef(mmod) + ranef(mmod)$operator


## Diagnostic Plots:

# Normal Q-Q plot of residuals:

qqnorm(resid(mmod),main='Normality Check for Residuals')

# Plot of Residuals vs. Fitted values:

plot(fitted(mmod),resid(mmod),xlab='Fitted Values', ylab='Residuals'); abline(h=0)


# The normality assumption and the constant variance assumption seems reasonable...

############################################################
#
## Section 8.4:  Blocks As Random Effects


# load the 'faraway' package:

library(faraway)

# loading the 'penicillin' dataset:

data(penicillin)

# attaching the 'penicillin' dataset:

attach(penicillin)

### Fitting a basic fixed-effects two-way ANOVA:

op <- options(contrasts=c("contr.sum", "contr.poly"))

# Using the 'aov' function to fit the two-way ANOVA: 
# This considers both treatment effects and block effects to be FIXED.

pen.mod <- aov(yield ~ blend + treat, data=penicillin)
summary(pen.mod)

coef(pen.mod)

# Using the 'lmer' function in the 'lme4' package to fit the two-factor mixed effects model: 

# load the 'lme4' package:

library(lme4)

m.pen.mod <- lmer(yield ~ treat + (1|blend), data=penicillin)
summary(m.pen.mod)

options(op)

## Getting the Best Linear Unbiased Predictors (BLUPs) 
## for observations with any of the blends:

ranef(m.pen.mod)$blend



#####
#### Testing for Treatment Effect:

## ANOVA method:

anova(m.pen.mod)

# Identical to result obtain when blocks were considered fixed...


## Likelihood Ratio Method:

# defining the (full) model under the alternative:

m.pen.mod.alt <- lmer(yield ~ treat + (1|blend), data=penicillin, method='ML')

# defining the model under the null:

m.pen.mod.null <- lmer(yield ~ 1 + (1|blend), data=penicillin, method='ML')

# Assuming a chi-square null distribution for the LR statistic:

anova(m.pen.mod.alt, m.pen.mod.null)

# NOT assuming a chi-square null distribution for the LR statistic:
# Instead, using a parametric bootstrap approach:

########
####
#

num.reps <- 100
lrstat <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps'
for(i in 1:num.reps){
 yy <- unlist(simulate(m.pen.mod.null)) # Simulate data under the null model
 bnull <- lmer(yy ~ 1 + (1|blend), data=penicillin, method='ML') # fit null model
 balt <- lmer(yy ~ treat + (1|blend), data=penicillin, method='ML') # fit alternative (full) model
 lrstat[i] <- as.numeric(2*(logLik(balt)-logLik(bnull))) # calculate LR statistic
}

## Checking whether the LR statistic seems to be ch-square(3 df) distributed:

plot(qchisq((1:num.reps)/(num.reps+1),3),sort(lrstat),xlab=expression(chi[3]^2),ylab='Simulated LR stat'); abline(0,1)

## Bootstrap p-value:

LR.test.stat <- 4.05  # from table above

boot.p <- mean(lrstat > LR.test.stat)
print(boot.p)

# Monte Carlo standard error of this estimate:

sqrt(boot.p*(1-boot.p)/num.reps)


#####
#### Testing for Block Effect:


## Likelihood Ratio Method:

# defining the (full) model under the alternative:

m2.pen.mod.alt <- lmer(yield ~ treat + (1|blend), data=penicillin)

# defining the model under the null (no blend effect):

m2.pen.mod.null <- lm(yield ~ treat, data=penicillin)

LR.stat <- 2*(logLik(m2.pen.mod.alt) - logLik(m2.pen.mod.null,REML=TRUE))
print(LR.stat)

# Using a parametric bootstrap approach:

num.reps <- 100
lrstatf <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps'
for(i in 1:num.reps){
 yyy <- unlist(simulate(m2.pen.mod.null)) # Simulate data under the null model
 bnull <- lm(yyy ~ treat, data=penicillin) # fit null model
 balt <- lmer(yyy ~ treat + (1|blend), data=penicillin) # fit alternative (full) model
 lrstatf[i] <- as.numeric(2*(logLik(balt)-logLik(bnull,REML=TRUE))) # calculate LR statistic
}


## Bootstrap p-value:

boot.p <- mean(lrstatf > LR.stat)
print(boot.p)

# Monte Carlo standard error of this estimate:

sqrt(boot.p*(1-boot.p)/num.reps)


############################################################
#
## Section 8.6:  Models with Nested Effects


# load the 'faraway' package:

library(faraway)

# loading the 'eggs' dataset:

data(eggs)

# attaching the 'eggs' dataset:

attach(eggs)

# load the 'lme4' package:

library(lme4)

# In this analysis, 'labs' are random, 
# 'technicians' are nested within labs 
# and 'samples' are nested within technicians.
# The technicians and samples are also considered random.

nested.mod <- lmer(Fat ~ 1 + (1|Lab) + (1|Lab:Technician) + (1|Lab:Technician:Sample), data=eggs)
summary(nested.mod)

# A simpler model, removing the component for variance among samples:

nested.mod.simp <- lmer(Fat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs)
summary(nested.mod.simp)

#Testing whether the variance due to samples is significant:
anova(nested.mod, nested.mod.simp)
# Is this p-value trustworthy?  Doubtful.

# Finding the LR statistic for our data and models:

LR.stat.n <- 2*(logLik(nested.mod) - logLik(nested.mod.simp))
print(LR.stat.n)

# Testing whether the variance due to samples is significant, 
# using the parametric bootstrap approach:

num.reps <- 100
lrstatn <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps'
for(i in 1:num.reps){
 rFat <- unlist(simulate(nested.mod.simp)) # Simulate data under the null model
 nullmod <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs) # fit null model
 altmod <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician) + (1|Lab:Technician:Sample), data=eggs) # fit alternative (full) model
 lrstatn[i] <- as.numeric(2*(logLik(altmod)-logLik(nullmod))) # calculate LR statistic
}


## Bootstrap p-value:

boot.p <- mean(lrstatn > LR.stat.n)
print(boot.p)

# Monte Carlo standard error of this estimate:

sqrt(boot.p*(1-boot.p)/num.reps)

#############

# A still simpler model, also removing the component for variance among technicians:

nested.mod.simp2 <- lmer(Fat ~ 1 + (1|Lab), data=eggs)
summary(nested.mod.simp2)

LR.stat.n2 <- 2*(logLik(nested.mod.simp) - logLik(nested.mod.simp2))
print(LR.stat.n2)


# Now testing whether the variance due to technicians is significant, 
# using the parametric bootstrap approach:

num.reps <- 100
lrstatt <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps'
for(i in 1:num.reps){
 rFat <- unlist(simulate(nested.mod.simp2)) # Simulate data under the null model
 nullmod2 <- lmer(rFat ~ 1 + (1|Lab), data=eggs) # fit null model
 altmod2 <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs) # fit alternative (full) model
 lrstatt[i] <- as.numeric(2*(logLik(altmod2)-logLik(nullmod2))) # calculate LR statistic
}


## Bootstrap p-value:

boot.p <- mean(lrstatt > LR.stat.n2)
print(boot.p)

# Monte Carlo standard error of this estimate:

sqrt(boot.p*(1-boot.p)/num.reps)


############################################################
#
## Section 8.8:  Multilevel Models


# load the 'faraway' package:

library(faraway)

# loading the 'jsp' dataset:

data(jsp)

# attaching the 'jsp' dataset:

attach(jsp)

# Only using the math scores for the THIRD year (the third year is coded year=2):

jspr <- jsp[jsp$year==2,]

# Some descriptive plots

plot(jitter(math)~jitter(raven), data=jspr, xlab='Raven score', ylab='Math score')
x11() # opens new plotting window
boxplot(math~social, data=jspr, xlab='Social class', ylab='Math score')

###
## A multilevel model will reflect the fact that students within the same school
## (and perhaps those within the same class) are likely to be dependent.

# Centering the 'raven' variable to aid later interpretation:
jspr$craven <- jspr$raven - mean(jspr$raven)

# load the 'lme4' package:

library(lme4)

## The multilevel level also allows us to include random effects that capture
## variation among schools and variation among classes within schools.

mmod.math <- lmer(math ~ craven*social*gender + (1|school) + (1|school:class), data=jspr)
# Note the 'craven*social*gender' code is shorthand to tell R to include 
# all main effects AND all interaction effects involving these three predictors.

# Examining the fixed-effect tests:

anova(mmod.math)

## Removing gender from the model:

mmod.math2 <- lmer(math ~ craven*social + (1|school) + (1|school:class), data=jspr)

# Comparing models via AIC:

AIC(mmod.math, mmod.math2)

# Examining the fixed-effect tests:

anova(mmod.math2)

summary(mmod.math2)

#######
### Some basic diagnostics:

qqnorm(resid(mmod.math2))

plot(fitted(mmod.math2), resid(mmod.math2), xlab='Fitted', ylab='Residuals')

# Should some transformation of the response variable 'math' be considered?

# We have enough data here that it makes sense to check the normality of the random effects:

# Random effects for school:

qqnorm(ranef(mmod.math2)$school[[1]], main='School effects')

# Random effects for class nested within school:

qqnorm(ranef(mmod.math2)$"school:class"[[1]], main='Class effects')

### Adjusted mean math scores by school:

# printing the school effects:

adjscores <- ranef(mmod.math2)$school[[1]]
rawscores <- coef(lm(math~school-1, jspr)) # The '-1' tells R not to include an intercept
rawscores <- rawscores - mean(rawscores) # centering by subtracting the mean

# comparing adjusted means to (centered) raw means:

print(cbind(rawscores, adjscores, rank(rawscores), rank(adjscores) ))

# scatter plot of adjusted scores vs. raw scores

plot(rawscores, adjscores, type='n') # sets up blank plot
sint <-  c(9,14,29)  # picking some schools of interest
text(rawscores, adjscores, labels=(1:50)[-10], cex=0.75)
points(rawscores[sint], adjscores[sint], cex=2.5, col='red')

###
### Including a compositional effect:
###

# Finding predicted raven score for each observation 
# based ONLY on the school of the student:
schraven <- lm(raven ~ school, data=jspr)$fit

mmod.math.c <- lmer(math ~ craven*social + schraven*social + (1|school) + (1|school:class), data=jspr)
anova(mmod.math.c)

# Comparing models via AIC:

summary(mmod.math.c)

# Compare to the mmod.math2 above which had an AIC of 5961 ...

# Does not appear that we need the compositional effect in the model.