######################### # # # STAT 599 Spring 2013 # # # # Example R code # # # # Chapter 8 # # # ######################### ### Installing the add-on packages needed for this course: # If you haven't already done so, # type at the command line (without the preceding # ): # install.packages(c('faraway','mgcv','acepack','mda','brlr','sm','wavethresh','SuppDists','lme4')) # and choose a U.S. mirror site when the window pops up. # This will install the needed packages for you. # load the 'faraway' package: library(faraway) # loading the 'pulp' dataset: data(pulp) # attaching the 'pulp' dataset: attach(pulp) ## Section 8.1: ### Fitting a basic fixed-effects one-way ANOVA: op <- options(contrasts=c("contr.sum", "contr.poly")) # Using the 'aov' function to fit the one-way ANOVA: lmod <- aov(bright ~ operator, data=pulp) summary(lmod) coef(lmod) options(op) # estimate of sigma^2_alpha: (anova(lmod)["operator", "Mean Sq"] - anova(lmod)["Residuals", "Mean Sq"]) / 5 # Note 5 = number of observations per factor level. ### Fitting a random-effects ANOVA: # load the 'lme4' package: library(lme4) mmod <- lmer(bright ~1 + (1|operator), data=pulp) summary(mmod) # The first part indictes the fixed effects: # the '1' indicates the intercept is fixed. # The term(s) in parentheses describe the random effects: # The (1|operator) indicates the random levels are based on 'operator' # and the '1' indicates the random effect is constant within each group. # The default above is to compute REML estimates. # To get ML estimates, use: method='ML': smod <- lmer(bright ~1 + (1|operator), data=pulp, method='ML') summary(smod) #### Section 8.2: Inference for the random effects model: # If we treat operators as fixed levels, our F-test for an operator effect is # given in the summary output: summary(lmod) # If we treat operators as random levels, we could use a likelihood ratio # approach to test for significant variation among the random effects: # Our reduced model here is just a null model with no operator effects: nullmod <- lm(bright ~ 1, data=pulp) # Our full model is the model above that includes random operator effects. # Comparing the log likelihoods: LR.test.stat <- as.numeric(2*(logLik(smod) - logLik(nullmod))) print(LR.test.stat) # Assuming a chi-square null distribution: pchisq(LR.test.stat, 1, lower=F) # This p-value is probably not trustworthy here; # the null distribution is probably not chi-square. # The parametric bootstrap will give a better P-value: ######## #### # num.reps <- 100 lrstat <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps' for(i in 1:num.reps){ y <- unlist(simulate(nullmod)) # Simulate data under the null model bnull <- lm(y ~ 1) # fit null model balt <- lmer(y ~ 1 + (1|operator), data=pulp, method='ML') # fit alternative (full) model lrstat[i] <- as.numeric(2*(logLik(balt)-logLik(bnull))) # calculate LR statistic } ## Bootstrap p-value: boot.p <- mean(lrstat > LR.test.stat) print(boot.p) # Monte Carlo standard error of this estimate: sqrt(boot.p*(1-boot.p)/num.reps) ##### Section 8.3: Predicting Random Effects: ## Predicting each random effect using an empirical Bayes approach: ranef(mmod)$operator # Compare these to the fixed effects: model.tables(lmod) # Each fixed effect estimate is exactly 1.3121 times larger than # the corresponding random effect estimate. ## Getting the Best Linear Unbiased Predictors (BLUPs) ## for observations in any of the groups: fixef(mmod) + ranef(mmod)$operator ## Diagnostic Plots: # Normal Q-Q plot of residuals: qqnorm(resid(mmod),main='Normality Check for Residuals') # Plot of Residuals vs. Fitted values: plot(fitted(mmod),resid(mmod),xlab='Fitted Values', ylab='Residuals'); abline(h=0) # The normality assumption and the constant variance assumption seems reasonable... ############################################################ # ## Section 8.4: Blocks As Random Effects # load the 'faraway' package: library(faraway) # loading the 'penicillin' dataset: data(penicillin) # attaching the 'penicillin' dataset: attach(penicillin) ### Fitting a basic fixed-effects two-way ANOVA: op <- options(contrasts=c("contr.sum", "contr.poly")) # Using the 'aov' function to fit the two-way ANOVA: # This considers both treatment effects and block effects to be FIXED. pen.mod <- aov(yield ~ blend + treat, data=penicillin) summary(pen.mod) coef(pen.mod) # Using the 'lmer' function in the 'lme4' package to fit the two-factor mixed effects model: # load the 'lme4' package: library(lme4) m.pen.mod <- lmer(yield ~ treat + (1|blend), data=penicillin) summary(m.pen.mod) options(op) ## Getting the Best Linear Unbiased Predictors (BLUPs) ## for observations with any of the blends: ranef(m.pen.mod)$blend ##### #### Testing for Treatment Effect: ## ANOVA method: anova(m.pen.mod) # Identical to result obtain when blocks were considered fixed... ## Likelihood Ratio Method: # defining the (full) model under the alternative: m.pen.mod.alt <- lmer(yield ~ treat + (1|blend), data=penicillin, method='ML') # defining the model under the null: m.pen.mod.null <- lmer(yield ~ 1 + (1|blend), data=penicillin, method='ML') # Assuming a chi-square null distribution for the LR statistic: anova(m.pen.mod.alt, m.pen.mod.null) # NOT assuming a chi-square null distribution for the LR statistic: # Instead, using a parametric bootstrap approach: ######## #### # num.reps <- 100 lrstat <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps' for(i in 1:num.reps){ yy <- unlist(simulate(m.pen.mod.null)) # Simulate data under the null model bnull <- lmer(yy ~ 1 + (1|blend), data=penicillin, method='ML') # fit null model balt <- lmer(yy ~ treat + (1|blend), data=penicillin, method='ML') # fit alternative (full) model lrstat[i] <- as.numeric(2*(logLik(balt)-logLik(bnull))) # calculate LR statistic } ## Checking whether the LR statistic seems to be ch-square(3 df) distributed: plot(qchisq((1:num.reps)/(num.reps+1),3),sort(lrstat),xlab=expression(chi[3]^2),ylab='Simulated LR stat'); abline(0,1) ## Bootstrap p-value: LR.test.stat <- 4.05 # from table above boot.p <- mean(lrstat > LR.test.stat) print(boot.p) # Monte Carlo standard error of this estimate: sqrt(boot.p*(1-boot.p)/num.reps) ##### #### Testing for Block Effect: ## Likelihood Ratio Method: # defining the (full) model under the alternative: m2.pen.mod.alt <- lmer(yield ~ treat + (1|blend), data=penicillin) # defining the model under the null (no blend effect): m2.pen.mod.null <- lm(yield ~ treat, data=penicillin) LR.stat <- 2*(logLik(m2.pen.mod.alt) - logLik(m2.pen.mod.null,REML=TRUE)) print(LR.stat) # Using a parametric bootstrap approach: num.reps <- 100 lrstatf <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps' for(i in 1:num.reps){ yyy <- unlist(simulate(m2.pen.mod.null)) # Simulate data under the null model bnull <- lm(yyy ~ treat, data=penicillin) # fit null model balt <- lmer(yyy ~ treat + (1|blend), data=penicillin) # fit alternative (full) model lrstatf[i] <- as.numeric(2*(logLik(balt)-logLik(bnull,REML=TRUE))) # calculate LR statistic } ## Bootstrap p-value: boot.p <- mean(lrstatf > LR.stat) print(boot.p) # Monte Carlo standard error of this estimate: sqrt(boot.p*(1-boot.p)/num.reps) ############################################################ # ## Section 8.6: Models with Nested Effects # load the 'faraway' package: library(faraway) # loading the 'eggs' dataset: data(eggs) # attaching the 'eggs' dataset: attach(eggs) # load the 'lme4' package: library(lme4) # In this analysis, 'labs' are random, # 'technicians' are nested within labs # and 'samples' are nested within technicians. # The technicians and samples are also considered random. nested.mod <- lmer(Fat ~ 1 + (1|Lab) + (1|Lab:Technician) + (1|Lab:Technician:Sample), data=eggs) summary(nested.mod) # A simpler model, removing the component for variance among samples: nested.mod.simp <- lmer(Fat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs) summary(nested.mod.simp) #Testing whether the variance due to samples is significant: anova(nested.mod, nested.mod.simp) # Is this p-value trustworthy? Doubtful. # Finding the LR statistic for our data and models: LR.stat.n <- 2*(logLik(nested.mod) - logLik(nested.mod.simp)) print(LR.stat.n) # Testing whether the variance due to samples is significant, # using the parametric bootstrap approach: num.reps <- 100 lrstatn <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps' for(i in 1:num.reps){ rFat <- unlist(simulate(nested.mod.simp)) # Simulate data under the null model nullmod <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs) # fit null model altmod <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician) + (1|Lab:Technician:Sample), data=eggs) # fit alternative (full) model lrstatn[i] <- as.numeric(2*(logLik(altmod)-logLik(nullmod))) # calculate LR statistic } ## Bootstrap p-value: boot.p <- mean(lrstatn > LR.stat.n) print(boot.p) # Monte Carlo standard error of this estimate: sqrt(boot.p*(1-boot.p)/num.reps) ############# # A still simpler model, also removing the component for variance among technicians: nested.mod.simp2 <- lmer(Fat ~ 1 + (1|Lab), data=eggs) summary(nested.mod.simp2) LR.stat.n2 <- 2*(logLik(nested.mod.simp) - logLik(nested.mod.simp2)) print(LR.stat.n2) # Now testing whether the variance due to technicians is significant, # using the parametric bootstrap approach: num.reps <- 100 lrstatt <- numeric(num.reps) # create a dummy vector (of zeroes) of length 'num.reps' for(i in 1:num.reps){ rFat <- unlist(simulate(nested.mod.simp2)) # Simulate data under the null model nullmod2 <- lmer(rFat ~ 1 + (1|Lab), data=eggs) # fit null model altmod2 <- lmer(rFat ~ 1 + (1|Lab) + (1|Lab:Technician), data=eggs) # fit alternative (full) model lrstatt[i] <- as.numeric(2*(logLik(altmod2)-logLik(nullmod2))) # calculate LR statistic } ## Bootstrap p-value: boot.p <- mean(lrstatt > LR.stat.n2) print(boot.p) # Monte Carlo standard error of this estimate: sqrt(boot.p*(1-boot.p)/num.reps) ############################################################ # ## Section 8.8: Multilevel Models # load the 'faraway' package: library(faraway) # loading the 'jsp' dataset: data(jsp) # attaching the 'jsp' dataset: attach(jsp) # Only using the math scores for the THIRD year (the third year is coded year=2): jspr <- jsp[jsp$year==2,] # Some descriptive plots plot(jitter(math)~jitter(raven), data=jspr, xlab='Raven score', ylab='Math score') x11() # opens new plotting window boxplot(math~social, data=jspr, xlab='Social class', ylab='Math score') ### ## A multilevel model will reflect the fact that students within the same school ## (and perhaps those within the same class) are likely to be dependent. # Centering the 'raven' variable to aid later interpretation: jspr$craven <- jspr$raven - mean(jspr$raven) # load the 'lme4' package: library(lme4) ## The multilevel level also allows us to include random effects that capture ## variation among schools and variation among classes within schools. mmod.math <- lmer(math ~ craven*social*gender + (1|school) + (1|school:class), data=jspr) # Note the 'craven*social*gender' code is shorthand to tell R to include # all main effects AND all interaction effects involving these three predictors. # Examining the fixed-effect tests: anova(mmod.math) ## Removing gender from the model: mmod.math2 <- lmer(math ~ craven*social + (1|school) + (1|school:class), data=jspr) # Comparing models via AIC: AIC(mmod.math, mmod.math2) # Examining the fixed-effect tests: anova(mmod.math2) summary(mmod.math2) ####### ### Some basic diagnostics: qqnorm(resid(mmod.math2)) plot(fitted(mmod.math2), resid(mmod.math2), xlab='Fitted', ylab='Residuals') # Should some transformation of the response variable 'math' be considered? # We have enough data here that it makes sense to check the normality of the random effects: # Random effects for school: qqnorm(ranef(mmod.math2)$school[[1]], main='School effects') # Random effects for class nested within school: qqnorm(ranef(mmod.math2)$"school:class"[[1]], main='Class effects') ### Adjusted mean math scores by school: # printing the school effects: adjscores <- ranef(mmod.math2)$school[[1]] rawscores <- coef(lm(math~school-1, jspr)) # The '-1' tells R not to include an intercept rawscores <- rawscores - mean(rawscores) # centering by subtracting the mean # comparing adjusted means to (centered) raw means: print(cbind(rawscores, adjscores, rank(rawscores), rank(adjscores) )) # scatter plot of adjusted scores vs. raw scores plot(rawscores, adjscores, type='n') # sets up blank plot sint <- c(9,14,29) # picking some schools of interest text(rawscores, adjscores, labels=(1:50)[-10], cex=0.75) points(rawscores[sint], adjscores[sint], cex=2.5, col='red') ### ### Including a compositional effect: ### # Finding predicted raven score for each observation # based ONLY on the school of the student: schraven <- lm(raven ~ school, data=jspr)$fit mmod.math.c <- lmer(math ~ craven*social + schraven*social + (1|school) + (1|school:class), data=jspr) anova(mmod.math.c) # Comparing models via AIC: summary(mmod.math.c) # Compare to the mmod.math2 above which had an AIC of 5961 ... # Does not appear that we need the compositional effect in the model.