# R example to analyze Nested Design 

# these are the training-school data from class and Chapter 26  

# Save the data file into a directory and 
# use the full path name:

trainingschool.data <- read.table(file = "z:/My Documents/teaching/stat_705/trainingschooldata.txt", 
header=FALSE, col.names = c('score', 'school', 'instructor', 'obs'))

# attaching the data frame:

attach(trainingschool.data)

# Making "school" and "instructor" factors:

school <- factor(school)
instructor <- factor(instructor)

# We run lm() and anova() with "school" and "instructor" as the two factors. 
# We indicate that the levels of "instructor" are nested within the levels 
# of "school" with the "school/instructor" syntax for the second factor.                      

trainingschool.fit <- lm(score ~ school + school/instructor)

anova(trainingschool.fit)

# We see that the 3 schools differ in mean score (F* = 11.18, P-value = .0095)  
# and that instructors within at least one school have different mean scores    
# (F* = 27.02, P-value = .0007).                                                

# ########################################################################*  

# To determine in WHICH schools the instructor effects are significant, we     
# further decompose the SSB(A) into SSB(A_1), SSB(A_2), and SSB(A_3).  

anova.table <- function(inputmatrix){
column.1 <- inputmatrix[,1]
column.2 <- inputmatrix[,2]
my.lm <- lm(column.1 ~ factor(column.2))
my.anova.table <- anova(my.lm)
return(my.anova.table)
}

by(cbind(score, instructor), INDICES = school, anova.table)      

# Note R gives (over 3 pieces) the following results:                               
# Atlanta school: MSB(A_1)=210.25 => F* = 210.25/7 = 30.0  (we divide this by hand)  
# Chicago school: MSB(A_2)=132.25 => F* = 132.25/7 = 18.9  (we divide this by hand)  
# San Fran school: MSB(A_3)=225.0 => F* = 225/7 = 32.14    (we divide this by hand)  
# Note that each of these is divided by the original MSE of 7.                       

# Since F(0.95,1,6) = 5.99, then: At the 0.05 level, instructors within the      
# Atlanta school have different mean scores; at the 0.05 level, instructors      
# within the Chicago school have different mean scores; and at the 0.05          
# level, instructors within the San Francisco school have different mean         
# scores.  The FAMILY significance level for this SET of tests is at most 0.15.  

# ########################################################################**  

# Some Residual Plots to Check the Standard Model Assumptions:  

# Residual Plots and Q-Q plots:  

# Residuals Plotted vs. Fitted Values: 
plot(fitted(trainingschool.fit), residuals(trainingschool.fit), xlab="Fitted Values", ylab="Residuals"); abline(h=0)
 
# Residuals Plotted for each SCHOOL Level  

plot(as.numeric(school), residuals(trainingschool.fit), xlab="School", ylab="Residuals"); abline(h=0)

# Normal Q-Q Plot of Residuals:  

qqnorm(residuals(trainingschool.fit))

# ########################################################################

# Further Investigation of Treatment Means: 

# Tukey Simultaneous 90% CIs for differences in pairs of school mean scores can by found by: 

TukeyHSD(aov(trainingschool.fit),conf.level=0.90)$school

# Getting the Bonferroni CIs at the bottom of p. 1102 is best done by hand, since the relevant 
# mean differences are only a few of the pairwise comparisons among all (3)(2) = 6 treatments.