# This example shows the analyses for the one-way ANOVA  
# using the rice data example we looked at in class  

# Entering the data and defining the variables:  

##########
##
# Reading the data into R:

my.datafile <- tempfile()
cat(file=my.datafile, "
                                  1       934
                                  1      1041
                                  1      1028
                                  1       935
                                  2       880
                                  2       963
                                  2       924
                                  2       946
                                  3       987
                                  3       951
                                  3       976
                                  3       840
                                  4       992
                                  4      1143
                                  4      1140
                                  4      1191
", sep=" ")

options(scipen=999) # suppressing scientific notation

rice <- read.table(my.datafile, header=FALSE, col.names=c("variety", "yield")) 
  
# Note we could also save the data columns into a file and use a command such as:
# rice <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("variety", "yield"))

attach(rice)

# The data frame called rice is now created, 
# with two variables, variety and yield.
##
#########

############################################################################* 

# lm() and anova() will do a standard analysis of variance                        
# We specify that variety is a (qualitative) factor with the factor() function: 
                                    
# Making "variety" a factor:

variety <- factor(variety)

# The lm statement specifies that yield is the response                  
# and variety is the factor 
# The ANOVA table is produced by the anova() function         

rice.fit <- lm(yield ~ variety);
anova(rice.fit)

############################################################################* 

# The following code produces some residual plots  
# and Levene's test for unequal variances 

## A short function to perform the Levene test 
## for equal variances across the populations:

Levene.test <- function (y, group) 
{
    group <- as.factor(group) # precautionary
    means <- tapply(y, group, mean, na.rm = TRUE)
    resp <- abs(y - means[group])
    anova(lm(resp ~ group))[, c(1, 4, 5)]
} 

# Implementing the function for our data:

Levene.test(yield, variety)

# printing the sample mean yields for each variety level:

fitted.values(rice.fit)

# plotting residuals versus fitted values:

plot(fitted.values(rice.fit), residuals(rice.fit), xlab="Fitted Values", ylab="Residuals"); abline(h=0)

# A normal Q-Q plot of the residuals:

qqnorm(residuals(rice.fit))

# Note that according to Levene's test (P-value = 0.4654) we would   
# FAIL TO REJECT the null hypothesis that all variances are equal.   
# So the equal-variance assumption seems reasonable for these data.  

# Notice there is some evidence of nonnormality based on the Q-Q plot 


############################################################################* 

#  Estimating and testing contrasts  

# The following code estimates the contrasts in the example from class              

# We also use a t-test to test whether a contrast is zero  

# Getting the sample means for each group:

samp.means <- tapply(yield, variety, mean, na.rm = TRUE)
print(samp.means)

### variety 4 vs. Others:

# Defining the correct coefficients for the contrast:

contrast.coefficients <- c(1/3, 1/3, 1/3, -1)

# Just need to specify name of fit object (like "rice.fit" here)
# and name of factor (like "variety") in code below:

L.hat <- sum(contrast.coefficients * samp.means)
se.L.hat <- sqrt( sum(anova(rice.fit)["Residuals","Mean Sq"]*((contrast.coefficients^2)/table(variety)) ) )
t.star <- L.hat/se.L.hat; 
two.sided.P.val <- round(2*pt(abs(t.star), df=anova(rice.fit)["Residuals","Df"], lower.tail=F),4)

print("variety 4 vs. Others:")
data.frame(L.hat, se.L.hat, t.star, two.sided.P.val)


#### A shorter way to test about this contrast:

# Comparing Variety 4 vs. Others:  
#  
contrasts(variety) <- matrix(c(1/3, 1/3, 1/3, -1), nrow=4, ncol=1)

summary(lm(yield ~ variety))$coef["variety1",]

# Look on the 'variety1' line (selected via the above code) 
# for P-value of t-test about this contrast.

# I prefer the longer way, since it is more instructive about what is going on.
# The long way also gives the correct L-hat point estimate.
##
#####

### variety 1 vs. variety 2:

contrast.coefficients <- c(1, -1, 0, 0)

L.hat <- sum(contrast.coefficients * samp.means)
se.L.hat <- sqrt( sum(anova(rice.fit)["Residuals","Mean Sq"]*((contrast.coefficients^2)/table(variety)) ) )
t.star <- L.hat/se.L.hat; 
two.sided.P.val <- round(2*pt(abs(t.star), df=anova(rice.fit)["Residuals","Df"], lower.tail=F),4)

print("variety 1 vs. variety 2:")
data.frame(L.hat, se.L.hat, t.star, two.sided.P.val)

#### A shorter way to test about this contrast:

# Comparing variety 1 vs. variety 2:  
#  
contrasts(variety) <- matrix(c(1, -1, 0, 0), nrow=4, ncol=1)

summary(lm(yield ~ variety))$coef["variety1",]

# Look on the 'variety1' line (selected via the above code) 
# for P-value of t-test about this contrast.

# I prefer the longer way, since it is more instructive about what is going on.
# The long way also gives the correct L-hat point estimate.
##
#####

### When doing simultaneous tests about multiple contrasts:

exper.error.rate <- 0.05
number.of.tests <- 2

adjusted.cutoff <- qt(1 - exper.error.rate/(2*number.of.tests),df=anova(rice.fit)["Residuals","Df"])
print(paste("If |t*| is greater than ", round(adjusted.cutoff,4), "then reject H0 for that test."))

############################################################################* 

# Post Hoc Multiple Comparisons in R  

# Fisher LSD procedure:

#Using an alpha of 0.05:

alpha <- 0.05

# Note from the ANOVA table there are 12 error d.f. in this problem:
# and n = 4 for this problem (4 observations per level):

error.df <- anova(rice.fit)["Residuals","Df"]
n <- 4
MSW <- summary(rice.fit)$sigma^2

least.signif.differ <- qt(1 - alpha/2, error.df) * sqrt(2*MSW/n)

cbind(TukeyHSD(aov(rice.fit))$variety[,1], least.signif.differ)

# Using alpha=0.05 for the comparisonwise error rate, any pair of means 
# whose absolute difference (from first column; take absolute values) is 
# GREATER THAN the Least Significant Difference (second column) are 
# judged to be significantly different by Fisher.   

# Tukey procedure:

# The TukeyHSD function does the Tukey multiple comparison procedure in R. 
# The last column gives the ADJUSTED P-values.  Using alpha=0.05 for the 
# experimentwise error rate, any pair of means with P-value LESS THAN 0.05
# in that column are judged to be significantly different by Tukey.                               

TukeyHSD(aov(rice.fit),conf.level=0.95)

# conf.level=0.95 is actually the default level.  We could choose  
# another level if desired.                                   

# Notice the results for the Fisher LSD and Tukey procedures.  According to    
# Fisher, the mean for variety 4 is significantly different from the means of  
# each other variety.  Tukey gives similar results, but Tukey's method does    
# NOT find a significant difference between varieties 1 and 4.                 
# Recall:  Tukey is more conservative (less likely to reject H_0).  Tukey      
# offers more protection against Type I errors, but less power.                
                            

############################################################################* 

# Simultaneous 95% Confidence Intervals for All Pairwise Differences  
# Uses the Tukey method.                                            

TukeyHSD(aov(rice.fit),conf.level=0.95)

############################################################################*