# This example shows the analyses for the one-way ANOVA # using the rice data example we looked at in class # Entering the data and defining the variables: ########## ## # Reading the data into R: my.datafile <- tempfile() cat(file=my.datafile, " 1 934 1 1041 1 1028 1 935 2 880 2 963 2 924 2 946 3 987 3 951 3 976 3 840 4 992 4 1143 4 1140 4 1191 ", sep=" ") options(scipen=999) # suppressing scientific notation rice <- read.table(my.datafile, header=FALSE, col.names=c("variety", "yield")) # Note we could also save the data columns into a file and use a command such as: # rice <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("variety", "yield")) attach(rice) # The data frame called rice is now created, # with two variables, variety and yield. ## ######### ############################################################################* # lm() and anova() will do a standard analysis of variance # We specify that variety is a (qualitative) factor with the factor() function: # Making "variety" a factor: variety <- factor(variety) # The lm statement specifies that yield is the response # and variety is the factor # The ANOVA table is produced by the anova() function rice.fit <- lm(yield ~ variety); anova(rice.fit) ############################################################################* # The following code produces some residual plots # and Levene's test for unequal variances ## A short function to perform the Levene test ## for equal variances across the populations: Levene.test <- function (y, group) { group <- as.factor(group) # precautionary means <- tapply(y, group, mean, na.rm = TRUE) resp <- abs(y - means[group]) anova(lm(resp ~ group))[, c(1, 4, 5)] } # Implementing the function for our data: Levene.test(yield, variety) # printing the sample mean yields for each variety level: fitted.values(rice.fit) # plotting residuals versus fitted values: plot(fitted.values(rice.fit), residuals(rice.fit), xlab="Fitted Values", ylab="Residuals"); abline(h=0) # A normal Q-Q plot of the residuals: qqnorm(residuals(rice.fit)) # Note that according to Levene's test (P-value = 0.4654) we would # FAIL TO REJECT the null hypothesis that all variances are equal. # So the equal-variance assumption seems reasonable for these data. # Notice there is some evidence of nonnormality based on the Q-Q plot ############################################################################* # Estimating and testing contrasts # The following code estimates the contrasts in the example from class # We also use a t-test to test whether a contrast is zero # Getting the sample means for each group: samp.means <- tapply(yield, variety, mean, na.rm = TRUE) print(samp.means) ### variety 4 vs. Others: # Defining the correct coefficients for the contrast: contrast.coefficients <- c(1/3, 1/3, 1/3, -1) # Just need to specify name of fit object (like "rice.fit" here) # and name of factor (like "variety") in code below: L.hat <- sum(contrast.coefficients * samp.means) se.L.hat <- sqrt( sum(anova(rice.fit)["Residuals","Mean Sq"]*((contrast.coefficients^2)/table(variety)) ) ) t.star <- L.hat/se.L.hat; two.sided.P.val <- round(2*pt(abs(t.star), df=anova(rice.fit)["Residuals","Df"], lower.tail=F),4) print("variety 4 vs. Others:") data.frame(L.hat, se.L.hat, t.star, two.sided.P.val) #### A shorter way to test about this contrast: # Comparing Variety 4 vs. Others: # contrasts(variety) <- matrix(c(1/3, 1/3, 1/3, -1), nrow=4, ncol=1) summary(lm(yield ~ variety))$coef["variety1",] # Look on the 'variety1' line (selected via the above code) # for P-value of t-test about this contrast. # I prefer the longer way, since it is more instructive about what is going on. # The long way also gives the correct L-hat point estimate. ## ##### ### variety 1 vs. variety 2: contrast.coefficients <- c(1, -1, 0, 0) L.hat <- sum(contrast.coefficients * samp.means) se.L.hat <- sqrt( sum(anova(rice.fit)["Residuals","Mean Sq"]*((contrast.coefficients^2)/table(variety)) ) ) t.star <- L.hat/se.L.hat; two.sided.P.val <- round(2*pt(abs(t.star), df=anova(rice.fit)["Residuals","Df"], lower.tail=F),4) print("variety 1 vs. variety 2:") data.frame(L.hat, se.L.hat, t.star, two.sided.P.val) #### A shorter way to test about this contrast: # Comparing variety 1 vs. variety 2: # contrasts(variety) <- matrix(c(1, -1, 0, 0), nrow=4, ncol=1) summary(lm(yield ~ variety))$coef["variety1",] # Look on the 'variety1' line (selected via the above code) # for P-value of t-test about this contrast. # I prefer the longer way, since it is more instructive about what is going on. # The long way also gives the correct L-hat point estimate. ## ##### ### When doing simultaneous tests about multiple contrasts: exper.error.rate <- 0.05 number.of.tests <- 2 adjusted.cutoff <- qt(1 - exper.error.rate/(2*number.of.tests),df=anova(rice.fit)["Residuals","Df"]) print(paste("If |t*| is greater than ", round(adjusted.cutoff,4), "then reject H0 for that test.")) ############################################################################* # Post Hoc Multiple Comparisons in R # Fisher LSD procedure: #Using an alpha of 0.05: alpha <- 0.05 # Note from the ANOVA table there are 12 error d.f. in this problem: # and n = 4 for this problem (4 observations per level): error.df <- anova(rice.fit)["Residuals","Df"] n <- 4 MSW <- summary(rice.fit)$sigma^2 least.signif.differ <- qt(1 - alpha/2, error.df) * sqrt(2*MSW/n) cbind(TukeyHSD(aov(rice.fit))$variety[,1], least.signif.differ) # Using alpha=0.05 for the comparisonwise error rate, any pair of means # whose absolute difference (from first column; take absolute values) is # GREATER THAN the Least Significant Difference (second column) are # judged to be significantly different by Fisher. # Tukey procedure: # The TukeyHSD function does the Tukey multiple comparison procedure in R. # The last column gives the ADJUSTED P-values. Using alpha=0.05 for the # experimentwise error rate, any pair of means with P-value LESS THAN 0.05 # in that column are judged to be significantly different by Tukey. TukeyHSD(aov(rice.fit),conf.level=0.95) # conf.level=0.95 is actually the default level. We could choose # another level if desired. # Notice the results for the Fisher LSD and Tukey procedures. According to # Fisher, the mean for variety 4 is significantly different from the means of # each other variety. Tukey gives similar results, but Tukey's method does # NOT find a significant difference between varieties 1 and 4. # Recall: Tukey is more conservative (less likely to reject H_0). Tukey # offers more protection against Type I errors, but less power. ############################################################################* # Simultaneous 95% Confidence Intervals for All Pairwise Differences # Uses the Tukey method. TukeyHSD(aov(rice.fit),conf.level=0.95) ############################################################################*