# R example of Kruskal-Wallis test    
# (and multiple comparisons)            

# The data are the soil measurements we saw in class.  

# Entering the data:  

##########
##
# Reading the data into R:

ff <- tempfile()
cat(file=ff, "  
1 26.5
1 15.0
1 18.2
1 19.5
1 23.1
1 17.3
2 16.5
2 15.8
2 14.1 
2 30.2
2 25.1 
2 17.4
3 19.2
3 21.4
3 26.0
3 21.6
3 35.0
3 28.9
4 26.7
4 37.3
4 28.0
4 30.1
4 33.5
4 26.3
", sep=" ")

soil <- read.table(ff, header=FALSE, col.names=c("location", "claypct")) 
  
# Note we could also save the data columns into a file and use a command such as:
# soil <- read.table(file = "z:/stat_705/filename.txt", header=FALSE, col.names = c("location", "claypct"))

attach(soil)

# The data frame called soil is now created, 
# with two variables, "location", "claypct".
##
#########
####################################################################

location <- factor(location)

# boxplots for  
# each of the four groups:   

boxplot(claypct ~ location)                           

# The Shapiro-Wilk test result      
# for testing normality for each of the four groups:  
# Note that with such small sample sizes we have little power    
# to detect non-normality.                                  

tapply(claypct, location, shapiro.test)

# The kruskal.test function performs the Kruskal-Wallis test in R.      

# The response is listed first, and then the factor:

kruskal.test(claypct, location)   

# Equivalently, we could do:

kruskal.test(claypct ~ location)   

# Or if the data are listed in separate vectors:

claypct1 <- claypct[location==1]; claypct2 <- claypct[location==2]; claypct3 <- claypct[location==3]; claypct4 <- claypct[location==4]; 

kruskal.test(list(claypct1, claypct2, claypct3, claypct4))

#########################################################

# Distribution-free Bonferroni Multiple Comparisons in R 

# Getting the ranks for the data:  

soilranks <- rank(claypct)

# Printing the ranks for the data:  

cbind(location, claypct, soilranks)

# Performing the Bonferroni Multiple Comparisons:  

group.mean.ranks <- tapply(soilranks,location,mean)

# Getting the pairwise differences in sample mean ranks:

outer(group.mean.ranks,group.mean.ranks,'-')

# Determining which pairs of treatments are significantly different

# (For THIS PROBLEM, 10.77 is the Bonferroni cutoff value for EACH comparison!)

abs(outer(group.mean.ranks,group.mean.ranks,'-')) > 10.77