# STAT_516_Lec_05.R


link <- url("https://people.stat.sc.edu/gregorkb/data/KNNLrust.txt")
rust <- read.csv(link,col.names=c("score","brand","rep"),sep = "", header = FALSE)
head(rust)

boxplot(score ~ brand, data = rust)

aggregate(score ~ brand, mean,data = rust)
aggregate(score ~ brand, sd,data = rust)
aggregate(score ~ brand, length ,data = rust)

lm_out <- lm(score ~ as.factor(brand), data = rust)
summary(lm_out)

Y <- rust$score
Yhat <- lm_out$fitted.values
ehat <- Y - Yhat
N <- length(Y)
a <- 4
n <- 10

sigma_hat <- sqrt(sum(ehat**2)/(N-a))
sigma_hat

# build ANOVA table

SStot <- sum((Y - mean(Y))**2)
SSerror <- sum( ehat**2)
SStrt <- SStot - SSerror

MStrt <- SStrt / (a - 1)
MSerror <- SSerror / (N - a)

Ftest <- MStrt / MSerror

alpha <- 0.05

Fcrit <- qf(1 - alpha, a-1,N-a)
Fcrit

pval <- 1 - pf(Ftest,a-1, N-a)

# also get the table from 

anova(lm_out) 


# compare all pairs of means (naively -- without
# adjusting for multiple comparisons)

# for illustration just compare groups 1 and 2
Y1. <- mean(rust$score[rust$brand==1])
Y2. <- mean(rust$score[rust$brand==2])
Y3. <- mean(rust$score[rust$brand==3])
Y4. <- mean(rust$score[rust$brand==4])

sigma_hat <- sqrt(MSerror)
tval <- qt(1-alpha/2,a*(n-1))

lo <- Y2. - Y1. - tval*sigma_hat *sqrt(2/n)
up <- Y2. - Y1. + tval*sigma_hat *sqrt(2/n)
c(lo,up)

# compare group 4 to group 1 naively...
lo <- Y4. - Y1. - tval*sigma_hat *sqrt(2/n)
up <- Y4. - Y1. + tval*sigma_hat *sqrt(2/n)
c(lo,up)

tval*sqrt(2)

# Tukey's version (get a wider interval):

lo <- Y2. - Y1. - 3.85*sigma_hat *sqrt(1/n)
up <- Y2. - Y1. + 3.85*sigma_hat *sqrt(1/n)
c(lo,up)

# get Tukey intervals automatically:


aov_out <- aov(lm_out)

TukeyHSD(aov_out) # HSD: honest significant difference

plot(TukeyHSD(aov_out))

# Suppose we are using Dunnett's to compare all brands with 
# brand 1 as the baseline:

lo <- Y2. - Y1. - 2.47*sigma_hat *sqrt(2/n)
up <- Y2. - Y1. + 2.47*sigma_hat *sqrt(2/n)
c(lo,up)

library(DescTools)

Dunnett_out <- DunnettTest(score ~ as.factor(brand), 
            data = rust, 
            control="1")
plot(Dunnett_out)

# Bonferroni (always valid) for all pairs

B <- 6 # total number pairs

tvalB <- qt(1-(alpha/B)/2,a*(n-1))

lo <- Y2. - Y1. - tvalB*sigma_hat *sqrt(2/n)
up <- Y2. - Y1. + tvalB*sigma_hat *sqrt(2/n)
c(lo,up)