# We can read the body measurements data from the Internet:

bodym <- read.table("http://www.stat.sc.edu/~hitchcock/bodymeasgrps.txt", header=T)

# If not connected to the Internet, we could save the file and read it 
# similarly to the following:

# bodym<-read.table("Z:/stat_530/bodymeasgrps.txt", header=T)

attach(bodym)

bodym.noIDs <- bodym[,-1]  # removing the ID column before doing the analysis

########################################
############################################
###
###  R Code to do Hotelling's T^2 test the long way:
###

# number of observations for the 'young' group and the 'old' group:
l1 <- length(agegp[agegp=='young'])
l2 <- length(agegp[agegp=='old'])

# Splitting the data matrix (while removing the 9th column, agegp)
# into two subsets, one for old and another for young:
x1 <- bodym.noIDs[agegp=='young', -9]
x2 <- bodym.noIDs[agegp=='old', -9]
my.q <- ncol(x1)

# Sample mean vectors for the 'young' group and the 'old' group:
m1 <- apply(x1, 2, mean)
m2 <- apply(x2, 2, mean)

# "pooled" sample covariance matrix:
S123 <- ((l1-1)*var(x1)+(l2-1)*var(x2))/(l1+l2-2)

# Hotelling T^2, the F-statistic, and the P-value:
T2 <- ((l1*l2)/(l1+l2))* (t(m1-m2) %*% solve(S123) %*% (m1-m2) )  # code on p. 140 of book is wrong here!
Fstat <- ((l1+l2-my.q-1)*T2)/((l1+l2-2)*my.q)
pvalue <- 1-pf(Fstat, my.q, l1+l2-my.q-1)

print(paste("Hotelling T^2 =", round(T2,4), 
"F=", round(Fstat,4), "P-value =", round(pvalue,4) ))
###
###
############################################
############################################

# A simpler approach (in terms of code):

summary(manova(cbind(weight, height, neck, chest,  hip, thigh, biceps, wrist) ~ agegp),test="Hotelling")

# Checking model assumption of normality:

###############
###
chisplot <- function(x) {
    if (!is.matrix(x)) stop("x is not a matrix")
    ### determine dimensions
    n <- nrow(x)
    p <- ncol(x)
    xbar <- apply(x, 2, mean)
    S <- var(x)
    S <- solve(S)
    index <- (1:n)/(n+1)
    xcent <- t(t(x) - xbar)
    di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S)
    quant <- qchisq(index,p)
    plot(quant, sort(di), ylab = "Ordered distances",
         xlab = "Chi-square quantile", lwd=2,pch=1)
}
###
###############

chisplot(as.matrix(x1))
chisplot(as.matrix(x2))

# No strong evidence against normality for either sample.

# Examining the sample covariance matrices for each group:

by(bodym.noIDs[,-9],agegp, var)

# Is there evidence that the covariance matrices are significantly different across the 2 age groups?

### An R function to test the equality of several covariance matrices:

# Need to install the 'asbio' package from the Internet:

library(asbio)  # load the 'asbio' package once it is installed

# The Kullback test is somewhat conservative and may lack power to detect differences in the covariance matrices.
Kullback(bodym.noIDs[,-9],agegp)

### Another R function to test the equality of several covariance matrices:

# Need to install the 'biotools' package from the Internet:

library(biotools)  # load the 'biotools' package once it is installed

# Box's M test is HIGHLY senstitive to the assumption of multivariate normality.
# If the data are even a little non-normal, Box's M test doesn't work well.
# Some suggest not worrying about the result unless the p-value is VERY low.

boxM(bodym.noIDs[,-9],agegp)


##################################################################
##################################################################
#
# One-sample inference about a mean vector
#
##################################################################
##################################################################

# We can read the SAT scores data from the Internet:

satmult <- read.table("http://www.stat.sc.edu/~hitchcock/SATscoresmult.txt", header=T)

# If not connected to the Internet, we could save the file and read it 
# similarly to the following:

# satmult<-read.table("Z:/stat_530/SATscoresmult.txt", header=T)

my.n <- nrow(satmult)  # number of individuals

# Sample mean vectors for the SAT scores data:
xbar.sat <- apply(satmult, 2, mean)

my.q <- length(xbar.sat)  # number of variables

# Sample covariance matrix for the SAT scores data:

S.sat <- var(satmult)

# Testing whether the mean vector is equal to (500, 500, 500)' :

mu.0 <- c(500, 500, 500)

# Hotelling T^2, the F-statistic, and the P-value:
T2 <-  my.n * t(xbar.sat - mu.0) %*% solve(S.sat) %*% (xbar.sat - mu.0)
Fstat <- ( (my.n - my.q) / ((my.n-1)*my.q) ) * T2
pvalue <- 1-pf(Fstat, my.q, my.n-my.q)

print(paste("Hotelling T^2 =", round(T2,4), 
"F=", round(Fstat,4), "P-value =", round(pvalue,4) ))




## Plotting a 2-D confidence ellipse for the mean vector for the math and reading scores:

my.q <- 2  # since now we are just focusing on the math and reading variables

my.x1 <- runif(n=50000, min=min(satmult[,1]), max=max(satmult[,1]) )
my.x2 <- runif(n=50000, min=min(satmult[,2]), max=max(satmult[,2]) )
my.xs <- cbind(my.x1,my.x2)

my.pts<-rep(0,times=length(my.x1))

for (i in 1:(length(my.pts)) ) {
my.pts[i] <- ( (my.n - my.q) / ((my.n-1)*my.q) ) * my.n * t(xbar.sat[c(1,2)] - my.xs[i,]) %*% solve(S.sat[c(1,2),c(1,2)]) %*% (xbar.sat[c(1,2)] - my.xs[i,])
}

my.x1.inside <- my.x1[my.pts <= qf(0.95, my.q, my.n-my.q)]
my.x2.inside <- my.x2[my.pts <= qf(0.95, my.q, my.n-my.q)]
plot(my.x1.inside, my.x2.inside, xlab="Math", ylab="Reading")
conv.hull <- chull(my.x1.inside, my.x2.inside)
polygon(my.x1.inside[conv.hull], my.x2.inside[conv.hull])

# Just plotting the ellipse:

plot(my.x1.inside, my.x2.inside, xlab="Math", ylab="Reading", type='n')
conv.hull <- chull(my.x1.inside, my.x2.inside)
polygon(my.x1.inside[conv.hull], my.x2.inside[conv.hull])


# Showing individual t-based CIs for each of the two variables:

t.test(satmult[,1])$conf.int  # 95% CI for mean math score

t.test(satmult[,2])$conf.int  # 95% CI for mean reading score


abline(v=t.test(satmult[,1])$conf.int[1], lty=2)  # Lower conf. limit of 95% CI, math
abline(v=t.test(satmult[,1])$conf.int[2], lty=2)  # Upper conf. limit of 95% CI, math

abline(h=t.test(satmult[,2])$conf.int[1], lty=2)  # Lower conf. limit of 95% CI, reading
abline(h=t.test(satmult[,2])$conf.int[2], lty=2)  # Upper conf. limit of 95% CI, reading



#####################################################################
#####################################################################
#  
#  Multivariate Analysis of Variance (MANOVA)
#
#####################################################################
#####################################################################


# Consider the built-in "iris" data set in R:

# Attaching the data frame:

attach(iris)

iris.manova <- manova(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species)

# Doing an exploratory examination of the sample mean vectors for each group:

# Examining the sample mean vectors for each group:

by(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width), Species, colMeans)

# Doing the MANOVA test using Wilks' Lambda:

summary(iris.manova, test="Wilks")

# Using the other test statistics:

summary(iris.manova, test="Roy")
summary(iris.manova, test="Hotelling-Lawley")
summary(iris.manova, test="Pillai")

# We have strong evidence that the mean vectors differ across the 3 species.

# Checking model assumption of normality:

###############
###
chisplot <- function(x) {
    if (!is.matrix(x)) stop("x is not a matrix")
    ### determine dimensions
    n <- nrow(x)
    p <- ncol(x)
    xbar <- apply(x, 2, mean)
    S <- var(x)
    S <- solve(S)
    index <- (1:n)/(n+1)
    xcent <- t(t(x) - xbar)
    di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S)
    quant <- qchisq(index,p)
    plot(quant, sort(di), ylab = "Ordered distances",
         xlab = "Chi-square quantile", lwd=2,pch=1)
}
###
###############

chisplot(residuals(iris.manova))

# No strong evidence against normality -- we are safe.

# Examining the sample covariance matrices for each group:

by(iris[,-5], Species, var)

# Is there evidence that the covariance matrices are significantly different across the 3 species?

### An R function to test the equality of several covariance matrices:

# Need to install the 'asbio' package from the Internet:

library(asbio)  # load the 'asbio' package once it is installed

Kullback(Y=iris[,-5],X=Species)

### Another R function to test the equality of several covariance matrices:

# Need to install the 'biotools' package from the Internet:

library(biotools)  # load the 'biotools' package once it is installed

boxM(iris[,-5],Species)

# The covariance matrices may not be equal across groups...

##### Differences in Means for doing Bonferroni multiple comparisons:

means.by.grps <- cbind(tapply(Sepal.Length,Species,mean), tapply(Sepal.Width,Species,mean), tapply(Petal.Length,Species,mean), tapply(Petal.Width,Species,mean) )

## The matrix E:

my.n <- nrow(iris[,-5])

E <- (my.n - 1)*var(residuals(iris.manova))
E

# Calculation of the Bonferroni CIs mentioned in the notes:

my.alpha <- 0.10 # family confidence level for CIs is 90% here

my.m <- 3  # number of groups
my.q <- 4  # number of variables
group.sample.sizes <- c(50,50,50)

for (k in 1:my.q) {

pair.mean.diffs <- cbind( t(combn(my.m,2)),combn(tapply(iris[,k],Species,mean),2,FUN=diff) )

t.val <- qt(my.alpha/(my.q*my.m*(my.m-1)), df=my.n-my.m, lower=F)

#print(paste("For i=",i,"i.prime=",j,"and k=",k,"pt est is"

CI.L <- pair.mean.diffs[,3] - t.val*sqrt((diag(E)[k]/(my.n-my.m))*(1/group.sample.sizes[pair.mean.diffs[,1]] + 1/group.sample.sizes[pair.mean.diffs[,2]]) )
CI.U <- pair.mean.diffs[,3] + t.val*sqrt((diag(E)[k]/(my.n-my.m))*(1/group.sample.sizes[pair.mean.diffs[,1]] + 1/group.sample.sizes[pair.mean.diffs[,2]]) )

my.table.mat<-cbind(pair.mean.diffs, round(CI.L,3), round(CI.U,3), rep(k,times=nrow(pair.mean.diffs)) )
my.table<-as.data.frame(my.table.mat)
names(my.table)=c('grp1','grp2','diff.samp.means','lower.CI','upper.CI','variable'); print(my.table)
}