# We can read the body measurements data from the Internet: bodym <- read.table("http://www.stat.sc.edu/~hitchcock/bodymeasgrps.txt", header=T) # If not connected to the Internet, we could save the file and read it # similarly to the following: # bodym<-read.table("Z:/stat_530/bodymeasgrps.txt", header=T) attach(bodym) bodym.noIDs <- bodym[,-1] # removing the ID column before doing the analysis ######################################## ############################################ ### ### R Code to do Hotelling's T^2 test the long way: ### # number of observations for the 'young' group and the 'old' group: l1 <- length(agegp[agegp=='young']) l2 <- length(agegp[agegp=='old']) # Splitting the data matrix (while removing the 9th column, agegp) # into two subsets, one for old and another for young: x1 <- bodym.noIDs[agegp=='young', -9] x2 <- bodym.noIDs[agegp=='old', -9] my.q <- ncol(x1) # Sample mean vectors for the 'young' group and the 'old' group: m1 <- apply(x1, 2, mean) m2 <- apply(x2, 2, mean) # "pooled" sample covariance matrix: S123 <- ((l1-1)*var(x1)+(l2-1)*var(x2))/(l1+l2-2) # Hotelling T^2, the F-statistic, and the P-value: T2 <- ((l1*l2)/(l1+l2))* (t(m1-m2) %*% solve(S123) %*% (m1-m2) ) # code on p. 140 of book is wrong here! Fstat <- ((l1+l2-my.q-1)*T2)/((l1+l2-2)*my.q) pvalue <- 1-pf(Fstat, my.q, l1+l2-my.q-1) print(paste("Hotelling T^2 =", round(T2,4), "F=", round(Fstat,4), "P-value =", round(pvalue,4) )) ### ### ############################################ ############################################ # A simpler approach (in terms of code): summary(manova(cbind(weight, height, neck, chest, hip, thigh, biceps, wrist) ~ agegp),test="Hotelling") # Checking model assumption of normality: ############### ### chisplot <- function(x) { if (!is.matrix(x)) stop("x is not a matrix") ### determine dimensions n <- nrow(x) p <- ncol(x) xbar <- apply(x, 2, mean) S <- var(x) S <- solve(S) index <- (1:n)/(n+1) xcent <- t(t(x) - xbar) di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S) quant <- qchisq(index,p) plot(quant, sort(di), ylab = "Ordered distances", xlab = "Chi-square quantile", lwd=2,pch=1) } ### ############### chisplot(as.matrix(x1)) chisplot(as.matrix(x2)) # No strong evidence against normality for either sample. # Examining the sample covariance matrices for each group: by(bodym.noIDs[,-9],agegp, var) # Is there evidence that the covariance matrices are significantly different across the 2 age groups? ### An R function to test the equality of several covariance matrices: # Need to install the 'asbio' package from the Internet: library(asbio) # load the 'asbio' package once it is installed # The Kullback test is somewhat conservative and may lack power to detect differences in the covariance matrices. Kullback(bodym.noIDs[,-9],agegp) ### Another R function to test the equality of several covariance matrices: # Need to install the 'biotools' package from the Internet: library(biotools) # load the 'biotools' package once it is installed # Box's M test is HIGHLY senstitive to the assumption of multivariate normality. # If the data are even a little non-normal, Box's M test doesn't work well. # Some suggest not worrying about the result unless the p-value is VERY low. boxM(bodym.noIDs[,-9],agegp) ################################################################## ################################################################## # # One-sample inference about a mean vector # ################################################################## ################################################################## # We can read the SAT scores data from the Internet: satmult <- read.table("http://www.stat.sc.edu/~hitchcock/SATscoresmult.txt", header=T) # If not connected to the Internet, we could save the file and read it # similarly to the following: # satmult<-read.table("Z:/stat_530/SATscoresmult.txt", header=T) my.n <- nrow(satmult) # number of individuals # Sample mean vectors for the SAT scores data: xbar.sat <- apply(satmult, 2, mean) my.q <- length(xbar.sat) # number of variables # Sample covariance matrix for the SAT scores data: S.sat <- var(satmult) # Testing whether the mean vector is equal to (500, 500, 500)' : mu.0 <- c(500, 500, 500) # Hotelling T^2, the F-statistic, and the P-value: T2 <- my.n * t(xbar.sat - mu.0) %*% solve(S.sat) %*% (xbar.sat - mu.0) Fstat <- ( (my.n - my.q) / ((my.n-1)*my.q) ) * T2 pvalue <- 1-pf(Fstat, my.q, my.n-my.q) print(paste("Hotelling T^2 =", round(T2,4), "F=", round(Fstat,4), "P-value =", round(pvalue,4) )) ## Plotting a 2-D confidence ellipse for the mean vector for the math and reading scores: my.q <- 2 # since now we are just focusing on the math and reading variables my.x1 <- runif(n=50000, min=min(satmult[,1]), max=max(satmult[,1]) ) my.x2 <- runif(n=50000, min=min(satmult[,2]), max=max(satmult[,2]) ) my.xs <- cbind(my.x1,my.x2) my.pts<-rep(0,times=length(my.x1)) for (i in 1:(length(my.pts)) ) { my.pts[i] <- ( (my.n - my.q) / ((my.n-1)*my.q) ) * my.n * t(xbar.sat[c(1,2)] - my.xs[i,]) %*% solve(S.sat[c(1,2),c(1,2)]) %*% (xbar.sat[c(1,2)] - my.xs[i,]) } my.x1.inside <- my.x1[my.pts <= qf(0.95, my.q, my.n-my.q)] my.x2.inside <- my.x2[my.pts <= qf(0.95, my.q, my.n-my.q)] plot(my.x1.inside, my.x2.inside, xlab="Math", ylab="Reading") conv.hull <- chull(my.x1.inside, my.x2.inside) polygon(my.x1.inside[conv.hull], my.x2.inside[conv.hull]) # Just plotting the ellipse: plot(my.x1.inside, my.x2.inside, xlab="Math", ylab="Reading", type='n') conv.hull <- chull(my.x1.inside, my.x2.inside) polygon(my.x1.inside[conv.hull], my.x2.inside[conv.hull]) # Showing individual t-based CIs for each of the two variables: t.test(satmult[,1])$conf.int # 95% CI for mean math score t.test(satmult[,2])$conf.int # 95% CI for mean reading score abline(v=t.test(satmult[,1])$conf.int[1], lty=2) # Lower conf. limit of 95% CI, math abline(v=t.test(satmult[,1])$conf.int[2], lty=2) # Upper conf. limit of 95% CI, math abline(h=t.test(satmult[,2])$conf.int[1], lty=2) # Lower conf. limit of 95% CI, reading abline(h=t.test(satmult[,2])$conf.int[2], lty=2) # Upper conf. limit of 95% CI, reading ##################################################################### ##################################################################### # # Multivariate Analysis of Variance (MANOVA) # ##################################################################### ##################################################################### # Consider the built-in "iris" data set in R: # Attaching the data frame: attach(iris) iris.manova <- manova(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species) # Doing an exploratory examination of the sample mean vectors for each group: # Examining the sample mean vectors for each group: by(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width), Species, colMeans) # Doing the MANOVA test using Wilks' Lambda: summary(iris.manova, test="Wilks") # Using the other test statistics: summary(iris.manova, test="Roy") summary(iris.manova, test="Hotelling-Lawley") summary(iris.manova, test="Pillai") # We have strong evidence that the mean vectors differ across the 3 species. # Checking model assumption of normality: ############### ### chisplot <- function(x) { if (!is.matrix(x)) stop("x is not a matrix") ### determine dimensions n <- nrow(x) p <- ncol(x) xbar <- apply(x, 2, mean) S <- var(x) S <- solve(S) index <- (1:n)/(n+1) xcent <- t(t(x) - xbar) di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S) quant <- qchisq(index,p) plot(quant, sort(di), ylab = "Ordered distances", xlab = "Chi-square quantile", lwd=2,pch=1) } ### ############### chisplot(residuals(iris.manova)) # No strong evidence against normality -- we are safe. # Examining the sample covariance matrices for each group: by(iris[,-5], Species, var) # Is there evidence that the covariance matrices are significantly different across the 3 species? ### An R function to test the equality of several covariance matrices: # Need to install the 'asbio' package from the Internet: library(asbio) # load the 'asbio' package once it is installed Kullback(Y=iris[,-5],X=Species) ### Another R function to test the equality of several covariance matrices: # Need to install the 'biotools' package from the Internet: library(biotools) # load the 'biotools' package once it is installed boxM(iris[,-5],Species) # The covariance matrices may not be equal across groups... ##### Differences in Means for doing Bonferroni multiple comparisons: means.by.grps <- cbind(tapply(Sepal.Length,Species,mean), tapply(Sepal.Width,Species,mean), tapply(Petal.Length,Species,mean), tapply(Petal.Width,Species,mean) ) ## The matrix E: my.n <- nrow(iris[,-5]) E <- (my.n - 1)*var(residuals(iris.manova)) E # Calculation of the Bonferroni CIs mentioned in the notes: my.alpha <- 0.10 # family confidence level for CIs is 90% here my.m <- 3 # number of groups my.q <- 4 # number of variables group.sample.sizes <- c(50,50,50) for (k in 1:my.q) { pair.mean.diffs <- cbind( t(combn(my.m,2)),combn(tapply(iris[,k],Species,mean),2,FUN=diff) ) t.val <- qt(my.alpha/(my.q*my.m*(my.m-1)), df=my.n-my.m, lower=F) #print(paste("For i=",i,"i.prime=",j,"and k=",k,"pt est is" CI.L <- pair.mean.diffs[,3] - t.val*sqrt((diag(E)[k]/(my.n-my.m))*(1/group.sample.sizes[pair.mean.diffs[,1]] + 1/group.sample.sizes[pair.mean.diffs[,2]]) ) CI.U <- pair.mean.diffs[,3] + t.val*sqrt((diag(E)[k]/(my.n-my.m))*(1/group.sample.sizes[pair.mean.diffs[,1]] + 1/group.sample.sizes[pair.mean.diffs[,2]]) ) my.table.mat<-cbind(pair.mean.diffs, round(CI.L,3), round(CI.U,3), rep(k,times=nrow(pair.mean.diffs)) ) my.table<-as.data.frame(my.table.mat) names(my.table)=c('grp1','grp2','diff.samp.means','lower.CI','upper.CI','variable'); print(my.table) }