## Problem 1: testdata <- read.table("http://www.stat.sc.edu/~hitchcock/testscoredata.txt", header=T) attach(testdata) testdata.noIDs <- testdata[,-1] #to remove the ID numbers summary(manova(cbind(math, reading ) ~ sex) ,test="Hotelling") tapply( math,sex,mean) # boy girl #84.09833 82.96250 tapply( reading,sex,mean) # boy girl #81.78667 82.28000 # Checking assumptions: x1 <- testdata.noIDs[sex=='boy',-3] x2 <- testdata.noIDs[sex=='girl',-3] chisplot(as.matrix(x1)) chisplot(as.matrix(x2)) library(asbio) # load the 'asbio' package once it is installed Kullback(testdata.noIDs[,-3],sex) library(biotools) # load the 'biotools' package once it is installed boxM(testdata.noIDs[,-3],sex) ### PROBLEM 2: hsb <- read.table("http://www.stat.sc.edu/~hitchcock/hsbdata.txt", header=T) attach(hsb) hsb.prob4 <- hsb[,c(5,8,9,10,11,12)] hsb.numeric <- hsb[,c(8,9,10,11,12)] my.manova <- manova(cbind(read, write, math, science, socst) ~ ses) # Doing the MANOVA test using Wilks' Lambda: summary(my.manova, test="Wilks") ############### ### chisplot <- function(x) { if (!is.matrix(x)) stop("x is not a matrix") ### determine dimensions n <- nrow(x) p <- ncol(x) xbar <- apply(x, 2, mean) S <- var(x) S <- solve(S) index <- (1:n)/(n+1) xcent <- t(t(x) - xbar) di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S) quant <- qchisq(index,p) plot(quant, sort(di), ylab = "Ordered distances", xlab = "Chi-square quantile", lwd=2,pch=1) } ### ############### chisplot(residuals(my.manova)) # Examining the sample covariance matrices for each group: by(cbind(read, write, math, science, socst), ses, var) # Optionally, doing the formal tests: library(asbio) Kullback(Y=cbind(read, write, math, science, socst),X=ses) library(biotools) boxM(cbind(read, write, math, science, socst),ses) ############################################## # Examining the sample mean vectors for each group: by(cbind(read, write, math, science, socst), ses, colMeans) ### PROBLEM 3: # Fitting the multivariate linear regression model: bat2015 <- read.csv("http://www.stat.sc.edu/~hitchcock/baseball2015batting.txt", header=T) pitch2015 <- read.csv("http://www.stat.sc.edu/~hitchcock/baseball2015pitching.txt", header=T) baseball2015 <- merge(bat2015,pitch2015,by="Tm") #Response Variables: y1 <- baseball2015$RG # Runs Per Game y2 <- baseball2015$RAG #Runs Allowed Per Game y3 <- baseball2015$WLP # Team WinningPct #Predictor Variables: x1<-baseball2015$BatAge # Average age of team's batters x2<-baseball2015$PAge # Average age of team's batters x3<-baseball2015$SB # Total Stolen bases by team x4<-baseball2015$BB.y # Total bases on balls (walks) given up by team's pitchers x5<-baseball2015$SO.y # Total strikeouts earned by team's pitchers ### (a) # A quick summary: bball.mod <- lm(cbind(y1,y2,y3) ~ x1 + x2 + x3 + x4 + x5) summary(bball.mod) bball.mod.y1 <- lm(y1 ~ x1 + x2 + x3 + x4 + x5) bball.mod.y2 <- lm(y2 ~ x1 + x2 + x3 + x4 + x5) bball.mod.y3 <- lm(y3 ~ x1 + x2 + x3 + x4 + x5) Beta.hat <- cbind(coef(bball.mod.y1), coef(bball.mod.y2), coef(bball.mod.y3) ) Beta.hat #### (b) # Let's get a point estimate for the ( Runs Per Game, Runs Allowed Per Game, Team WinningPct) # for a team with BatAge=29, PAge=26, SB=120, BB.y=550, SO.y=1400) # The predictor values of interest (including a "1" for the intercept term): x0 <- c(1, 29, 26, 120, 550, 1400) pt.est <- t(as.matrix(x0,nc=1)) %*% Beta.hat pt.est ### Extra credit part # Getting the matrix of fitted values: X.mat <- cbind(rep(1,times=nrow(baseball2015)),x1,x2,x3,x4,x5) Y.hat <- X.mat %*% Beta.hat Y.hat # Getting the matrix of residuals: resid.mat <- cbind (y1,y2,y3) - Y.hat resid.mat #### Testing about x1,x2,x3 in the model: ## Full model: my.n <- length(y1) # number of individuals my.p <- ncol(X.mat) - 1 # total number of predictors my.r <- ncol(resid.mat) # total number of responses E.mat.full <- (my.n-1)*var(resid.mat) ## Reduced model (without x1,x2,x3) Beta.hat.redu <- cbind(coef(lm(y1 ~ x4+x5)), coef(lm(y2 ~ x4+x5)), coef(lm(y3 ~ x4+x5)) ) X.mat.redu <- cbind(rep(1,times=nrow(baseball2015)),x4,x5) my.p.redu <- ncol(X.mat.redu) - 1 # number of predictors in the reduced model Y.hat.redu <- X.mat.redu %*% Beta.hat.redu resid.mat.redu <- cbind (y1,y2,y3) - Y.hat.redu E.mat.redu <- (my.n-1)*var(resid.mat.redu) my.test.stat <- -(my.n - my.p - 1 - 0.5*(my.r - my.p + my.p.redu + 1)) * log( det(E.mat.full)/det(E.mat.redu) ) my.test.stat p.value <- pchisq(my.test.stat, df = my.r*(my.p - my.p.redu), lower.tail=F ) p.value ### Checking model assumptions: qqnorm(resid.mat[,1], main = "Normal Q-Q plot, Runs Scored") dev.new() plot(Y.hat[,1],resid.mat[,1], main = "Residual plot vs. fitted values, Runs Scored"); abline(h=0) dev.new() qqnorm(resid.mat[,2], main = "Normal Q-Q plot, Runs Allowed") dev.new() plot(Y.hat[,2],resid.mat[,2], main = "Residual plot vs. fitted values, Runs Allowed"); abline(h=0) dev.new() qqnorm(resid.mat[,3], main = "Normal Q-Q plot, Winning Pct") dev.new() plot(Y.hat[,3],resid.mat[,3], main = "Residual plot vs. fitted values, Winning Pct"); abline(h=0)