## Problem 1:

testdata <- read.table("http://www.stat.sc.edu/~hitchcock/testscoredata.txt", header=T)
attach(testdata)
testdata.noIDs <- testdata[,-1]  #to remove the ID numbers

summary(manova(cbind(math, reading ) ~   sex) ,test="Hotelling")

tapply( math,sex,mean)
#     boy     girl 
#84.09833 82.96250 

 tapply( reading,sex,mean)
#     boy     girl 
#81.78667 82.28000 


# Checking assumptions:
x1 <- testdata.noIDs[sex=='boy',-3]
x2 <- testdata.noIDs[sex=='girl',-3]

 chisplot(as.matrix(x1))
chisplot(as.matrix(x2))
 library(asbio)  # load the 'asbio' package once it is installed
Kullback(testdata.noIDs[,-3],sex)

 library(biotools)  # load the 'biotools' package once it is installed

boxM(testdata.noIDs[,-3],sex)


### PROBLEM 2:

hsb <- read.table("http://www.stat.sc.edu/~hitchcock/hsbdata.txt", header=T)
attach(hsb)

hsb.prob4 <- hsb[,c(5,8,9,10,11,12)]

hsb.numeric <- hsb[,c(8,9,10,11,12)]

my.manova <- manova(cbind(read, write, math, science, socst) ~ ses)

# Doing the MANOVA test using Wilks' Lambda:

summary(my.manova, test="Wilks")

###############
###
chisplot <- function(x) {
    if (!is.matrix(x)) stop("x is not a matrix")
    ### determine dimensions
    n <- nrow(x)
    p <- ncol(x)
    xbar <- apply(x, 2, mean)
    S <- var(x)
    S <- solve(S)
    index <- (1:n)/(n+1)
    xcent <- t(t(x) - xbar)
    di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S)
    quant <- qchisq(index,p)
    plot(quant, sort(di), ylab = "Ordered distances",
         xlab = "Chi-square quantile", lwd=2,pch=1)
}
###
###############

chisplot(residuals(my.manova))


# Examining the sample covariance matrices for each group:

by(cbind(read, write, math, science, socst), ses, var)

# Optionally, doing the formal tests:

library(asbio)

Kullback(Y=cbind(read, write, math, science, socst),X=ses)

library(biotools)  
boxM(cbind(read, write, math, science, socst),ses)

##############################################

# Examining the sample mean vectors for each group:

by(cbind(read, write, math, science, socst), ses, colMeans)



### PROBLEM 3:

# Fitting the multivariate linear regression model:

bat2015 <- read.csv("http://www.stat.sc.edu/~hitchcock/baseball2015batting.txt", header=T)
pitch2015 <- read.csv("http://www.stat.sc.edu/~hitchcock/baseball2015pitching.txt", header=T)
baseball2015 <- merge(bat2015,pitch2015,by="Tm")

#Response Variables:
y1 <- baseball2015$RG  # Runs Per Game
y2 <- baseball2015$RAG  #Runs Allowed Per Game
y3 <- baseball2015$WLP  # Team WinningPct

#Predictor Variables:
x1<-baseball2015$BatAge  # Average age of team's batters
x2<-baseball2015$PAge   # Average age of team's batters
x3<-baseball2015$SB  # Total Stolen bases by team
x4<-baseball2015$BB.y  # Total bases on balls (walks) given up by team's pitchers
x5<-baseball2015$SO.y  # Total strikeouts earned by team's pitchers

### (a)

# A quick summary:
bball.mod <- lm(cbind(y1,y2,y3) ~ x1 + x2 + x3 + x4 + x5)
summary(bball.mod)


bball.mod.y1 <- lm(y1 ~ x1 + x2 + x3 + x4 + x5)
bball.mod.y2 <- lm(y2 ~ x1 + x2 + x3 + x4 + x5)
bball.mod.y3 <- lm(y3 ~ x1 + x2 + x3 + x4 + x5)
Beta.hat <- cbind(coef(bball.mod.y1), coef(bball.mod.y2), coef(bball.mod.y3) )
Beta.hat

#### (b)


# Let's get a point estimate for the ( Runs Per Game, Runs Allowed Per Game, Team WinningPct) 
# for a team with BatAge=29, PAge=26, SB=120, BB.y=550, SO.y=1400)

# The predictor values of interest (including a "1" for the intercept term):

x0 <- c(1, 29, 26, 120, 550, 1400)
pt.est <- t(as.matrix(x0,nc=1)) %*% Beta.hat
pt.est

### Extra credit part


# Getting the matrix of fitted values:

X.mat <- cbind(rep(1,times=nrow(baseball2015)),x1,x2,x3,x4,x5) 
Y.hat <- X.mat %*% Beta.hat
Y.hat

# Getting the matrix of residuals:
resid.mat <- cbind (y1,y2,y3) - Y.hat
resid.mat


#### Testing about x1,x2,x3 in the model:

## Full model:

my.n <- length(y1)  # number of individuals
my.p <- ncol(X.mat) - 1  # total number of predictors
my.r <- ncol(resid.mat)  # total number of responses
 
E.mat.full <- (my.n-1)*var(resid.mat)

## Reduced model (without x1,x2,x3)

Beta.hat.redu <- cbind(coef(lm(y1 ~ x4+x5)), coef(lm(y2 ~ x4+x5)), coef(lm(y3 ~ x4+x5)) )

X.mat.redu <- cbind(rep(1,times=nrow(baseball2015)),x4,x5) 
my.p.redu <- ncol(X.mat.redu) - 1  # number of predictors in the reduced model
Y.hat.redu <- X.mat.redu %*% Beta.hat.redu

resid.mat.redu <- cbind (y1,y2,y3) - Y.hat.redu

E.mat.redu <- (my.n-1)*var(resid.mat.redu)

my.test.stat <-  -(my.n - my.p - 1 - 0.5*(my.r - my.p + my.p.redu + 1)) * log( det(E.mat.full)/det(E.mat.redu) )
my.test.stat
p.value <- pchisq(my.test.stat, df = my.r*(my.p - my.p.redu), lower.tail=F )
p.value


### Checking model assumptions:

qqnorm(resid.mat[,1], main = "Normal Q-Q plot, Runs Scored")
dev.new()
plot(Y.hat[,1],resid.mat[,1], main = "Residual plot vs. fitted values, Runs Scored"); abline(h=0)
dev.new()
qqnorm(resid.mat[,2], main = "Normal Q-Q plot, Runs Allowed")
dev.new()
plot(Y.hat[,2],resid.mat[,2], main = "Residual plot vs. fitted values, Runs Allowed"); abline(h=0)
dev.new()
qqnorm(resid.mat[,3], main = "Normal Q-Q plot, Winning Pct")
dev.new()
plot(Y.hat[,3],resid.mat[,3], main = "Residual plot vs. fitted values, Winning Pct"); abline(h=0)