######################### # # # STAT 599 Spring 2013 # # # # Example R code # # # # Chapter 5 # # # ######################### ### Installing the add-on packages needed for this course: # If you haven't already done so, # type at the command line (without the preceding # ): # install.packages(c('faraway','mgcv','acepack','mda','brlr','sm','wavethresh','SuppDists','lme4')) # and choose a U.S. mirror site when the window pops up. # This will install the needed packages for you. # load the 'faraway' package: library(faraway) # loading the 'nes96' dataset: data(nes96) # attaching the 'nes96' dataset: attach(nes96) # defining a 3-category party variable: sPID <- nes96$PID levels(sPID) <- c("Democrat","Democrat", "Independent", "Independent", "Independent", "Republican", "Republican") # creating a numeric income variable (see p. 98 for details) inca <- c(1.5,4,6,8,9.5,10.5,11.5,12.5,13.5,14.5,16,18.5,21,23.5,27.5,32.5,37.5,42.5,47.5,55,67.5,82.5,97.5,115) nincome <- inca[unclass(nes96$income)] ###################################### ## # Section 5.1 Multinomial Logit Model ## ###################################### # loading the 'nnet' package: library(nnet) # Fitting a multinomial logit model: multin.nes96 <- multinom(sPID ~ age + educ + nincome, data=nes96) # A stepwise procedure to help decide the set of predictors: step(multin.nes96) # Model with only age and nincome: multin.nes96.2 <- multinom(sPID ~ age + nincome, data=nes96) # Deviance test indicates we should drop 'educ' from the model: pchisq(deviance(multin.nes96.2) - deviance(multin.nes96), df=multin.nes96$edf-multin.nes96.2$edf, lower=F) # Model with only nincome: multin.nes96.3 <- multinom(sPID ~ nincome, data=nes96) # Deviance test indicates we should drop 'age' from the model: pchisq(deviance(multin.nes96.3) - deviance(multin.nes96.2), df=multin.nes96.2$edf-multin.nes96.3$edf, lower=F) ## Predicting party affiliation corresponding to a set of income values: inc.vals <- c(8,26,42,58,74,90,107) predict(multin.nes96.3, data.frame(nincome=inc.vals), type='probs') # Just giving the most probable category: predict(multin.nes96.3, data.frame(nincome=inc.vals)) # Examining estimated regression coefficients: summary(multin.nes96.3) ###### ### Section 5.2: Hierarchical (Nested) Models: # loading and attaching the 'cns' data set: data(cns) attach(cns) # Combining the 'Type of CNS' categories into one overall CNS category: CNS <- An + Sp + Other # A plot of empirical logits against water hardness, separate by type of worker: plot(log(CNS/NoCNS) ~ Water, data=cns, pch=as.character(Work)) # Fitting the initial binomial model: # Choosing whether to include 'Water' and 'Work' as predictors (simpler model) # or 'Area ' and 'Work' as predictors (more complicated model): bin.mod.w <- glm(cbind(CNS, NoCNS) ~ Water + Work, data=cns, family="binomial") bin.mod.a <- glm(cbind(CNS, NoCNS) ~ Area + Work, data=cns, family="binomial") anova(bin.mod.w, bin.mod.a, test="Chi") # Comparing deviances indicates the simpler model is reasonable. # Checking residuals: halfnorm(residuals(bin.mod.w)) # Getting the estimated coefficients: summary(bin.mod.w) #### Second-stage multinomial model for CNS type, conditional on the fact that CNS has occurred: library(nnet) cond.multin.model <- multinom(cbind(An, Sp, Other) ~ Water + Work, data=cns) summary(cond.multin.model) # A stepwise procedure for variable selection: step(cond.multin.model) # Or simply using AICs to compare models: cond.multin.model.simp1 <- multinom(cbind(An, Sp, Other) ~ Water, data=cns) cond.multin.model.simp2 <- multinom(cbind(An, Sp, Other) ~ Work, data=cns) cond.multin.model.null <- multinom(cbind(An, Sp, Other) ~ 1, data=cns) AIC(cond.multin.model, cond.multin.model.simp1, cond.multin.model.simp2, cond.multin.model.null) # The AIC is best for the null model ... use your best judgment. # Deviance test to compare the null model to the model with both predictors: pchisq(deviance(cond.multin.model.null)-deviance(cond.multin.model), df=cond.multin.model$edf-cond.multin.model.null$edf, lower=F) # The simpler (null) model seems fine here. ####################################### # #### Section 5.3: Models for Ordinal Multinomial Responses # load the 'faraway' package: library(faraway) # loading the 'nes96' dataset: data(nes96) # attaching the 'nes96' dataset: attach(nes96) # defining a 3-category party variable: sPID <- nes96$PID levels(sPID) <- c("Democrat","Democrat", "Independent", "Independent", "Independent", "Republican", "Republican") # creating a numeric income variable (see p. 98 for details) inca <- c(1.5,4,6,8,9.5,10.5,11.5,12.5,13.5,14.5,16,18.5,21,23.5,27.5,32.5,37.5,42.5,47.5,55,67.5,82.5,97.5,115) nincome <- inca[unclass(nes96$income)] # loading the 'MASS' package: library(MASS) # Proportional Odds Model with all three predictors: pomod <- polr(sPID ~ age + educ + nincome, data=nes96) # Proportional Odds Model with only income as predictor: pomod.i <- polr(sPID ~ nincome, data=nes96) # Deviance test to compare these models: pchisq(deviance(pomod.i) -deviance(pomod), df=pomod$edf-pomod.i$edf, lower=F) # The AICs also support the simpler model: AIC(pomod.i, pomod) # Checking the proportional odds assumption: pim <- prop.table(table(nincome,sPID),1) print(pim) # The log of the odds proportions are simply differences in logits (log-odds) for categories 1 and 2: plot(logit(pim[,1]) - logit(pim[,1] + pim[,2]), type='b',ylab="logit differences", xlab="Income levels") # This plot should be roughly constant (no trend). # Looking at the estimated coefficients: summary(pomod.i) # Getting the estimates of the theta_j's: ilogit(pomod.i[[2]][1]) # Gets estimate of theta_1 (Democrat intercept) ilogit(pomod.i[[2]][2]) - ilogit(pomod.i[[2]][1]) # Gets estimate of theta_2 (Independent intercept) ## Predicting party affiliation corresponding to a set of income values: inc.vals <- c(8,26,42,58,74,90,107) predict(pomod.i, data.frame(nincome=inc.vals, row.names=inc.vals), type='probs') ### Ordered Probit Model with only income as predictor: opmod.i <- polr(sPID ~ nincome, data=nes96, method="probit") summary(opmod.i) predict(opmod.i, data.frame(nincome=inc.vals, row.names=inc.vals), type='probs') # A proportional hazards model would be fit using the 'cloglog' method: phmod.i <- polr(sPID ~ nincome, data=nes96, method="cloglog") summary(phmod.i)