######################### # # # STAT 599 Spring 2013 # # # # Example R code # # # # Chapter 2 (and 6) # # # ######################### ### Installing the add-on packages needed for this course: # If you haven't already done so, # type at the command line (without the preceding # ): # install.packages(c('faraway','mgcv','acepack','mda','brlr','sm','wavethresh','SuppDists','lme4')) # and choose a U.S. mirror site when the window pops up. # This will install the needed packages for you. # load the 'faraway' package: library(faraway) # loading the 'orings' dataset: data(orings) # attaching the 'orings' dataset: attach(orings) # Now the columns of the data frame can be referred to by name. # To see the data, type the name of the data object: orings # For each observation, the measurement is the count of damaged rings (out of 6 possible). # We can plot the proportions by diving the damage counts by 6: plot(damage/6 ~ temp, data = orings, xlim=c(25,85), ylim=c(0,1), xlab="Temperature", ylab="Prob of damage") # A naive simple linear model is fit and overlain on the plot by: lmod <- lm(damage/6 ~ temp, data=orings) abline(lmod) # The result matches Figure 2.1 in the book. #### The Binomial GLM fit with the o-rings data: # The 'damage' vector contains counts of damaged o-rings. # Subtracting 'damage' from 6 will give the counts of non-damaged o-rings for each mission: nondamage <- 6 - damage # Fitting the logistic regression model and summarizing the fitted model: my.logit.mod <- glm(cbind(damage, nondamage) ~ temp, family=binomial, data=orings) # Note that the logit link is the default link for a binomial response. summary(my.logit.mod) # Plotting the fitted probability curve along with the observed sample proportions: plot(damage/6 ~ temp, data = orings, xlim=c(25,85), ylim=c(0,1), xlab="Temperature", ylab="Prob of damage") # creating a grid of x-values from 25 to 85: x <- seq(25, 85, 0.1) lines(x, ilogit(coef(my.logit.mod)[1] + coef(my.logit.mod)[2]*x) ) # Suppose we preferred the probit link: my.probit.mod <- glm(cbind(damage, nondamage) ~ temp, family=binomial(link=probit), data=orings) summary(my.probit.mod) lines(x, pnorm(coef(my.probit.mod)[1] + coef(my.probit.mod)[2]*x), lty=2 ) ### Predicting the probability of damage when temp = 31: # logit fitted model: ilogit(coef(my.logit.mod)[1] + coef(my.logit.mod)[2]*31) # probit fitted model: pnorm(coef(my.probit.mod)[1] + coef(my.probit.mod)[2]*31) #### Inference about the logistic regression, with the o-rings data: # Testing the goodness of fit of the chosen model, relative to a saturated model: pchisq(deviance(my.logit.mod), df.residual(my.logit.mod), lower=F) # A model without temperature (just with an intercept term): smaller.model <- glm(cbind(damage, nondamage) ~ 1, family=binomial, data=orings) deviance(smaller.model) # Note this is the same as the 'Null deviance' reported in the original output. # Testing whether temperature is needed in the model: pchisq(deviance(smaller.model)-deviance(my.logit.mod), df.residual(smaller.model)-df.residual(my.logit.mod), lower=F) # Profile-likelihood confidence intervals for the regression parameters (the betas): library(MASS) # this is necessary confint(my.logit.mod) #### Goodness of Fit for binary-response models: # A simple example: y <- c(0,0,1,0,0,1,1,0,0,1,1,1,1,0,1,0,1,1,1) x <- c(1.2,1.5,1.8,1.9,2.0,2.3,2.5,2.6,2.8,3.1,3.2,3.3,3.6,3.7,3.9,4.0,4.2,4.3,4.6) simple.logistic <- glm(y ~ x, family=binomial) #### # A function to do the Hosmer-Lemeshow test in R. # R Function is due to Peter D. M. Macdonald, McMaster University. # hosmerlem <- function (y, yhat, g = 10) { cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)), include.lowest = T) obs <- xtabs(cbind(1 - y, y) ~ cutyhat) expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat) chisq <- sum((obs - expect)^2/expect) P <- 1 - pchisq(chisq, g - 2) c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P) } # ###### # Doing the Hosmer-Lemeshow test # (after copying the above function into R): hosmerlem(y, fitted(simple.logistic)) # Large P-value indicates a reasonable fit. ###################### ###################### # load the 'faraway' package: library(faraway) # loading the 'babyfood' dataset: data(babyfood) # attaching the 'babyfood' dataset: attach(babyfood) # Now the columns of the data frame can be referred to by name. # To see the data, type the name of the data object: babyfood # Fitting models with and without interaction: logit.model.inter <- glm(cbind(disease, nondisease) ~ sex + food + sex:food, family=binomial, data=babyfood) logit.model <- glm(cbind(disease, nondisease) ~ sex + food, family=binomial, data=babyfood) # Testing whether the interaction is needed in the model: pchisq(deviance(logit.model)-deviance(logit.model.inter), df.residual(logit.model)-df.residual(logit.model.inter), lower=F) # Summary of the no-interaction model: summary(logit.model) # Testing the significance of each predictor individually: drop1(logit.model, test="Chi") # CIs for odds ratios: library(MASS) exp(confint(logit.model)) ################################# # 2.7 Choice of Link Functions ################################# library(faraway) # loading the Bliss insect data data(bliss) # Attaching the data frame: attach(bliss) # Examining the data: print(bliss) # Fitting the model with the logit link (the default): model.logit <- glm(cbind(dead,alive) ~ conc, family=binomial, data=bliss) # Fitting the model with the probit link: model.probit <- glm(cbind(dead,alive) ~ conc, family=binomial(link=probit), data=bliss) # Fitting the model with the complementary log-log link: model.cloglog <- glm(cbind(dead,alive) ~ conc, family=binomial(link=cloglog), data=bliss) # In three columns, we list the fitted values for the three fits: cbind(fitted(model.logit), fitted(model.probit), fitted(model.cloglog)) # Over the observed data range of concentration values (from 0 to 4) # the fitted values are not too different. # Let's try plotting the fitted curves over # the concentration range from -2 to 8: x <- seq(-2,8, by=0.2) prob.logit <- ilogit(model.logit$coef[1] + model.logit$coef[2]*x) prob.probit <- pnorm(model.probit$coef[1] + model.probit$coef[2]*x) prob.cloglog <- 1-exp(-exp((model.cloglog$coef[1] + model.cloglog$coef[2]*x))) plot(x, prob.logit, type='l', ylab='Probability', xlab='Dose') # solid curve = logit lines(x, prob.probit, lty=2) # dashed = probit lines(x, prob.cloglog, lty=5) # longdash = c-log-log # Ratio of tail probabilities: probit compared to logit: matplot(x, cbind(prob.probit/prob.logit, (1-prob.probit)/(1-prob.logit)), type='l', xlab='Dose', ylab='Ratio') # Ratio of tail probabilities: c-log-log compared to logit: matplot(x, cbind(prob.cloglog/prob.logit, (1-prob.cloglog)/(1-prob.logit)), type='l', xlab='Dose', ylab='Ratio') # Outside of [0, 4] there is a substantial discrepancy in the fitted probabilities. ######################### # 2.8 Estimation Problems ######################### score <- c(30,40,50,70,80,90) succ <- c(0,0,0,1,1,1) my.model <- glm(succ ~ score, family=binomial) summary(my.model) # Notice the warning message and # look at the crazy standard errors for the estimates! ## Hormone example (pg. 38-39): library(faraway) # loading the hormone data data(hormone) # Attaching the data frame: attach(hormone) # Plotting the data in the space of the two predictor variables, # with separate symbols for the two levels of the response, orientation # g = gay, s = straight: plot (estrogen ~ androgen, data=hormone, pch=as.character(orientation)) # Attempting to fit a binomial model: model.horm <- glm(orientation ~ estrogen + androgen, data=hormone, family=binomial) summary(model.horm) # Again we see the warning message and # the huge standard errors for the estimates. # Bias-reduction logistic regression: library(brlr) model.horm.b <- brlr(orientation ~ estrogen + androgen, data=hormone, family=binomial) summary(model.horm.b) # The estimates and standard errors appear more sensible. # Note that the line corresponding to p = 0.5 (where 'g' and 's' are equally likely) # is almost the same for each model: plot (estrogen ~ androgen, data=hormone, pch=as.character(orientation)) abline(-84.5/90.2, 100.9/90.2) # based on original fit abline(-3.65/3.586, 4.074/3.586, lty=2) # dotted, based on brlr fit ######################### # 6.4 GLM Diagnostics ######################### library(faraway) # loading the Bliss insect data data(bliss) # Attaching the data frame: attach(bliss) # Fitting the model with the logit link with the Bliss insect data: model.logit.bliss <- glm(cbind(dead,alive) ~ conc, family=binomial, data=bliss) #### Various type of residuals and outlier diagnostics: # The deviance residuals (the default): residuals(model.logit.bliss) # The Pearson residuals: residuals(model.logit.bliss, "pearson") # The raw response residuals (not so useful with GLMs): residuals(model.logit.bliss, "response") # Jacknife residuals: rstudent(model.logit.bliss) # The hat diagonals, which measure leverage: influence(model.logit.bliss)$hat ##### Diagnostics for influential observations: # The Cook distance statistics: cooks.distance(model.logit.bliss) # Assessing the change in coefficients resulting from casewise deletion: summary(model.logit.bliss) # with the full data set influence(model.logit.bliss)$coef #### Goodness of fit measures: # Pearson's Chi-square statistic: sum(residuals(model.logit.bliss, "pearson")^2) # The deviance: deviance(model.logit.bliss) # P-value from the chi-square test for goodness of fit: 1 - pchisq(deviance(model.logit.bliss), df.residual(model.logit.bliss) ) # The large P-value indicates that the fit is reasonable. # Is the concentration predictor needed? Deviance test: anova(model.logit.bliss, test="Chi") # Tiny p-value indicates the concentration variable is needed in the model. # Suppose we added a quadratic term. # Would that quadratic term be useful? model.bliss.quad <- model.logit <- glm(cbind(dead,alive) ~ conc + I(conc^2), family=binomial, data=bliss) anova(model.logit.bliss, model.bliss.quad, test="Chi") # The more complex model is not necessary. # The quadratic term is not useful, given that # the first-order term is present in the model. ###### Graphical methods for model diagnostics: # Consider the binomial regression of admission vs. department and gender: # Data from Agresti (2002, Table 6.7) yes<-c(32,21,6,3,12,34,3,4,52,5,8,6,35,30,9,11,6,15,17,4,9,21,26,25,21,7,25,31,3,9, 10,25, 25,39,2,4,3,0,29,6,16,7,23,36,4,10) no<-c(81,41,0,8,43,110,1,0,149,10,7,12,100,112,1,11,3,6,0,1,9,19,7,16,10,8,18,37,0, 6,11,53,34,49,123,41,3,2,13,3,33,17,9,14,62,54) table.6.7<-cbind(expand.grid(gender=c("female","male"), dept=c("anth","astr","chem","clas","comm","comp","engl","geog","geol","germ", "hist","lati","ling","math","phil","phys","poli","psyc","reli","roma","soci", "stat","zool")),yes,no) attach(table.6.7) print(table.6.7) # binomial regression with department and gender as predictors: model.admit2 <- glm(cbind(yes,no) ~ dept + gender, family=binomial) summary(model.admit2) # P-value from the chi-square test for goodness of fit: 1 - pchisq(deviance(model.admit2), df.residual(model.admit2) ) # Small P-value: evidence of lack of fit. # Testing the significance of each predictor individually: drop1(model.admit2, test="Chi") ### Plots to examine outliers and influential observations: # half-normal plot of the jacknife residuals: halfnorm(rstudent(model.admit2)) # half-normal plot of the hat diagonals: halfnorm(influence(model.admit2)$hat) # half-normal plot of the Cook distances: halfnorm(cooks.distance(model.admit2)) # Which observation appears to most influence the fit? plot(cooks.distance(model.admit2), ylab='Cook Distance', xlab='Observation number') # Model fit without the influential observation: model.admit2b <- glm(cbind(yes,no) ~ dept + gender, family=binomial, subset=-35) summary(model.admit2b) 1 - pchisq(deviance(model.admit2b), df.residual(model.admit2b) ) # The fit (as measured by the deviance) appears *slightly* better. #### Plots to check model assumptions: # plotting deviance residuals against the linear predictor values: plot(residuals(model.admit2) ~ predict(model.admit2, type='link'), xlab=expression(hat(eta)), ylab='Deviance residuals') # There is no pattern that indicates any problem with the variance function. # Checking plots of the linearized response: props <- yes/(yes+no) mu <- predict(model.admit2, type='response') z <- predict(model.admit2) + (props-mu)/mu # We won't bother plotting the linearized response # against each of the predictors in this example # because the predictors are categorical here. ## Partial residual plots could be obtained by #plot(residuals(model.admit2,type='partial')[,'dept'] ~ dept) #plot(residuals(model.admit2,type='partial')[,'gender'] ~ gender) ## but they are not useful here because of the categorical predictors. plot(z ~ predict(model.admit2), xlab='Linear predictor', ylab='Linearized response') # No problem with the link function. ########################################## # 2.10 Prediction in Binomial Regression ########################################## ### Example: Bliss insect data library(faraway) # loading the Bliss insect data data(bliss) # Attaching the data frame: attach(bliss) # Fitting the model with the logit link with the Bliss insect data: model.logit.bliss <- glm(cbind(dead,alive) ~ conc, family=binomial, data=bliss) # A prediction of the probability of success (insect death) when conc = 2.5: pred2.5 <- predict(model.logit.bliss, newdata=data.frame(conc=2.5), se=T) ilogit(pred2.5$fit) # A 95% CI for this probability: ilogit(c(pred2.5$fit - 1.96*pred2.5$se.fit, pred2.5$fit + 1.96*pred2.5$se.fit)) ### If there are several predictors, just put in the code above something like: ### newdata=data.frame(conc=2.5,x2=4.6,x3=5.8) ### or whatever is the desired set of predictor values... ### Effective Dose calculation: # What is the concentration that yields a probability of 0.5 of an insect dying? library(MASS) dose.p(model.logit.bliss, p=0.5) # A 95% CI for the ED50: c(2 - 1.96*0.1784367, 2 + 1.96*0.1784367) # What is the concentration that yields a probability of 0.8 of an insect dying? dose.p(model.logit.bliss, p=0.8) ########################################## # 2.11 Overdispersion ########################################## ### Example: trout egg data library(faraway) # loading the trout egg data data(troutegg) # Attaching the data frame: attach(troutegg) # Data table is shown on pages 45-46 of the book. # Fitting a regular binomial GLM: died <- total - survive mod.trout <- glm(cbind(survive, died) ~ location + period, family=binomial, data=troutegg) summary(mod.trout) # Deviance shows a poor fit. # Checking for outliers: halfnorm(residuals(mod.trout)) # Page 47 shows a model to check the structure of the model # via interaction plots of the empirical logits: elogits <- log((survive+0.5)/(died+0.5)) with(troutegg, interaction.plot(period, location, elogits)) # The plots seem roughly parallel, so no sign of a structural problem. # Introduce an overdispersion parameter. # Estimate it by: sigma2hat <- sum(residuals(mod.trout,type='pearson')^2)/12 print(sigma2hat) # The estimated dispersion is quite a bit bigger than 1. # Note 12 = the number of residual degrees of freedom in the model above. summary(mod.trout, dispersion=sigma2hat) # Note the estimated coefficients do not change, but the standard errors do. # Testing about the individual predictors # in the model accounting for overdisperion: drop1(mod.trout, scale=sigma2hat, test="F") # Note the F-test is used rather than the Chi-square test. ########################################## # 2.12 Matched case-control Studies ########################################## ### Example: x-ray data library(faraway) # loading the x-ray data data(amlxray) # Attaching the data frame: attach(amlxray) # Removing cases with Down syndrome and the corresponding controls # (see top of page 50) # This is the correct way to remove these data rows # since set with ID=7018 contains 2 controls, not just one: ID.down <- ID[which(downs=='yes')] ramlxray <- amlxray[ID!=ID.down,] # This is the way the book removes them (not quite correctly). # and this will produce output matching the book's: downs.indices <- which(downs=='yes') ramlxray <- amlxray[-c(downs.indices, downs.indices+1),] # loading the 'survival' package: library(survival) # Fitting the conditional logit model for # the matched case-control study: cmod <- clogit(disease ~ Sex + Mray + Fray + CnRay + strata(ID), data = ramlxray) summary(cmod) # Note the ID variable defines the matched sets # which is specified by the strata() function. # Clearly Sex and Mray are not needed # and the quadratic and cubic effects associated with # CnRay are not significant. Let's drop them: cmodr <- clogit(disease ~ Fray + unclass(CnRay) + strata(ID), data=ramlxray) summary(cmodr) # We used the 'unclass' function to convert CnRay to its numerical coding 1-4. # The Fray variable is nonsignificant but CnRay is significant. # The exp(coef) column is important: # The estimated odds of disease increase by a factor of 2.23 as we # move from one category of CnRay to the next higher category (holding Fray fixed).