# R example of multiple logistic regression # The data set is the disease outbreak data, # table 14.3, p. 574 of the book # We will use the binary response variable, disease status # We will include two predictors, age and city sector. # We will read in the other columns of the data set but # will ignore them in the analysis. # Save the data file into a directory and # use the full path name: disout.data <- read.table(file = "z:/My Documents/teaching/stat_704/diseaseoutbreakdata.txt", header=FALSE, col.names = c('obsnum', 'age', 'col3', 'col4', 'sector', 'disease')) # attaching the data frame: attach(disout.data) # fitting the regression model: disease.logit <- glm(disease ~ age + sector, family=binomial) # Note that the logit link is the default for the binomial family # getting the summary regression output: summary(disease.logit) ################### INFERENCE IN LOGISTIC REGRESSION ##################### # Inference for the whole set of betas: Note the Likelihood Ratio Test is # highly significant (P-value < .0001). See following code: # Getting the LR test statistic and P-value in R # (multiple logistic regression example): anova(disease.logit) LR.test.stat <- sum(anova(disease.logit)[2:3,2]); LR.test.stat LR.test.df <- sum(anova(disease.logit)[2:3,1]) LR.test.Pvalue <- 1 - pchisq(LR.test.stat, df=LR.test.df); LR.test.Pvalue # To test the significance of individual predictors, we see the Wald test # is significant at the .05 level for both predictors: (P-value = .0262 # for age, P-value = .0006 for sector). # This is given in R's summary output (find it!). # getting an approximate 95% CI for the odds ratio # associated with each predictor (an indirect way): alpha <- 0.05 b1 <- summary(disease.logit)\$coef[2,1] s.b1 <- summary(disease.logit)\$coef[2,2] lower.OR1 <- exp(b1 - qnorm(1-alpha/2)*s.b1) upper.OR1 <- exp(b1 + qnorm(1-alpha/2)*s.b1) print(paste(100*(1-alpha), "percent CI for odds ratio 1:", lower.OR1, upper.OR1)) alpha <- 0.05 b2 <- summary(disease.logit)\$coef[3,1] s.b2 <- summary(disease.logit)\$coef[3,2] lower.OR2 <- exp(b2 - qnorm(1-alpha/2)*s.b2) upper.OR2 <- exp(b2 + qnorm(1-alpha/2)*s.b2) print(paste(100*(1-alpha), "percent CI for odds ratio 2:", lower.OR2, upper.OR2)) # To get, say, 99% CIs, just change the specified alpha to 0.01. ######################################################################### # Model Selection Criteria: # Note for this model, AIC = 108.259 and BIC = 116.014. # Find AIC in the R summary output and compare to entries in Table 14.6 of the book. # BIC can be calculated via: BIC.value <- AIC(disease.logit, k=log(nrow(disout.data))); BIC.value ############ Goodness of Fit: ############## #### # A function to do the Hosmer-Lemeshow test in R. # R Function is due to Peter D. M. Macdonald, McMaster University. # hosmerlem <- function (y, yhat, g = 10) { cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)), include.lowest = T) obs <- xtabs(cbind(1 - y, y) ~ cutyhat) expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat) chisq <- sum((obs - expect)^2/expect) P <- 1 - pchisq(chisq, g - 2) c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P) } # ###### # Doing the Hosmer-Lemeshow test # (after copying the above function into R): hosmerlem(disease, fitted(disease.logit)) # The P-value will not match SAS's P-value perfectly but may be close. (not in this case!)