# R example of simple logistic regression
# The data set is the programming task data,
# table 14.1, p. 566 of the book.
# We will use the binary response variable, "success".
# We will use the predictor "experience".

################################################

# Save the data file into a directory and
# use the full path name:

progtask.data <- read.table(file = "z:/My Documents/teaching/stat_704/programmingtaskdata.txt",
                            header=FALSE, col.names = c('exper', 'success'))

# attaching the data frame:

attach(progtask.data)

# A simple plot of the data:

plot(exper, success)

# fitting the logistic regression model:

progtask.logit <- glm(success ~ exper, family=binomial)

# Note that the logit link is the default for the binomial family.

# getting the summary regression output:

summary(progtask.logit)

# the fitted values:

fitted(progtask.logit)

# The sample X values along with the corresponding fitted values:

cbind(exper, fitted(progtask.logit))

# A plot of the data with the estimated logistic curve on top;
# inv.logit() from the 'boot' package is the inverse logit function:

library(boot)
xrange <- seq(from=min(exper), to=max(exper), length=100)
lines(xrange, inv.logit(progtask.logit$coef[1] + progtask.logit$coef[2]*xrange))

############### INFERENCE IN LOGISTIC REGRESSION ######################

# To test the significance of the predictor, we may use the Wald test, which
# is significant at the .05 level for "experience" (P-value = .0129).
# This is given in R's summary output (find it!); a snippet that pulls it
# out of the summary table directly appears at the end of this script.

# With a single predictor, the LR test tests the same hypothesis, but
# its P-value (.0029) differs somewhat from the Wald P-value.

# Getting the LR test statistic and P-value in R (simple logistic regression);
# in the analysis-of-deviance table, row 2 is the "exper" line, column 1 is
# its df, and column 2 is the drop in deviance (the LR test statistic):

anova(progtask.logit)
LR.test.stat <- anova(progtask.logit)[2,2]; LR.test.stat
LR.test.df <- anova(progtask.logit)[2,1]
LR.test.Pvalue <- 1 - pchisq(LR.test.stat, df=LR.test.df); LR.test.Pvalue

# getting an approximate 95% CI for the odds ratio associated with
# a one-unit increase in "experience" (an indirect way):

alpha <- 0.05
b1 <- summary(progtask.logit)$coef[2,1]
s.b1 <- summary(progtask.logit)$coef[2,2]
lower <- exp(b1 - qnorm(1-alpha/2)*s.b1)
upper <- exp(b1 + qnorm(1-alpha/2)*s.b1)
print(paste(100*(1-alpha), "percent CI for odds ratio:", lower, upper))

# Here the CI is: (1.035, 1.335)
# To get, say, a 99% CI, just change the specified alpha to 0.01.

#################################################################################

############ Goodness of Fit: ##############

####
# A function to do the Hosmer-Lemeshow test in R.
# R function is due to Peter D. M. Macdonald, McMaster University.
#
hosmerlem <- function (y, yhat, g = 10) {
   cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)),
                  include.lowest = TRUE)
   obs <- xtabs(cbind(1 - y, y) ~ cutyhat)
   expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
   chisq <- sum((obs - expect)^2/expect)
   P <- 1 - pchisq(chisq, g - 2)
   c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
#
######

# Doing the Hosmer-Lemeshow test
# (after copying the above function into R):

hosmerlem(success, fitted(progtask.logit))

# The P-value will not match SAS's P-value perfectly but should be close.

############ Diagnostics: ##############

# Getting the Pearson residuals:

residuals(progtask.logit, type="pearson")

# Note that observations 2 and 25 have somewhat large absolute Pearson residuals.
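
# A quick way to list the observations with large absolute Pearson
# residuals (the cutoff of 2 used here is only a rough rule of thumb):

pearson.res <- residuals(progtask.logit, type="pearson")
which(abs(pearson.res) > 2)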
# Getting some influence measures:

influence.measures(progtask.logit)

######### CI for pi_h: ##############

# Setting up a new observation with "experience" = 10 months
# (no "success" value is needed for prediction):

xh.value <- data.frame(exper = 10)

# Estimates for pi_h:

pi.h.hat <- predict(progtask.logit, xh.value, se.fit=TRUE, type="response")

# This gives the point estimate for pi_h:

pi.h.hat$fit

# The 90% CI for pi_h is obtained by building a Wald interval on the
# link (logit) scale and transforming its endpoints with the inverse
# logit, plogis(); this keeps the interval inside (0, 1):

alpha <- 0.10
lin.pred.hat <- predict(progtask.logit, xh.value, se.fit=TRUE, type="link")
lower90 <- plogis(lin.pred.hat$fit - qnorm(1-alpha/2)*lin.pred.hat$se.fit)
upper90 <- plogis(lin.pred.hat$fit + qnorm(1-alpha/2)*lin.pred.hat$se.fit)
print(paste(100*(1-alpha), "percent CI for pi_h:", lower90, upper90))
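
# The same link-scale construction, wrapped as a small helper so it can
# be reused for other X_h values and confidence levels. This is just a
# sketch: the name ci.pi.h and its arguments are illustrative, not from
# the book.

ci.pi.h <- function(fit, xh, alpha = 0.05) {
   lp <- predict(fit, xh, se.fit=TRUE, type="link")
   z <- qnorm(1 - alpha/2)
   c(lower = plogis(lp$fit - z*lp$se.fit),
     upper = plogis(lp$fit + z*lp$se.fit))
}

# For example, a 95% CI for pi_h at experience = 10 months:
ci.pi.h(progtask.logit, data.frame(exper = 10))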
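
# (Referenced in the inference section above:) extracting the Wald test
# z statistic and P-value for "exper" directly from the summary table.
# "z value" and "Pr(>|z|)" are the usual column names produced by
# summary.glm(); check colnames(summary(progtask.logit)$coef) if in doubt.

summary(progtask.logit)$coefficients["exper", c("z value", "Pr(>|z|)")]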