# R example of logistic regression
# The data set is the TIF data from Table 11.8 and from class

# Entering the data and defining the variables:

##########
##
# Reading the data into R:

my.datafile <- tempfile()
cat(file=my.datafile, "
0  9.2
0  12.9
0  9.2
1  9.6
0  9.3
1  10.1
0  9.4
1  10.3
0  9.5
1  10.9
0  9.5
1  10.9
0  9.5
1  11.1
0  9.6
1  11.1
0  9.7
1  11.1
0  9.7
1  11.5
0  9.8
1  11.8
0  9.8
1  11.9
0  9.9
1  12.1
0  10.5
1  12.2
0  10.5
1  12.5
0  10.9
1  12.6
0  11
1  12.6
0  11.2
1  12.6
0  11.2
1  12.9
0  11.5
1  12.9
0  11.7
1  12.9
0  11.8
1  12.9
0  12.1
1  13.1
0  12.3
1  13.2
0  12.5
1  13.5
", sep=" ")

options(scipen=999)  # suppressing scientific notation

citydata <- read.table(my.datafile, header=FALSE, col.names=c("Y", "income"))

# Note we could also save the data columns into a file and use a command such as:
# citydata <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("Y", "income"))

attach(citydata)

# The data frame called citydata is now created and attached,
# with two variables, "Y" and "income".
##
#########

####################################################################

# A simple plot of the data:

plot(income, Y)

# Fitting the logistic regression model:

city.reg <- glm(Y ~ income, family=binomial)
# Note that the logit link is the default for the binomial family.

# Getting the summary regression output:

summary(city.reg)

# The fitted values:
# fitted(city.reg)

# The sample X values along with the corresponding fitted values:
# cbind(income, fitted(city.reg))

# A plot of the data with the estimated logistic curve on top:

library(boot)
xrange <- seq(from=min(income), to=max(income), length.out=100)
lines(xrange, inv.logit(city.reg$coef[1] + city.reg$coef[2]*xrange))

############### INFERENCE IN LOGISTIC REGRESSION ######################

# Getting the LR test statistic and P-value in R (simple logistic regression);
# a built-in shortcut for this test is sketched at the end of this section:

anova(city.reg)
LR.test.stat <- anova(city.reg)[2,2]; LR.test.stat
LR.test.df <- anova(city.reg)[2,1]
LR.test.Pvalue <- 1 - pchisq(LR.test.stat, df=LR.test.df); LR.test.Pvalue

# Output: R provides a likelihood-ratio test of H0: beta_1 = 0.  Since the P-value
# is very small (< .0001), we reject H0, conclude beta_1 is not zero, and conclude
# that income has a significant effect on the probability that a city uses TIF.

est.odds.ratio <- exp(summary(city.reg)$coef["income","Estimate"])
print(est.odds.ratio)

# Getting an approximate 95% CI for the odds ratio
# associated with income (an indirect way):

conf.level <- 0.95
alpha <- 1 - conf.level
b1 <- summary(city.reg)$coef["income","Estimate"]
s.b1 <- summary(city.reg)$coef["income","Std. Error"]
lower <- exp(b1 - qnorm(1-alpha/2)*s.b1)
upper <- exp(b1 + qnorm(1-alpha/2)*s.b1)
print(paste(100*(1-alpha), "percent CI for odds ratio:", round(lower,4), round(upper,4)))

# The estimates beta_0-hat and beta_1-hat are -11.347 and 1.002.
# The estimated odds ratio is 2.723, and the 95% CI for the odds ratio is (1.526, 4.858).
# To get, say, a 99% CI, just change the specified conf.level to 0.99.
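###### Built-in shortcuts (a sketch using standard stats-package functions): ######

# anova() with test="Chisq" appends the chi-square P-value to the deviance table,
# giving the LR test directly and checking the manual pchisq() computation above:
anova(city.reg, test="Chisq")

# confint.default() gives Wald-type CIs for the coefficients, so exponentiating
# its "income" row should reproduce the indirect odds-ratio CI computed above:
exp(confint.default(city.reg, level=0.95))["income", ]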
#################################################################################

############ Goodness of Fit: ##############

####
# A function to do the Hosmer-Lemeshow test in R.
# R function is due to Peter D. M. Macdonald, McMaster University.

hosmerlem <- function (y, yhat, g = 10) {
  cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)),
                 include.lowest = TRUE)
  obs <- xtabs(cbind(1 - y, y) ~ cutyhat)
  expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
  chisq <- sum((obs - expect)^2 / expect)
  P <- 1 - pchisq(chisq, g - 2)
  c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
######

# Doing the Hosmer-Lemeshow test
# (after copying the above function into R):

hosmerlem(Y, fitted(city.reg))

# The P-value will not match SAS's P-value perfectly, but it should be close.
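# A packaged cross-check (this assumes the ResourceSelection package is
# installed; its hoslem.test() computes the same Hosmer-Lemeshow statistic):
# library(ResourceSelection)
# hoslem.test(Y, fitted(city.reg), g=10)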