# R example of logistic regression
# The data set is the TIF data from Table 11.8 and from class

# Entering the data and defining the variables:

##########
##
# Reading the data into R:

my.datafile <- tempfile()
cat(file=my.datafile, "
0  9.2
0  12.9
0  9.2
1  9.6
0  9.3
1  10.1
0  9.4
1  10.3
0  9.5
1  10.9
0  9.5
1  10.9
0  9.5
1  11.1
0  9.6
1  11.1
0  9.7
1  11.1
0  9.7
1  11.5
0  9.8
1  11.8
0  9.8
1  11.9
0  9.9
1  12.1
0  10.5
1  12.2
0  10.5
1  12.5
0  10.9
1  12.6
0  11
1  12.6
0  11.2
1  12.6
0  11.2
1  12.9
0  11.5
1  12.9
0  11.7
1  12.9
0  11.8
1  12.9
0  12.1
1  13.1
0  12.3
1  13.2
0  12.5
1  13.5
", sep=" ")

options(scipen=999)  # suppressing scientific notation

citydata <- read.table(my.datafile, header=FALSE, col.names=c("Y", "income"))

# Note we could also save the data columns into a file and use a command such as:
# citydata <- read.table(file = "z:/stat_516/filename.txt", header=FALSE, col.names = c("Y", "income"))

attach(citydata)

# The data frame called citydata is now created and attached,
# with two variables, "Y" and "income".
##
#########

####################################################################

# A simple plot of the data:

plot(income, Y)

# Fitting the logistic regression model:

city.reg <- glm(Y ~ income, family=binomial)
# Note that the logit link is the default for the binomial family.

# Getting the summary regression output:

summary(city.reg)

# The fitted values:
# fitted(city.reg)

# The sample X values along with the corresponding fitted values:
# cbind(income, fitted(city.reg))

# A plot of the data with the estimated logistic curve on top:

library(boot)
xrange <- seq(from=min(income), to=max(income), length.out=100)
lines(xrange, inv.logit(city.reg$coef[1] + city.reg$coef[2]*xrange))

############### INFERENCE IN LOGISTIC REGRESSION ######################

# Getting the LR test statistic and P-value in R (simple logistic regression);
# a built-in shortcut for this test is sketched at the end of this section:

anova(city.reg)
LR.test.stat <- anova(city.reg)[2,2]; LR.test.stat
LR.test.df <- anova(city.reg)[2,1]
LR.test.Pvalue <- 1 - pchisq(LR.test.stat, df=LR.test.df); LR.test.Pvalue

# Output: R provides a likelihood-ratio test of H0: beta_1 = 0.  Since the P-value
# is very small (< .0001), we reject H0, conclude beta_1 is not zero, and conclude
# that income has a significant effect on the probability that a city uses TIF.

est.odds.ratio <- exp(summary(city.reg)$coef["income","Estimate"])
print(est.odds.ratio)

# Getting an approximate 95% CI for the odds ratio
# associated with income (an indirect way):

conf.level <- 0.95
alpha <- 1 - conf.level
b1 <- summary(city.reg)$coef["income","Estimate"]
s.b1 <- summary(city.reg)$coef["income","Std. Error"]
lower <- exp(b1 - qnorm(1-alpha/2)*s.b1)
upper <- exp(b1 + qnorm(1-alpha/2)*s.b1)
print(paste(100*(1-alpha), "percent CI for odds ratio:", round(lower,4), round(upper,4)))

# The estimates beta_0-hat and beta_1-hat are -11.347 and 1.002.
# The estimated odds ratio is 2.723, and the 95% CI for the odds ratio is (1.526, 4.858).
# To get, say, a 99% CI, just change the specified conf.level to 0.99.
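###### Built-in shortcuts (a sketch using standard stats-package functions): ######

# anova() with test="Chisq" appends the chi-square P-value to the deviance table,
# giving the LR test directly and checking the manual pchisq() computation above:
anova(city.reg, test="Chisq")

# confint.default() gives Wald-type CIs for the coefficients, so exponentiating
# its "income" row should reproduce the indirect odds-ratio CI computed above:
exp(confint.default(city.reg, level=0.95))["income", ]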
#################################################################################

############ Goodness of Fit: ##############

####
# A function to do the Hosmer-Lemeshow test in R.
# R function is due to Peter D. M. Macdonald, McMaster University.

hosmerlem <- function (y, yhat, g = 10) {
  cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)),
                 include.lowest = TRUE)
  obs <- xtabs(cbind(1 - y, y) ~ cutyhat)
  expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
  chisq <- sum((obs - expect)^2 / expect)
  P <- 1 - pchisq(chisq, g - 2)
  c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
######

# Doing the Hosmer-Lemeshow test
# (after copying the above function into R):

hosmerlem(Y, fitted(city.reg))

# The P-value will not match SAS's P-value perfectly, but it should be close.
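# A packaged cross-check (this assumes the ResourceSelection package is
# installed; its hoslem.test() computes the same Hosmer-Lemeshow statistic):
# library(ResourceSelection)
# hoslem.test(Y, fitted(city.reg), g=10)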