# STAT 516, Lecture 12: logistic regression with glm()

# ---- Simple logistic regression: success vs. experience ----

experience <- c(14, 29, 6, 25, 18, 4, 18, 12, 22, 6, 30, 11, 30, 5,
                20, 13, 9, 32, 24, 13, 19, 4, 28, 22, 8)
success <- c(0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
             0, 1, 0, 0, 1, 1, 1)

plot(success ~ experience, xlim = c(0, 50), ylim = c(0, 2))

# Do NOT use lm() for a binary response -- fit shown only to
# illustrate why ordinary least squares is inappropriate here.
lm_out <- lm(success ~ experience)
abline(lm_out)

# Use glm() with family = "binomial" to do logistic regression.
glm_out <- glm(success ~ experience, family = "binomial")
summary(glm_out)

# Overlay the fitted logistic curve
#   pi(x) = exp(b0 + b1*x) / (1 + exp(b0 + b1*x))
parms <- coef(glm_out)
plot(success ~ experience)
x <- seq(min(experience), max(experience), length = 200)
pihat_x <- exp(parms[1] + parms[2] * x) / (1 + exp(parms[1] + parms[2] * x))
lines(pihat_x ~ x)

# Fitted probabilities at the observed x values ("pi hats").
points(glm_out$fitted.values ~ experience, pch = 19)

# Wald (large-sample) confidence intervals for the coefficients...
confint.default(glm_out)
summary(glm_out)

# ...and for the odds ratio: exponentiate estimates and CI endpoints.
exp(coef(glm_out))
exp(confint.default(glm_out))

plot(glm_out, which = 1)

# ---- Multiple logistic regression: credit-g data ----

library(foreign)

# credit-g dataset from https://www.openml.org/
link <- url("https://people.stat.sc.edu/gregorkb/data/dataset_31_credit-g.arff")
credg <- read.arff(link)
colnames(credg)
credg$checking_status

glm_out <- glm(class ~ ., family = "binomial", data = credg)
summary(glm_out)

# ---- Full-vs-reduced (deviance) test for logistic regression ----

glm_full <- glm(class ~ ., family = "binomial", data = credg)
# Reduced model: remove first column, which is checking_status.
glm_red <- glm(class ~ ., family = "binomial", data = credg[, -1])

dev_full <- glm_full$deviance
dev_red <- glm_red$deviance

# Degrees of freedom = number of dummy variables dropped with the factor.
s <- nlevels(credg$checking_status) - 1
Wtest <- dev_red - dev_full

# p-value from the chi-squared reference distribution.
1 - pchisq(Wtest, df = s)

# ---- Build ROC curve ----

# (glm_full was already fit above with the identical formula and data,
# so the redundant refit has been removed.)
glm_small <- glm(class ~ checking_status, family = "binomial", data = credg)

# Choose which model's fitted probabilities to use; NOTE only the last
# assignment is live -- the first two are immediately overwritten, so
# the ROC below is for glm_small.
pihat <- glm_full$fitted.values
pihat <- glm_red$fitted.values
pihat <- glm_small$fitted.values

c_thresh <- 0.5
Y <- credg$class == "good"
Yhat <- pihat >= c_thresh

# Misclassification rate: mean() of the disagreement indicator
# (takes the sum and divides by n).
M <- mean(Yhat != Y)

# True-positive and false-positive rates over a grid of thresholds.
nc <- 101
cc <- seq(0, 1, length = nc)
TP <- numeric(nc)
FP <- numeric(nc)
for (i in seq_len(nc)) {
  Yhat <- pihat >= cc[i]
  TP[i] <- sum(Yhat == 1 & Y == 1) / sum(Y == 1)
  FP[i] <- sum(Yhat == 1 & Y == 0) / sum(Y == 0)
}

plot(TP ~ FP, type = "l")
abline(0, 1)  # reference line for a no-skill classifier
lines(TP ~ FP, col = "red")
lines(TP ~ FP, col = "blue")

table(credg$class)

# ---- Make training and testing data sets ----

set.seed(1)  # added so the random split (and rates below) are reproducible
n <- nrow(credg)
i_train <- sample(n, floor(n / 2), replace = FALSE)
credg_train <- credg[i_train, ]
credg_test <- credg[-i_train, ]

glm_train <- glm(class ~ ., family = "binomial", data = credg_train)

# Misclassification on the training set.
pihat_train <- glm_train$fitted.values
c_thresh <- 0.5
Y_train <- credg_train$class == "good"
Yhat_train <- pihat_train >= c_thresh
M_train <- mean(Yhat_train != Y_train)
M_train

# Misclassification on the testing set (predict on held-out rows).
pihat_test <- predict(glm_train, newdata = credg_test, type = "response")
c_thresh <- 0.5
Y_test <- credg_test$class == "good"
Yhat_test <- pihat_test >= c_thresh
M_test <- mean(Yhat_test != Y_test)
M_test