# STAT 516, Lecture 12: logistic regression with glm()

# ---- Simple logistic regression: success vs. experience ----

experience <- c(14, 29, 6, 25, 18, 4, 18, 12, 22, 6, 30, 11, 30, 5,
                20, 13, 9, 32, 24, 13, 19, 4, 28, 22, 8)
success <- c(0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
             0, 1, 0, 0, 1, 1, 1)

plot(success ~ experience, xlim = c(0, 50), ylim = c(0, 2))

# Do NOT use lm() for a binary response -- fit shown only to
# illustrate why ordinary least squares is inappropriate here.
lm_out <- lm(success ~ experience)
abline(lm_out)

# Use glm() with family = "binomial" to do logistic regression.
glm_out <- glm(success ~ experience, family = "binomial")
summary(glm_out)

# Overlay the fitted logistic curve
#   pi(x) = exp(b0 + b1*x) / (1 + exp(b0 + b1*x))
parms <- coef(glm_out)
plot(success ~ experience)
x <- seq(min(experience), max(experience), length = 200)
pihat_x <- exp(parms[1] + parms[2] * x) / (1 + exp(parms[1] + parms[2] * x))
lines(pihat_x ~ x)

# Fitted probabilities at the observed x values ("pi hats").
points(glm_out$fitted.values ~ experience, pch = 19)

# Wald (large-sample) confidence intervals for the coefficients...
confint.default(glm_out)
summary(glm_out)

# ...and for the odds ratio: exponentiate estimates and CI endpoints.
exp(coef(glm_out))
exp(confint.default(glm_out))

plot(glm_out, which = 1)

# ---- Multiple logistic regression: credit-g data ----

library(foreign)

# credit-g dataset from https://www.openml.org/
link <- url("https://people.stat.sc.edu/gregorkb/data/dataset_31_credit-g.arff")
credg <- read.arff(link)
colnames(credg)
credg$checking_status

glm_out <- glm(class ~ ., family = "binomial", data = credg)
summary(glm_out)

# ---- Full-vs-reduced (deviance) test for logistic regression ----

glm_full <- glm(class ~ ., family = "binomial", data = credg)
# Reduced model: remove first column, which is checking_status.
glm_red <- glm(class ~ ., family = "binomial", data = credg[, -1])

dev_full <- glm_full$deviance
dev_red <- glm_red$deviance

# Degrees of freedom = number of dummy variables dropped with the factor.
s <- nlevels(credg$checking_status) - 1
Wtest <- dev_red - dev_full

# p-value from the chi-squared reference distribution.
1 - pchisq(Wtest, df = s)

# ---- Build ROC curve ----

# (glm_full was already fit above with the identical formula and data,
# so the redundant refit has been removed.)
glm_small <- glm(class ~ checking_status, family = "binomial", data = credg)

# Choose which model's fitted probabilities to use; NOTE only the last
# assignment is live -- the first two are immediately overwritten, so
# the ROC below is for glm_small.
pihat <- glm_full$fitted.values
pihat <- glm_red$fitted.values
pihat <- glm_small$fitted.values

c_thresh <- 0.5
Y <- credg$class == "good"
Yhat <- pihat >= c_thresh

# Misclassification rate: mean() of the disagreement indicator
# (takes the sum and divides by n).
M <- mean(Yhat != Y)

# True-positive and false-positive rates over a grid of thresholds.
nc <- 101
cc <- seq(0, 1, length = nc)
TP <- numeric(nc)
FP <- numeric(nc)
for (i in seq_len(nc)) {
  Yhat <- pihat >= cc[i]
  TP[i] <- sum(Yhat == 1 & Y == 1) / sum(Y == 1)
  FP[i] <- sum(Yhat == 1 & Y == 0) / sum(Y == 0)
}

plot(TP ~ FP, type = "l")
abline(0, 1)  # reference line for a no-skill classifier
lines(TP ~ FP, col = "red")
lines(TP ~ FP, col = "blue")

table(credg$class)

# ---- Make training and testing data sets ----

set.seed(1)  # added so the random split (and rates below) are reproducible
n <- nrow(credg)
i_train <- sample(n, floor(n / 2), replace = FALSE)
credg_train <- credg[i_train, ]
credg_test <- credg[-i_train, ]

glm_train <- glm(class ~ ., family = "binomial", data = credg_train)

# Misclassification on the training set.
pihat_train <- glm_train$fitted.values
c_thresh <- 0.5
Y_train <- credg_train$class == "good"
Yhat_train <- pihat_train >= c_thresh
M_train <- mean(Yhat_train != Y_train)
M_train

# Misclassification on the testing set (predict on held-out rows).
pihat_test <- predict(glm_train, newdata = credg_test, type = "response")
c_thresh <- 0.5
Y_test <- credg_test$class == "good"
Yhat_test <- pihat_test >= c_thresh
M_test <- mean(Yhat_test != Y_test)
M_test