/* SAS example of simple logistic regression */
/* The data set is the programming task data, */
/* table 14.1, p. 566 of the book. */
/* We will use the binary response variable, "success", */
/* and the predictor "experience". */

DATA progtask;
INPUT exper success;
cards;
14 0
29 0
6 0
25 1
18 1
4 0
18 0
12 0
22 1
6 0
30 1
11 0
30 1
5 0
20 1
13 0
9 0
32 1
24 0
13 1
19 0
4 0
28 1
22 1
8 1
;
run;

/* A simple plot of the data set */
goptions reset=all;
PROC GPLOT DATA=progtask;
PLOT success*exper;
run;

/* PROC LOGISTIC will fit a logistic regression model. */
/* The DESCENDING option is important! */
/* It defines pi as P(Y=1), as we did in class, rather than as P(Y=0). */
/* We create an output data set called NEW that contains the predicted probabilities. */

PROC LOGISTIC DESCENDING DATA=progtask;
MODEL success = exper / LACKFIT INFLUENCE;
OUTPUT OUT=NEW P=PRED;
RUN;

/* Printing the fitted values: */
PROC PRINT data=NEW;
VAR exper PRED;
run;

/**************************************************************************************************/
/* PLOT OF ESTIMATED LOGISTIC CURVE */
/* Using PROC GPLOT gives a nice-looking plot of the estimated logistic regression curve. */
/* This plot comes up in a separate window from the OUTPUT window. */
/* The data must first be sorted by the X variable before using PROC GPLOT */
/* (since we are going to connect the dots). */
/* We use a circle as the plotting symbol for the first curve (the original binary data) */
/* and a star as the plotting symbol for the 2nd curve (the predicted probabilities). */

PROC SORT DATA=NEW;
BY exper;
run;

symbol1 v=circle l=32 c=black;
symbol2 i=join v=star l=32 c=black;
PROC GPLOT DATA=NEW;
PLOT success*exper PRED*exper / OVERLAY;
RUN;

/******************** INFERENCE IN LOGISTIC REGRESSION **********************/
/* To test the significance of the predictor, we may use the Wald test, which */
/* is significant at the .05 level for "experience" (P-value = .0129).
*/
/* Note that SAS gives the Chi-square statistic, which is the square of z*. */
/* With a single predictor, the LR test tests the same hypothesis, but */
/* its P-value differs: .0029. */

/* An approximate 95% CI for the odds ratio (for a one-month increase */
/* in experience) is given as well: (1.035, 1.335). */
/* To obtain, say, an approximate 99% CI for this odds ratio, we would */
/* include ALPHA=0.01 as an option in the MODEL statement above: */
* MODEL success = exper / LACKFIT INFLUENCE ALPHA=0.01;

/****************************************************************************/
/******** Goodness of Fit: *******/
/* Output: With the LACKFIT option, SAS provides a Hosmer-Lemeshow test of */
/* "H0: the logistic regression model fits well". */
/* With a P-value of 0.5253, we cannot reject this null hypothesis, */
/* so we have no reason to doubt the logistic model's fit, */
/* and it is fine to use the logistic regression model here. */

/******** Diagnostics: *******/
/* The INFLUENCE option in the MODEL statement gives Pearson residuals and */
/* other diagnostic measures (e.g., hat diagonal elements). Note that */
/* observations 2 and 25 have somewhat large absolute Pearson residuals. */

/******** CI for pi_h: *******/
/* Include an extra observation with "experience" = 10 months and */
/* a missing value for "success": */

DATA Xvalues;
INPUT exper success;
cards;
10 .
;
run;

DATA progtask;
SET progtask Xvalues;
run;

/* Getting the 90% CI for pi_h (the ALPHA=0.10 option yields a 90% CI) */
PROC LOGISTIC DESCENDING DATA=progtask;
MODEL success = exper / LACKFIT;
OUTPUT OUT=NEW P=PRED L=LOWER_90 U=UPPER_90 / ALPHA=0.10;
RUN;

/* Printing the data, the point estimate, and the 90% CI: */
PROC PRINT DATA=NEW;
VAR exper success PRED LOWER_90 UPPER_90;
run;
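/* For readers without SAS, the fit above can be reproduced outside SAS. */
/* The sketch below is a pure-Python Newton-Raphson (IRLS) fit of the same */
/* simple logistic model on the same 25 observations; the names fit_logistic, */
/* b0, and b1 are our own, not SAS output. The estimated odds ratio exp(b1) */
/* should fall inside the 95% CI (1.035, 1.335) reported above. */

```python
# Pure-Python Newton-Raphson (IRLS) fit of the simple logistic model
#   P(success = 1) = 1 / (1 + exp(-(b0 + b1 * exper)))
# on the programming-task data (Table 14.1). A cross-check sketch only;
# variable names b0, b1 are ours, not SAS's.
import math

# (experience in months, success) pairs from the DATA step above
data = [(14, 0), (29, 0), (6, 0), (25, 1), (18, 1), (4, 0), (18, 0),
        (12, 0), (22, 1), (6, 0), (30, 1), (11, 0), (30, 1), (5, 0),
        (20, 1), (13, 0), (9, 0), (32, 1), (24, 0), (13, 1), (19, 0),
        (4, 0), (28, 1), (22, 1), (8, 1)]

def fit_logistic(data, iters=25):
    """Maximize the logistic log-likelihood by Newton's method."""
    b0 = b1 = 0.0
    for _ in range(iters):
        # Score vector (g0, g1) and Fisher information (h00, h01, h11)
        g0 = g1 = h00 = h01 = h11 = 0.0
        for x, y in data:
            p = 1.0 / (1.0 + math.exp(-(b0 + b1 * x)))
            w = p * (1.0 - p)
            g0 += y - p
            g1 += (y - p) * x
            h00 += w
            h01 += w * x
            h11 += w * x * x
        # Newton step: solve the 2x2 system H * delta = g by hand
        det = h00 * h11 - h01 * h01
        b0 += (h11 * g0 - h01 * g1) / det
        b1 += (h00 * g1 - h01 * g0) / det
    return b0, b1

b0, b1 = fit_logistic(data)
odds_ratio = math.exp(b1)                         # odds ratio per extra month
p10 = 1.0 / (1.0 + math.exp(-(b0 + b1 * 10)))     # pi-hat at exper = 10
print(b0, b1, odds_ratio, p10)
```

/* The point estimate p10 is the same quantity whose 90% CI the final */
/* PROC LOGISTIC call computes via the added exper = 10 observation. */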