/* SAS example of multiple logistic regression */ /* The data set is the disease outbreak data, */ /* We will use the binary response variable, disease status */ /* We will include two predictors, age and city sector. */ /* We will read in the other columns of the data set but */ /* will ignore them in the analysis. */ DATA disout; INPUT obsnum age col3 col4 sector disease; cards; 1 33 0 0 0 0 2 35 0 0 0 0 3 6 0 0 0 0 4 60 0 0 0 0 5 18 0 1 0 1 6 26 0 1 0 0 7 6 0 1 0 0 8 31 1 0 0 1 9 26 1 0 0 1 10 37 1 0 0 0 11 23 0 0 0 0 12 23 0 0 0 0 13 27 0 0 0 0 14 9 0 0 0 1 15 37 0 0 1 1 16 22 0 0 1 1 17 67 0 0 1 1 18 8 0 0 1 0 19 6 0 0 1 1 20 15 0 0 1 1 21 21 1 0 1 1 22 32 1 0 1 1 23 16 0 0 1 1 24 11 1 0 1 0 25 14 0 1 1 0 26 9 1 0 1 0 27 18 1 0 1 0 28 2 0 1 0 0 29 61 0 1 0 0 30 20 0 1 0 0 31 16 0 1 0 0 32 9 1 0 0 0 33 35 1 0 0 0 34 4 0 0 0 0 35 44 0 1 1 0 36 11 0 1 1 1 37 3 1 0 1 0 38 6 0 1 1 0 39 17 1 0 1 1 40 1 0 1 1 0 41 53 1 0 1 1 42 13 0 0 1 1 43 24 0 0 1 0 44 70 0 0 1 1 45 16 0 1 1 1 46 12 1 0 1 0 47 20 0 1 1 1 48 65 0 1 1 0 49 40 1 0 1 1 50 38 1 0 1 1 51 68 1 0 1 1 52 74 0 0 1 1 53 14 0 0 1 1 54 27 0 0 1 1 55 31 0 0 1 0 56 18 0 0 1 0 57 39 0 0 1 0 58 50 0 0 1 0 59 31 0 0 1 0 60 61 0 0 1 0 61 18 0 1 0 0 62 5 0 1 0 0 63 2 0 1 0 0 64 16 0 1 0 0 65 59 0 1 0 1 66 22 0 1 0 0 67 24 0 0 0 0 68 30 0 0 0 0 69 46 0 0 0 0 70 28 0 0 0 0 71 27 0 0 0 0 72 27 0 0 0 1 73 28 0 0 0 0 74 52 0 0 0 1 75 11 0 1 0 0 76 6 1 0 0 0 77 46 0 1 0 0 78 20 1 0 0 1 79 3 0 0 0 0 80 18 1 0 0 0 81 25 1 0 0 0 82 6 0 1 0 0 83 65 0 1 0 1 84 51 0 1 0 0 85 39 1 0 0 0 86 8 0 0 0 0 87 8 1 0 0 0 88 14 0 1 0 0 89 6 0 1 0 0 90 6 0 1 0 0 91 7 0 1 0 0 92 4 0 1 0 0 93 8 0 1 0 0 94 9 1 0 0 0 95 32 0 1 0 1 96 19 0 1 0 0 97 11 0 1 0 0 98 35 0 1 0 0 ; run; /* PROC LOGISTIC will fit a logistic regression model. */ /* The DESCENDING option is important! */ /* It defines pi as P(Y=1) as we did in class, rather than as P(Y=0). */ /* We create an output data set called NEW that contained the predicted probabilities. */ PROC LOGISTIC DESCENDING DATA = disout; MODEL disease = age sector / LACKFIT; OUTPUT OUT=NEW P=PRED; RUN; /******************** INFERENCE IN LOGISTIC REGRESSION **********************/ /* Inference for the whole set of betas: Note the Likelihood Ratio Test is */ /* highly significant (P-value < .0001, bottom of first page of output). */ /* To test the significance of individual predictors, we see the Wald test */ /* is significant at the .05 level for both predictors: (P-value = .0262 */ /* for age, P-value = .0006 for sector). */ /* Note SAS gives the Chi-square statistic, the square of z*. */ /* Approximate 95% CIs for the odds ratio associated with each predictor */ /* are given as well. */ /* To obtain, say, approximate 99% CIs for those odds ratios just */ /* include ALPHA=0.01 as an option in the MODEL statement above: */ * MODEL disease = age sector / LACKFIT ALPHA=0.01; /****************************************************************************/ /* Model Selection Criteria: */ /* Note for this model, AIC = 108.259 and BIC = 116.014 (SAS calls this SC).*/ /******** Goodness of Fit: *******/ /* Output: With the LACKFIT option, SAS provides a Hosmer-Lemeshow test for */ /* "H0: the logistic regression fits well". */ /* With a P-value of 0.9295, we cannot reject this null hypothesis. */ /* We have no reason to doubt the logistic model's fit. */ /* So it's fine to use the logistic regression model. */ /*****************************************************/ /* CI for probability of success for new observation */ /* Including an extra observation with "age" = 37, sector = 1, and */ /* a missing value for "disease" */ /* INPUT statement must be exactly the same as in original DATA step */ DATA Xvalues; INPUT obsnum age col3 col4 sector disease; cards; . 37 . . 1 . ; DATA disout; SET disout Xvalues; run; /* Getting the 90% CI for P(Y_h=1 | X1=37,X2=1) (The ALPHA=0.10 option will yield a 90% CI) */ PROC LOGISTIC DESCENDING DATA=disout; MODEL disease = age sector / LACKFIT; OUTPUT OUT=NEW P=PRED L=LOWER_90 U=UPPER_90 / ALPHA=0.10; RUN; /* Printing the data, the point estimate, and the 90% CI: */ PROC PRINT DATA = NEW; VAR age sector disease PRED LOWER_90 UPPER_90; run;