/* Multiple Regression example */
/* California rain data */

/* The data set is called "calirain".                                   */
/* The variables are: number city precip altitude latitude distance.    */
/* "city" is a qualitative (non-numeric) variable, so it is read as     */
/* character.  Plain list input ("city $") would store only the first   */
/* 8 characters, truncating names such as SanFrancisco, so the          */
/* :$12. informat is used to keep the full name (longest is 12 chars).  */
/* NOTE(review): observations 3 (Thermal) and 18 (Susanville) carry the */
/* same precip (18.20) and altitude (4152) -- verify against the        */
/* original data source; the values are left exactly as given here.     */

DATA calirain;
    INPUT number city :$12. precip altitude latitude distance;
    DATALINES;
1 Eureka 39.57 43 40.8 1
2 RedBluff 23.27 341 40.2 97
3 Thermal 18.20 4152 33.8 70
4 FortBragg 37.48 74 39.4 1
5 SodaSprings 49.26 6752 39.3 150
6 SanFrancisco 21.82 52 37.8 5
7 Sacramento 18.07 25 38.5 80
8 SanJose 14.17 95 37.4 28
9 GiantForest 42.63 6360 36.6 145
10 Salinas 13.85 74 36.7 12
11 Fresno 9.44 331 36.7 114
12 PtPiedras 19.33 57 35.7 1
13 PasaRobles 15.67 740 35.7 31
14 Bakersfield 6.00 489 35.4 75
15 Bishop 5.73 4108 37.3 198
16 Mineral 47.82 4850 40.4 142
17 SantaBarbara 17.95 120 34.4 1
18 Susanville 18.20 4152 40.3 198
19 TuleLake 10.03 4036 41.9 140
20 Needles 4.63 913 34.8 192
21 Burbank 14.74 699 34.2 47
22 LosAngeles 15.02 312 34.1 16
23 LongBeach 12.36 50 33.8 12
24 LosBanos 8.26 125 37.8 74
25 Blythe 4.05 268 33.6 155
26 SanDiego 9.94 19 32.7 5
27 Daggett 4.25 2105 34.1 85
28 DeathValley 1.66 -178 36.5 194
29 CrescentCity 74.87 35 41.7 1
30 Colusa 15.95 60 39.2 91
;
RUN;

/* PROC REG gives us a basic regression output: */

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance;
RUN;
QUIT;

/* PROC GLM gives us more stuff that is useful for inference about the model: */

PROC GLM DATA=calirain;
    MODEL precip = altitude latitude distance;
RUN;
QUIT;

/* Can you interpret what the Type I SS section and Type III SS section are saying? */

/************************************************************************************/

/* Testing about sets of coefficients in PROC REG: */

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance;
    /* This tests whether beta_1=beta_3=0, given that X2 is in the model */
    TEST altitude=0, distance=0;
RUN;
QUIT;

/************************************************************************************/

/* INFERENCES ABOUT THE RESPONSE VARIABLE */

/* We want to (1) estimate the mean precipitation for cities of altitude 100 feet, */
/* latitude 40 degrees, and 70 miles from the coast,                               */
/* and (2) predict the precipitation of a new city of altitude 100 feet,           */
/* latitude 40 degrees, and 70 miles from the coast.                               */

/* One extra observation holding only the X values of interest; number, city and   */
/* precip are missing (".").  The same :$12. informat keeps city's length          */
/* consistent with calirain.                                                       */
DATA Xvalues;
    INPUT number city :$12. precip altitude latitude distance;
    DATALINES;
. . . 100 40 70
;
RUN;

/* Append the extra observation to calirain.  Its response is missing, so it does  */
/* not enter the fit, but PROC REG still reports predicted values and intervals    */
/* for it.  This overwrites calirain, so every later step sees 31 observations;    */
/* re-running only this step would append the row again.                           */
DATA calirain;
    SET calirain Xvalues;
RUN;

/* The options clm and cli will give us CIs for the mean of Y and PIs for Y, */
/* for the values of X1, X2, X3 in the data set (alpha=.10 -> 90% limits).   */

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / clm cli alpha=.10;
RUN;
QUIT;

/************************************************************************************/

/*** The following code will produce residual plots for this multiple regression ****/

/* Save predicted values (PRED) and residuals (RES) to the data set NEW. */
PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / P R;
    OUTPUT OUT=NEW P=PRED R=RES;
RUN;
QUIT;

/* Residuals versus predicted values, with a reference line at 0. */
PROC SGPLOT DATA=NEW;
    SCATTER y=RES x=PRED;
    REFLINE 0;
RUN;

/* Normal Q-Q plot of the residuals.  DATA=NEW is named explicitly rather than */
/* relying on NEW being the most recently created data set.                    */
PROC UNIVARIATE DATA=NEW noprint;
    QQPLOT RES / normal;
RUN;

/****************************************************************************************/

/* Getting Variance Inflation Factors and Influence Statistics is easy: */
/* just add some options to the MODEL statement: */

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / VIF;
RUN;
QUIT;

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / influence R;
RUN;
QUIT;

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / VIF influence R;
RUN;
QUIT;

/* In the output, note that "Student Residual" lists the   */
/* internally studentized residuals, while "RStudent" lists */
/* the externally studentized residuals.                    */

/* The studentized residuals we discussed in class          */
/* (which we compare in absolute value to 2.5) are the      */
/* internally studentized residuals.                        */

/****************************************************************************************/

/* SAS can help with automated variable selection guides: */
/* We add the "selection" option to the MODEL statement: */

/* Using the C(p) and Adjusted R^2 Criteria to find the best model(s): */

PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / selection = rsquare cp adjrsq;
RUN;
QUIT;

/* The following is discussed in "Other Selection Procedures" on pg. 434-438 */

/* PROC STEPWISE is not available in current SAS releases; its forward (f),    */
/* backward (b) and stepwise methods are requested through SELECTION= on the   */
/* MODEL statement of PROC REG.  One MODEL statement per method.               */
PROC REG DATA=calirain;
    MODEL precip = altitude latitude distance / selection = forward;
    MODEL precip = altitude latitude distance / selection = backward;
    MODEL precip = altitude latitude distance / selection = stepwise;
RUN;
QUIT;

/****************************************************************************************/