/* Model Building Example                                              */
/* Surgical Unit Data, Table 9.1 in book                               */
/*                                                                     */
/* Workflow of this script:                                            */
/*   1. Read the 54 surgical-unit cases into WORK.SURG.                */
/*   2. Define and call the %scatter macro (scatterplot matrix).       */
/*   3. Screen predictors with stepwise selection in PROC REG.         */
/*   4. Compare all subsets of the retained predictors (RSQUARE).      */
/*   5. Fit the two candidate models and request VIFs.                 */

/* Each data line holds one case with ten list-input values:          */
/* x1-x8 (predictors), y (survival) and lny (log survival, supplied   */
/* in the data rather than computed here).                            */
DATA surg;
   INPUT x1 x2 x3 x4 x5 x6 x7 x8 y lny;
   LABEL x1  = 'blood-clotting'
         x2  = 'prognostic'
         x3  = 'enzyme'
         x4  = 'liver function'
         x5  = 'age'
         x6  = 'gender'
         x7  = 'moderate alcohol use'
         x8  = 'heavy alcohol use'
         y   = 'survival'
         lny = 'log survival';
   /* In-stream data must start on the line after DATALINES (alias: CARDS) */
   /* and is terminated by a line containing only a semicolon.             */
   DATALINES;
6.7 62 81 2.59 50 0 1 0 695 6.544
5.1 59 66 1.70 39 0 0 0 403 5.999
7.4 57 83 2.16 55 0 0 0 710 6.565
6.5 73 41 2.01 48 0 0 0 349 5.854
7.8 65 115 4.30 45 0 0 1 2343 7.759
5.8 38 72 1.42 65 1 1 0 348 5.852
5.7 46 63 1.91 49 1 0 1 518 6.250
3.7 68 81 2.57 69 1 1 0 749 6.619
6.0 67 93 2.50 58 0 1 0 1056 6.962
3.7 76 94 2.40 48 0 1 0 968 6.875
6.3 84 83 4.13 37 0 1 0 745 6.613
6.7 51 43 1.86 57 0 1 0 257 5.549
5.8 96 114 3.95 63 1 0 0 1573 7.361
5.8 83 88 3.95 52 1 0 0 858 6.754
7.7 62 67 3.40 58 0 0 1 702 6.554
7.4 74 68 2.40 64 1 1 0 809 6.695
6.0 85 28 2.98 36 1 1 0 682 6.526
3.7 51 41 1.55 39 0 0 0 205 5.321
7.3 68 74 3.56 59 1 0 0 550 6.309
5.6 57 87 3.02 63 0 0 1 838 6.731
5.2 52 76 2.85 39 0 0 0 359 5.883
3.4 83 53 1.12 67 1 1 0 353 5.866
6.7 26 68 2.10 30 0 0 1 599 6.395
5.8 67 86 3.40 49 1 1 0 562 6.332
6.3 59 100 2.95 36 1 1 0 651 6.478
5.8 61 73 3.50 62 1 1 0 751 6.621
5.2 52 86 2.45 70 0 1 0 545 6.302
11.2 76 90 5.59 58 1 0 1 1965 7.583
5.2 54 56 2.71 44 1 0 0 477 6.167
5.8 76 59 2.58 61 1 1 0 600 6.396
3.2 64 65 0.74 53 0 1 0 443 6.094
8.7 45 23 2.52 68 0 1 0 181 5.198
5.0 59 73 3.50 57 0 1 0 411 6.019
5.8 72 93 3.30 39 1 0 1 1037 6.944
5.4 58 70 2.64 31 1 1 0 482 6.179
5.3 51 99 2.60 48 0 1 0 634 6.453
2.6 74 86 2.05 45 0 0 0 678 6.519
4.3 8 119 2.85 65 1 0 0 362 5.893
4.8 61 76 2.45 51 1 1 0 637 6.457
5.4 52 88 1.81 40 1 0 0 705 6.558
5.2 49 72 1.84 46 0 0 0 536 6.283
3.6 28 99 1.30 55 0 0 1 582 6.366
8.8 86 88 6.40 30 1 1 0 1270 7.147
6.5 56 77 2.85 41 0 1 0 538 6.288
3.4 77 93 1.48 69 0 1 0 482 6.178
6.5 40 84 3.00 54 1 1 0 611 6.416
4.5 73 106 3.05 47 1 1 0 960 6.867
4.8 86 101 4.10 35 1 0 1 1300 7.170
5.1 67 77 2.86 66 1 0 0 581 6.365
3.9 82 103 4.55 50 0 1 0 1078 6.983
6.6 77 46 1.95 50 0 1 0 405 6.005
6.4 85 40 1.21 58 0 0 1 579 6.361
6.4 59 85 2.33 63 0 1 0 550 6.310
8.8 78 72 3.20 56 0 0 0 651 6.478
;
RUN;

/* PRODUCING THE SCATTERPLOT MATRIX */

/******************** BEGIN MACRO **************************************/
/*-------------------------------------------------------------------*
 *    Name: SCATTER.SAS                                              *
 *   Title: Construct a scatterplot matrix - all pairwise plots      *
 *          for n variables.                                         *
 *     Doc: http://www.math.yorku.ca/SCS/sasmac/scatter.html         *
 *                                                                   *
 *          %scatter(data=, var=, group=);                           *
 *-------------------------------------------------------------------*
 *  Author:  Michael Friendly                                        *
 * Created:  23 Oct 1996                                             *
 * Revised:  1 Oct 1997 12:00:16                                     *
 * Version:  1.1                                                     *
 *-------------------------------------------------------------------*
 * Local modifications in this copy:                                 *
 *  - variable-name buffer widened from 8 to 32 characters so that   *
 *    long variable names are not truncated when a shorthand list    *
 *    (a-b or _NUMERIC_) is expanded; list buffer widened to 1024    *
 *    and trimmed before being stored in the VAR macro variable      *
 *  - the group variable is excluded from the expanded list by a     *
 *    case-insensitive comparison                                    *
 *  - NVAR and ABORT are declared %local so that CALL SYMPUT cannot  *
 *    overwrite same-named macro variables in an outer scope         *
 *-------------------------------------------------------------------*
 * Parameters:                                                       *
 *   data=    input data set (default: most recently created)        *
 *   var=     variables to plot (explicit list, a-b shorthand, or    *
 *            _NUMERIC_)                                             *
 *   class=   name of class/group variable (used when group= empty)  *
 *   group=   name of class/group variable                           *
 *   where=   where clause to select observations                    *
 *   id=      variable passed to the LABEL= option of SCATTER        *
 *   symbols= symbol names passed to the paint macro                 *
 *   colors=  color names passed to the paint macro                  *
 *   out=     output data set written by the paint macro             *
 * Requirements visible in the code: SAS/INSIGHT must be licensed,   *
 * the session must not be batch, and when a group variable is given *
 * a paint macro (not defined in this file) must be available.       *
 *-------------------------------------------------------------------*/
%macro scatter(data=_LAST_,
               var=_NUMERIC_,     /* variables to plot                     */
               class=,            /* name of class/group variable          */
               group=,            /* name of class/group variable          */
               where=,            /* where clause to select observations   */
               id=,
               symbols=square plus circle diamond X up down star,
               colors=BLACK RED GREEN BLUE BROWN YELLOW ORANGE PURPLE,
               out=_paint_ );

   /* Keep the work variables written by CALL SYMPUT local to this macro. */
   %local nvar abort;

   /* Guard clauses: bail out when PROC INSIGHT cannot be used. */
   %if %sysprod(insight) ^= 1 %then %do;
      %put This program requires SAS/INSIGHT;
      %goto done;
   %end;
   %if &sysenv = BACK %then %do;
      %put This program does not run in batch;
      %goto done;
   %end;

   /* class= is accepted as a synonym for group= */
   %if &group=%str() %then
      %if &class ^= %str() %then %let group = &class;

   /*-- Parse variables list (only the first observation is read) */
   data _null_;
      set &data (obs=1);

      /* Resolve _LAST_ to the actual data set name for later steps. */
      if upcase(symget('data')) eq '_LAST_'
         then call symput('data', symget('syslast'));
      call symput('abort', put(_error_ ne 0, 1.));

      %if %index(&var,-) > 0 or %upcase(&var)=_NUMERIC_ %then %do;
         /* Find the number of variables in the list and convert the   */
         /* shorthand variable list to long form.                      */
         /* 32 = maximum length of a SAS variable name.                */
         length _vname_ $ 32 _vlist_ $ 1024;
         array _xx_ &var;
         _vname_ = ' ';
         do over _xx_;
            call vname(_xx_,_vname_);
            /* Skip the group variable; SAS names are case-insensitive. */
            if upcase(_vname_) ne upcase("&group") then do;
               nvar + 1;
               if nvar = 1 then startpt = 1;
                           else startpt = length(_vlist_) + 2;
               endpt = length(_vname_);
               substr(_vlist_,startpt,endpt) = _vname_;
            end;
         end;
         call symput( 'VAR', trim(_vlist_) );
         put nvar=;
      %end;
      %else %do;
         /* Explicit list: count the variables (non-missing + missing). */
         nvar = n(of &var) + nmiss(of &var);
      %end;

      call symput('NVAR',trim(left(put(nvar,2.))));
   RUN;
   %put nvar= &nvar;

   %if &nvar < 2 or &nvar > 15 %then %do;
      %put Cannot do a scatterplot matrix for &nvar variables ;
      %goto DONE;
   %end;

   /* With a group variable, color/symbol codes are assigned by the    */
   /* paint macro and its output data set is plotted instead.          */
   %if &group ^= %str() %then %do;
      %put Assigning color/symbol codes for &group variable... results in &out;
      %paint(data=&data, out=&out, var=&group,level=nominal,colors=&colors,
             symbols=&symbols);
      %let data=&out;
   %end;

   proc insight data=&data
      %if %length(&where) %then %do;
         (where = (&where))
      %end;
      ;
      scatter &var * &var
      %if &id ^= %str() %then %do;
         / label=&id;
      %end;
      ;
   run;

%done:;
%mend;
/******************** END MACRO **************************************/

/* invoking the macro with our data set */
%scatter(data = surg,
         var = lny x1 x2 x3 x4 x5 x6 x7 x8) ;

/* No obvious curved trends between lny and any predictor */
/* Some moderate correlation between predictors           */
/* Could examine this further with PROC CORR:             */

* PROC CORR DATA = surg;
*    VAR lny x1 x2 x3 x4 x5 x6 x7 x8;
* RUN;

/* Preliminary Winnowing Down of Models Using STEPWISE option   */
/* (We're trusting the book's advice that including possible    */
/* interaction terms here is not necessary.)                    */
/* SLENTRY= is the significance level required to enter the     */
/* model; SLSTAY= is the level required to remain in it.        */
/* PROC REG is interactive: RUN executes the statements, QUIT   */
/* ends the procedure.                                          */

PROC REG DATA = surg;
   MODEL lny = x1 x2 x3 x4 x5 x6 x7 x8
         / SELECTION = stepwise SLENTRY = .15 SLSTAY = .20;
RUN;
QUIT;

/* We've narrowed the pool of candidate predictors down to x1, x2, x3, x6, x8. */

/* "All-possible Models" selection using RSQUARE */

PROC REG DATA = surg;
   MODEL lny = x1 x2 x3 x6 x8
         / SELECTION = rsquare ADJRSQ CP MSE;
RUN;
QUIT;

/* The model with the highest adjusted R-square is the model with      */
/* all 5 predictors, but just slightly lower is the model with the     */
/* 4 predictors x1, x2, x3, x8.  Both of these choices have nice       */
/* small Cp values that are close to "p" for that model.               */

/* Either candidate model is reasonable, although I would tend to      */
/* favor the 4-variable model on the principle of simplicity, given    */
/* that the final predictor (x6) isn't adding that much benefit.       */

/* Fitting Candidate Model with predictors x1, x2, x3, x8 */

PROC REG DATA = surg;
   MODEL lny = x1 x2 x3 x8 / VIF;
RUN;
QUIT;

/* Fitting Candidate Model with predictors x1, x2, x3, x6, x8 */

PROC REG DATA = surg;
   MODEL lny = x1 x2 x3 x6 x8 / VIF;
RUN;
QUIT;

/* Note that in the presence of the other four predictors, x6      */
/* is not really significant (p-value = 0.14).  The four-variable  */
/* model seems to be the best one.                                 */