/* SAS example of multiple linear regression */

/* The data are from the portrait studio example discussed in class. */

/* people16 and income will be our predictor variables (X1 and X2). */
/* sales will be our response variable (Y). */

data studio;
  input people16 income sales;
cards;
  68.5  16.7  174.4
  45.2  16.8  164.4
  91.3  18.2  244.2
  47.8  16.3  154.6
  46.9  17.3  181.6
  66.1  18.2  207.5
  49.5  15.9  152.8
  52.0  17.2  163.2
  48.9  16.6  145.4
  38.4  16.0  137.2
  87.9  18.3  241.9
  72.8  17.1  191.1
  88.4  17.4  232.0
  42.9  15.8  145.3
  52.5  17.8  161.1
  85.7  18.4  209.7
  41.3  16.5  146.4
  51.7  16.3  144.0
  89.6  18.1  232.6
  82.7  19.1  224.1
  52.3  16.0  166.5
  ;
run;

/* In the MODEL statement, the response variable goes on the */
/* left of the equals sign, and the predictors go on the right. */
/* The option "clb" produces 95% CIs for the individual betas. */

PROC REG DATA = studio;
  MODEL sales = people16 income / clb;
run;

 /* NOTE: */
 /* 95% is the default confidence level */
 /* to specify another confidence level, use (e.g., for 90%) */

 * MODEL sales = people16 income / clb alpha = 0.10;


/* Find the results of the overall F-test in the SAS output. */
/* Also find the coefficient of multiple determination R^2.  */

/* Find the 95% CIs for beta_1 and for beta_2 on the SAS output.   */
/* Also the the p-value for the two-sided test for                 */
/* whether the each of the true coefficients is zero.              */

/* Compare the results in the SAS output to the results in the */
/* book (Sec. 6.9) for the portrait studio example.            */

/* Bonferroni and Holm adjustments to the t-test P-values if doing simultaneous tests */

DATA a;
INPUT indepvar $ raw_p;
cards;
people16 .0001
income .0333
;
run;

PROC MULTTEST PDATA=a BON HOLM OUT=adjps;
run;
PROC PRINT DATA=adjps;
run;


/**********************************************************************************/

/* Confidence Interval for E(Y_h)     */
/* & Prediction Interval for Y_h(new) */

/* Example 1:  */
/* We want to estimate the mean sales                      */
/* for cities with 65.4 thousand people 16 or younger      */
/* and per capita disposable income 17.6 thousand dollars, */
/* with a 95% CI.                                          */

/* Example 2:  */
/* We want to estimate the sales for a new city            */
/* having 65.4 thousand people 16 or younger               */
/* and per capita disposable income 17.6 thousand dollars, */
/* with a 95% prediction interval.                         */

/* Let's add the value X1 = 65.4 and X2 = 17.6 to our data set:  */
/* We'll leave a missing value (.) for sales.      */

DATA Xvalues;
input people16 income sales;
CARDS;
65.4 17.6 .
;

DATA studio;
SET studio Xvalues;
;

/* The option clm will give us CIs for the mean of Y, */ 
/* for the values of X1 and X2 in the data set.       */
/* The option cli will give us PIs for the new Y,     */ 
/* for the values of X1 and X2 in the data set.       */

/* The option alpha=0.05 tells SAS we want a 95% CI/PI.  */   
/* We don't actually need that option since 95% is       */
/* the default coverage.                                */

PROC REG DATA = studio;
  MODEL sales = people16 income / clm cli alpha = 0.05;
run;


/* Look at the last line of the "Output Statistics".     */
/* This shows the predictions and confidence intervals   */
/* for the city with x1 = 65.4 and X2 = 17.6.            */
/* We see the fitted value for this observation was      */ 
/* 191.104 thousand dollars.                             */

/* Example 1 answer:  */
/* The 95% CI for mean sales for all cities with     */
/* 65.4 thousand people 16 or younger and per capita */
/* disposable income 17.6 thousand dollars           */
/* is between 185.3 and 196.9 thousand dollars.      */

/* Example 2 answer:  */
/* The 95% PI for sales for a new city having        */
/* 65.4 thousand people 16 or younger and per capita */
/* disposable income 17.6 thousand dollars           */
/* is between 167.3 and 214.9 thousand dollars.      */


/*********** Residual Plots ************/

/*** The following code will produce residual plots for this multiple regression ****/

PROC REG DATA=studio;
MODEL sales = people16 income / P R SPEC;
OUTPUT OUT=NEW P=PRED R=RES;
PROC GPLOT DATA=NEW;
PLOT RES*PRED='+'/ VREF=0;
PROC UNIVARIATE noprint ;
  QQPLOT RES / normal;
RUN; 

/*** The SPEC option in the model statement produces a modification of ***/
/*** the Breusch-Pagan test that is called the White test.             ***/
/*** The P-value is 0.4316.  What is the conclusion?                   ***/

/*** And the Shapiro-Wilk test for normality is produced by: ****/

PROC UNIVARIATE DATA=NEW normal;
  VAR RES;
RUN; 

/* Look in the output under "Tests of Normality".            */
/* We see a test statistic of 0.954 and a P-value of 0.4056. */
/* So the hypothesis of normal errors is reasonable.         */

/**** Alternatively, you can use PROC MODEL to perform the Breusch-Pagan test: *****/

proc model data=studio;
  parms beta0 beta1 beta2 ;
  sales = beta0 + beta1*people16 + beta2*income ;
  fit sales / white breusch=(1 people16 income);
run;