/*  SAS Example of investigating treatment differences */
/* in the Single-Factor ANOVA model                    */

/* We again analyze the Kenton Foods data from the example in class */

/* The response variable is sales and the factor is package design. */
/* The store label is also given in the data set.                   */

/* Read the Kenton Foods observations into WORK.kenton.  Each data line */
/* holds, in order: the sales value (SALES), the package design number  */
/* (DESIGN) and the store label (STORE).                                */
data kenton;
   input SALES DESIGN STORE;
   datalines;
  11  1  1
  17  1  2
  16  1  3
  14  1  4
  15  1  5
  12  2  1
  10  2  2
  15  2  3
  19  2  4
  11  2  5
  23  3  1
  20  3  2
  18  3  3
  17  3  4
  27  4  1
  33  4  2
  22  4  3
  26  4  4
  28  4  5
;
run;

/* *********************************************************************************** */

/* Investigating differences among treatment means through plots and inferences */

/* The F* statistic in the ANOVA table is significant, so we conclude that */
/* the population means are not all equal.                                 */

/* Block chart (a 3-D style bar graph) of the mean sales for each design is */
/* given via the PROC GCHART code.  The title statement gives a title to    */
/* the graphs, and the final title; statement cancels this title.           */

/* Main effects plot given via the PROC GPLOT code                */
/* Note 18.63 is the overall sample mean response (Y-bar-dot-dot) */
/* found on the main PROC GLM output page.                        */

/* The CL option to the LSMEANS statement produces (here, 95%) confidence */
/* intervals for each population factor level mean.                       */
/* Note the CI for the mean sales for package design 1:  (11.5, 17.7).    */

/* PDIFF (together with CL) gives CIs for the difference between any two factor   */
/* level means.  It also gives P-values for the test of equality of any two       */
/* factor level means.                                                            */

/* One-way ANOVA of SALES on the classification factor DESIGN.              */
/*   LSMEANS / CL PDIFF : confidence limits (ALPHA=0.05, i.e. 95%) for each */
/*                        design mean and for each pairwise difference,     */
/*                        plus P-values for the pairwise comparisons.       */
/*   OUTPUT             : writes the predicted values (YBAR) and residuals  */
/*                        (resid) to WORK.smpmeans.                         */
proc glm data=kenton;
   class DESIGN;
   model SALES = DESIGN;
   lsmeans DESIGN / cl alpha=0.05 pdiff;
   output out=smpmeans p=YBAR r=resid;
run;
quit;   /* PROC GLM is interactive: RUN alone leaves it active, QUIT ends it */

/* Block chart of the mean of SALES at each DESIGN level.  DISCRETE treats  */
/* every distinct numeric DESIGN value as its own midpoint.  The title set  */
/* here stays in effect until a later TITLE statement replaces or clears it. */
title "Mean Sales for Each Package Design";
proc gchart data=kenton;
   block DESIGN / sumvar=SALES type=mean discrete;
run;
quit;   /* GCHART supports RUN-group processing; QUIT ends the procedure */

/* Main effects plot: predicted design means (YBAR) against DESIGN, with    */
/* the points joined by a line.  The horizontal reference line marks the    */
/* overall sample mean of SALES.  It is computed from WORK.smpmeans (about  */
/* 18.63 for these data) instead of being typed in by hand, so it stays     */
/* correct if the data change.                                              */
proc sql noprint;
   select mean(SALES) into :grandmean from smpmeans;
quit;

symbol1 i=join v=circle l=32 c=black;
proc gplot data=smpmeans;
   plot YBAR*DESIGN / vref=&grandmean;
run;
quit;    /* GPLOT supports RUN-group processing; QUIT ends the procedure */
title;   /* cancel the current title so it does not carry into later output */

/* ***************************************************************************** */

/* Contrasts:  CIs and Hypothesis Tests */

/* Example:  We want a 95% CI for the difference in the mean sales of the */
/* cartoon designs and the mean sales of the non-cartoon designs */

/* The relevant contrast here is: (1/2)mu_1 - (1/2)mu_2 + (1/2)mu_3 - (1/2)mu_4 */

/* The ESTIMATE statement defines the coefficients of the contrast (these must   */
/* be in the proper order!) and gives the test statistic and P-value of the test */
/* for whether the contrast equals zero.                                         */
/* The CLPARM option to the MODEL statement tells SAS to give a CI (by default,  */
/* a 95% CI) for the contrast.                                                   */

/* Estimate the single contrast (mu_1 - mu_2 + mu_3 - mu_4)/2.              */
/* The ESTIMATE coefficients follow the order of the DESIGN levels and are  */
/* divided by 2 through DIVISOR=.  CLPARM on the MODEL statement requests   */
/* confidence limits for the estimate.                                      */
proc glm data=kenton;
   class DESIGN;
   model SALES = DESIGN / clparm;
   lsmeans DESIGN;
   estimate 'CartoonVsNoncartoon' DESIGN 1 -1 1 -1 / divisor=2;
run;
quit;   /* PROC GLM is interactive: RUN alone leaves it active, QUIT ends it */

/* ***************************************************************************** */

/* Multiple Comparison Procedures */

/* In the MEANS statement, the CLDIFF option gives CIs for all pairwise treatment */
/* mean differences, based on the Tukey procedure.  The ALPHA=0.10 ensures that   */
/* the family confidence level is 90%.  SAS also provides an indication of which  */
/* pairs of treatment means are judged to be significantly different, at the      */
/* 0.10 family significance level, by the Tukey procedure.                        */

/* Tukey multiple comparisons of the DESIGN means.  On the MEANS statement, */
/* TUKEY selects the procedure, ALPHA=0.10 sets the family significance     */
/* level, and CLDIFF reports the results as confidence intervals for every  */
/* pairwise difference.                                                     */
proc glm data=kenton;
   class DESIGN;
   model SALES = DESIGN;
   lsmeans DESIGN;
   means DESIGN / tukey alpha=0.10 cldiff;
run;
quit;   /* PROC GLM is interactive: RUN alone leaves it active, QUIT ends it */

/* We could change TUKEY to SCHEFFE or BON to get the Scheffe or Bonferroni results, */
/* but if we're interested in all pairwise comparisons, these will not be as         */
/* efficient as the Tukey procedure.                                                 */

/* ***************************************************************************** */

/* Testing Multiple Contrasts Simultaneously */

/* Estimate two contrasts in one PROC GLM run.  Each ESTIMATE statement     */
/* reports its own individual t-test, and CLPARM adds confidence limits.    */
/* No multiplicity adjustment is applied in this step.                      */
proc glm data=kenton;
   class DESIGN;
   model SALES = DESIGN / clparm;
   lsmeans DESIGN;
   /* (mu_1 - mu_2 + mu_3 - mu_4)/2 */
   estimate 'CartoonVsNoncartoon' DESIGN 1 -1 1 -1 / divisor=2;
   /* mu_1 - mu_3: which is the better cartoon design? */
   estimate 'Design1VsDesign3' DESIGN 1 0 -1 0;
run;
quit;   /* PROC GLM is interactive: RUN alone leaves it active, QUIT ends it */

/* The P-values from the INDIVIDUAL t-tests about these contrasts are 0.0464 and 0.0399, respectively. */
/* Adjusting these for the multiple tests: */

/* Store the unadjusted p-values, one per data line, in WORK.Pvals.     */
/* The variable holding the raw p-values needs to be named raw_p.       */
data Pvals;
   input raw_p;
   datalines;
.0464
.0399
;
run;

/* Adjust the raw p-values read from WORK.Pvals, requesting the         */
/* Bonferroni (BON), false discovery rate (FDR) and Holm (HOLM)         */
/* adjustments.                                                         */
proc multtest
   inpvalues=Pvals
   bon fdr holm;
run;

/* The BON option performs the Bonferroni adjustment to the P-values.            */
/* This controls the familywise (experimentwise) error rate (FWER), which is the */
/* probability of having at least one H0 rejected if all the H0's are true.      */
 
/* The FDR option performs the Benjamini-Hochberg adjustment to the P-values.     */
/* This controls the false discovery rate (FDR), which is the expected proportion */
/* of incorrectly rejected hypotheses among all rejected hypotheses.              */

/* The Bonferroni method is more conservative and will not detect significant differences */
/* as often, ESPECIALLY when the number of simultaneous tests is fairly large.            */
/* For this reason, it has become popular to choose to control the FDR instead.           */

/* The HOLM option is an adjustment to the Bonferroni method that still controls the */
/* familywise error rate, but is slightly less conservative than the Bonferroni.     */