/* In SAS Studio, upload data files into folder and then: */ filename cohort08 ("/home/davidhitchcock/sasuser.v94/stat541/Fall8.txt" "/home/davidhitchcock/sasuser.v94/stat541/Fall9.txt" "/home/davidhitchcock/sasuser.v94/stat541/Fall10.txt"); data all; infile cohort08 dlm='09'X missover dsd truncover; input major :$38. degree :$16. class :$9. cltotgpa gender $ nobs; run; *Create separate data sets by year; data fall08 fall09 fall10; set all; if _n_ le 3364 then output fall08; else if _n_ le 6728 then output fall09; else output fall10; run; *NOTSORTED option; proc freq data=fall08; by gender notsorted; table major; run; proc print data=fall08 (obs=15); run; /* A bit easier to understand */ proc means data=sashelp.cars mean; var weight; by type notsorted; run; /* A nice example of grouped but not sorted data */ data salesmo; input sales month $; cards; 34 Jan 23 Jan 27 Jan 38 Jan 22 Feb 32 Feb 17 Feb 35 Feb 25 Mar 36 Mar 34 Mar 24 Mar 23 Apr 12 Apr 13 Apr 16 Apr 18 May 25 May 26 May 36 May 14 Jun 18 Jun 29 Jun 23 Jun ; run; proc means data=salesmo mean maxdec=2; by month; run; /* causes error*/ proc means data=salesmo mean maxdec=2; by month notsorted; run; /* works great */ /* CLASS statement does not require presorting */ proc means data=salesmo mean maxdec=2; CLASS month; run; *GROUPFORMAT statement; proc format; value $ class "Senior","Junior"="Upper Class" "Freshman","Sophomore"="Under Class" Other="Missing"; run; *This doesn't work correctly because data is not pregrouped; *The output is interesting though; data class08(keep=count class); set fall08 (keep=class); format class $class.; by class groupformat notsorted; if first.class then count=0; count+1; if last.class; run; proc print data=class08 (obs=10); run; proc print data=fall08 (obs=15); run; /* Again on the sashelp.cars data set */ *GROUPFORMAT statement; proc format; value $ cartype "Asia","Europe"="Foreign" "USA"="Domestic"; run; data cars3(keep = make model origin count); set sashelp.cars; format origin $cartype.; by origin groupformat notsorted; if first.origin then count=0; count+1; if last.origin; run; proc print data=cars3; run; /* Similar example on the sales-month data */ *GROUPFORMAT statement; proc format; value $ season "Jan", "Feb", "Mar"="Winter" "Apr", "May", "Jun"="Spring"; run; data salesmos; set salesmo; format month $season.; by month groupformat notsorted; /* also try without NOTSORTED option */ run; proc print data=salesmos; run; proc means data=salesmos mean maxdec=2; BY month notsorted; run; /* CLASS statement does not require presorting */ proc means data=salesmos mean maxdec=2; CLASS month; run; *Checking for sorted data; data fall08; set fall08; proc sort data=fall08; by major gender; proc contents data=fall08; run; *SORTEDBY option--fall08 had been sorted earlier; data fall08soph (sortedby=major gender); set fall08; where class="Sophomore"; run; /* Just to check, see LOG */ proc sort data=fall08soph; by major gender; run; proc means data=fall08soph; by major; *BY GENDER will not work, but CLASS GENDER will work; run; /* Removing duplicate observations */ *NODUPKEY option; data salesmo; input sales month $; cards; 34 Jan 23 Jan 27 Jan 38 Jan 22 Feb 32 Feb 17 Feb 35 Feb 25 Mar 36 Mar 34 Mar 24 Mar 23 Apr 12 Apr 13 Apr 16 Apr 18 May 25 May 26 May 36 May 14 Jun 18 Jun 29 Jun 23 Jun ; run; proc sort data=salesmo nodupkey; by month; run; proc print; run; data nchamp; FILENAME webpage URL 'http://people.stat.sc.edu/hitchcock/champ36pres.txt'; INFILE webpage firstobs=7 truncover; INPUT year 4-7 team $ 9-27 @30 wins 30-31 losses 33 ties 35 Coach $19.; retain maxyear; maxyear=max(maxyear, year); if year= . then year + maxyear; drop maxyear; run; proc print data=nchamp; run; data nchamp2; set nchamp; run; proc sort data=nchamp2 nodupkey; BY team coach; run; proc print; run; data nchamp2; set nchamp; run; proc sort data=nchamp2 nodupkey; BY coach team; run; proc print; run; * NODUPRECS option; data carslimit; set sashelp.cars; keep make type origin drivetrain enginesize cylinders; run; proc print data=carslimit; run; proc sort data=carslimit noduprecs; by make drivetrain; run; proc print; run; * first. and last. options; data carslimit; set sashelp.cars; keep make type origin drivetrain enginesize cylinders; run; proc print data=carslimit; run; proc sort data=carslimit out=carslimsorted; by enginesize; run; data carslimit2; set carslimsorted; by enginesize; if first.enginesize; run; proc print; run; * try with LAST.enginesize; *Chapter 24--INDEX quantiles; * Code to import data set that I have saved in a folder ; * I generated this using the Import Data menu choice; /* Generated Code (IMPORT) */ /* Source File: MedDB.csv */ /* Source Path: /home/davidhitchcock/sasuser.v94/stat541 */ /* Code generated on: Thursday, January 28, 2016 10:25:22 PM */ %web_drop_table(WORK.MEDDB); FILENAME REFFILE "/home/davidhitchcock/sasuser.v94/stat541/MedDB.csv" TERMSTR=CR; PROC IMPORT DATAFILE=REFFILE DBMS=CSV OUT=WORK.MEDDB; GETNAMES=YES; DATAROW=2; RUN; PROC CONTENTS DATA=WORK.MEDDB; RUN; %web_open_table(WORK.MEDDB); *Be sure to assign formats to meddb when you import it; proc sql; create table meddb as select ssn format=ssn11., btrim(tos) as tos label='Type of Service', dos format=mmddyy9. label='Date of Service', provider, claim format=dollar11.2 from meddb; quit; proc contents data=meddb; run; options msglevel=i; data meddb (index=(dos)); set work.meddb; run; proc contents data=meddb centiles; run; data subdos; set meddb; where dos='19JAN2000'd; run; data subdos2; set meddb; where dos > '19JAN2000'd; run; options msglevel=i; data meddb (index=(tos)); set work.meddb; run; proc contents data=meddb centiles; run; /* PROC MEANS stuff */ /* see pages 883-902 */ DATA baseball24; length Experience $7.; SET sashelp.baseball; IF position in ('1B','2B','3B','C','LF','CF','RF','DH','SS','OF','UT'); BatAvg=nhits/natbat; IF YrMajor LE 5 then Experience='Young'; ELSE Experience='Veteran'; run; /* Mean BatAvg for each Position */ * PROC MEANS; PROC MEANS DATA=baseball24 (KEEP= Position BatAvg) mean maxdec=3; CLASS Position; VAR BatAvg; run; *PROC REPORT; PROC REPORT DATA=baseball24 (KEEP= Position BatAvg); COLUMN Position BatAvg; DEFINE Position / group width=10 'Position'; DEFINE BatAvg / mean format=4.3 'Mean Batting Avg' width=8; run; *PROC SQL; PROC SQL; SELECT position, avg(BatAvg) label='Mean Batting Avg' format=4.3 FROM baseball24 GROUP BY position; quit; *PROC TABULATE; PROC TABULATE DATA=baseball24 (KEEP= Position BatAvg) format=4.3; CLASS position; VAR BatAvg; TABLE position, BatAvg*mean; run; /* Summary Statistics for Combinations of Class variables */ * all possible combinations; PROC MEANS data=baseball24 maxdec=3; CLASS Position League Experience; /* Could rearrange order of CLASS variables for different looking output */ VAR BatAvg; OUTPUT OUT=summary1 mean=average; run; proc print data=summary1; run; * specific combinations; PROC MEANS data=baseball24 maxdec=3; CLASS Position League Experience; VAR BatAvg; TYPES Position*League Position*Experience; OUTPUT OUT=summary2 mean=average; run; * equivalent to above; PROC MEANS data=baseball24 maxdec=3; CLASS Position League Experience; VAR BatAvg; TYPES Position*(League Experience); OUTPUT OUT=summary2 mean=average; run; * specific combinations using NWAY option and multiple PROC steps; PROC MEANS data=baseball24 NWAY maxdec=3; CLASS Position League; VAR BatAvg; OUTPUT OUT=summary3a mean=average; run; PROC MEANS data=baseball24 NWAY maxdec=3; CLASS Position Experience; VAR BatAvg; OUTPUT OUT=summary3b mean=average; run; * specific combinations using WHERE= option; PROC MEANS data=baseball24 maxdec=3; CLASS Position League Experience; VAR BatAvg; OUTPUT OUT=summary4 (where=(_type_ in (5,6) )) n=players mean=average; run; proc print data=summary4; run; /* Explanation of the _TYPE_ variable: Suppose a PROC MEANS step has a CLASS statement with three class variables (categorical variables) A, B, and C: PROC MEANS data=mydata mea; CLASS A B C; VAR salary; OUTPUT out=summary1 mean=average; run; The SAS creates a _TYPE_ variable with (in this case) values 1,2,3,4,5,6,7 for the seven possible combinations of the three variables: _TYPE_=1 --> rightmost variable only --> C --> 1-way _TYPE_=2 --> middle variable only --> B --> 1-way _TYPE_=3 --> rightmost variable and middle variable --> B*C --> 2-way _TYPE_=4 --> leftmost variable only --> A --> 1-way _TYPE_=5 --> leftmost variable and rightmost variable --> A*C --> 2-way _TYPE_=6 --> leftmost variable and middle variable --> A*B --> 2-way _TYPE_=7 --> rightmost variable and and middle variable and leftmost variable --> A*B*C --> 3-way If the CLASS statement has more than 3 variables, then the _TYPE_ variable will have more values. */ * All n-way combinations using the WAYS statement; PROC MEANS data=baseball24 maxdec=3; CLASS Position League Experience; VAR BatAvg; WAYS 1 2; OUTPUT OUT=summary5 mean=average; run;