/* Read the tab-delimited Fall 2008 file into WORK.FALL2008.
   Variable names come from row 1; data start on row 2. */
PROC IMPORT OUT=WORK.FALL2008
    DATAFILE="Z:\stat 704\Fall 2008.txt"
    DBMS=TAB REPLACE;
    GETNAMES=YES;
    DATAROW=2;
RUN;

/* Identify categorical variables. */
proc contents data=fall2008;
run;

/* Pick out categorical variables without too many levels as recoding
   candidates.  (Comments use the block form because a star-style comment
   must not contain an unmatched quotation mark or apostrophe.) */
proc freq data=fall2008;
    table class degree enroll gender housing major race regstat school;
run;

/* Degree, Major and School have too many levels, so only the other
   categorical variables are recoded, as 0/1 indicator variables in the
   DATA step below.  The step overwrites WORK.FALL2008 in place. */
data fall2008;
    set fall2008;

    label cltotgpa='GPA'
          satv='Verbal SAT'
          satm='Math SAT'
          regstat='Registration Status'
          enroll='Enrollment Status';

    /* Three indicators for class with Freshman as the baseline.
       A comparison evaluates to 1 when true and 0 when false, so any value
       other than Sophomore/Junior/Senior (including blank) gets all zeros
       and is therefore treated like the baseline. */
    class2 = (class = 'Sophomore');
    class3 = (class = 'Junior');
    class4 = (class = 'Senior');

    /* Binary variables are easier to recode; housing was already numeric. */
    enrollft = (enroll = 'Full-time');
    genderF  = (gender = 'F');

    /* Four categories for race (White, African-American, Other, Unknown)
       with White as the baseline.  Every value that is not one of the three
       named levels (including blank) falls into Other. */
    raceaa = (race = 'African American');
    raceu  = (race = 'Unknown');
    raceo  = (race ne 'African American' and race ne 'White' and race ne 'Unknown');

    /* Three categories for registration status (Continuing, New, Other)
       with Continuing as the baseline.  New Freshman and New Transfer are
       pooled into New; everything else (including blank) is Other. */
    regn = (regstat = 'New Freshman' or regstat = 'New Transfer');
    rego = (regstat ne 'Continuing' and regstat ne 'New Freshman' and regstat ne 'New Transfer');
run;

/* Best-subsets regression for GPA ranked by Mallows Cp, keeping the 10 best
   models. */
proc reg data=fall2008;
    /* Grouping information is ignored for best subsets regression, so the
       grouped form of the model is kept here only for reference:
       model cltotgpa=satv satm {class2 class3 class4} housing
             {raceaa raceo raceu} genderf enrollft {regn rego}
             /selection=cp best=10
    */
    model cltotgpa=satv satm class2 class3 class4 housing raceaa raceo raceu
                   genderf enrollft regn rego
          /selection=cp best=10;
run;
quit;  /* PROC REG is interactive; end it explicitly. */
/* Stepwise selection on significance levels for GPA, with the categorical
   predictors declared in CLASS and SAT-by-factor interactions as candidates.
   SELECT=, SLE= and SLS= are sub-options of SELECTION=method(...), so they
   are placed inside SELECTION=STEPWISE(...): effects enter at the 0.10 level
   and stay at the 0.15 level.  HIER=SINGLE applies the model hierarchy
   requirement while adding or removing one effect at a time. */
proc glmselect data=fall2008;
    class class race regstat gender enroll housing;
    model cltotgpa = satv satm class housing race gender enroll regstat
                     satv*race satv*gender satv*enroll
                     satm*race satm*gender satm*enroll
          / selection=stepwise(select=sl sle=0.1 sls=0.15) hier=single;
run;

/* A little follow-up: box plots of GPA by each categorical predictor. */
proc sgplot data=fall2008;
    vbox cltotgpa / category=gender;
run;

proc sgplot data=fall2008;
    vbox cltotgpa / category=class;
run;

proc sgplot data=fall2008;
    vbox cltotgpa / category=housing;
run;

proc sgplot data=fall2008;
    vbox cltotgpa / category=race;
run;

proc sgplot data=fall2008;
    vbox cltotgpa / category=regstat;
run;

/* We detect a serious data quality problem here (GPA by enrollment status). */
proc sgplot data=fall2008;
    vbox cltotgpa / category=enroll;
run;

/* GPA against Math SAT with a separate fitted regression line per race
   group; NOMARKERS keeps the REG statement from redrawing the points that
   the SCATTER statement already plots. */
proc sgplot data=fall2008;
    scatter x=satm y=cltotgpa / group=race;
    reg x=satm y=cltotgpa / group=race nomarkers;
run;

/* Same display, grouped by gender. */
proc sgplot data=fall2008;
    scatter x=satm y=cltotgpa / group=gender;
    reg x=satm y=cltotgpa / group=gender nomarkers;
run;