*CASE supplement for Chapter 4 and Chapter 5; *Import USC non-citizens enrolled in Spring 2014; data epi2012t13; input first_name $18. @20 last_name $11. @32 Sponsor $13.; datalines; Basen Malik AAL-SAID NSCOR Abdulbakr Saud A ALATWA Private Meghan OTOOLE Fulbright John M GREGO Private Akeem ALSHARIF Fulbright Mary Chapin NICHOLSON Private Nizar ALTUUBI Saudi Mission ; run; data usc2014; input last_name $15. @17 first_name $17. @35 College $; datalines; Al Atwa Abdulbakr Saud A. ENGR Aal-Said Basen ENGR O'Toole Meghan CAS Nicholson Mary C. CAS Grego John BUSI Al Tuubi Nizar J. MED Al-Sharif Ali ENGR ; run; proc sql; select * from epi2012t13; select * from USC2014; quit; *Demonstrate a couple approaches to match merging by name; *EPI concatenates last names (with one exception--AAL-SAID), while USC does not; *EPI omits periods for initials, while USC does not; *Use upper case conversion. *Concatenate dashes and remove periods; * Note: compress(last_name,"-.'") removes the dash, period, and apostrophe from the values in last_name; * NOTE: scan(VARNAME, 1) returns the first word in VARNAME; * NOTE: scan(VARNAME, 2) returns the second word in VARNAME; *This code removes -,., and ' from first names and last names, and saves a couple; *different versions of first names for possible matching options; proc sql; create table epimerge as select upcase(compress(first_name,"-.'")) as first, upcase(compress(last_name,"-.'")) as last, scan(upcase(compress(first_name,"-.'")),1) as first1, sponsor from epi2012t13; select * from epimerge; quit; *Similar to above code, but we compress the prefix 'Al' in last names; *and explore other options for handling last names; *There may be other prefixes we would need to compress; proc sql; create table uscmerge as select upcase(compress(first_name,"-.'")) as first, upcase(compress(last_name,"-. '")) as last, scan(upcase(compress(first_name,"-.'")),1) as first1, scan(upcase(compress(last_name,'-.')),1) as last1, upcase(compress(last_name,"-.'")) as last1wspace, scan(upcase(compress(last_name,"-.'")),2) as last2, college from usc2014; select * from uscmerge; quit; *The output for uscmerge actually creates any version of last I might need; *I use a CASE statement below that compresses spaces from last names only when ; *the last name has a prefix--it's more selective than the previous approach; *though ultimately unnecessary; proc sql; create table uscmerge as select upcase(compress(first_name,"-.'")) as first, case when scan(upcase(compress(last_name,'-.')),1)='AL' then upcase(compress(last_name,"-. '")) else upcase(compress(last_name,"-.'")) end as last, scan(upcase(compress(first_name,"-.'")),1) as first1, scan(upcase(compress(last_name,"-.'")),2) as last2, college from usc2014; select * from uscmerge; quit; *The final match is a left join to track EPI students; *We match on the first name and the compressed last name while also checking; *whether we might have missed a prefix. Note that a one-to-many match is possible though; *that does not occur here. Only one EPI record is not found in the USC database; proc sql; select e.first, e.last, e.sponsor, u.college from epimerge as e left join uscmerge as u on (e.last=u.last or e.last contains u.last2) and e.first1=u.first1; quit;