*CASE supplement for Chapter 4 and Chapter 5;
*Import USC non-citizens enrolled in Spring 2014;
data epi2012t13;
input first_name $18. @20 last_name $11. @32 Sponsor $13.;
datalines;
Basen Malik        AAL-SAID    NSCOR
Abdulbakr Saud A   ALATWA      Private
Meghan             OTOOLE      Fulbright
John M             GREGO       Private
Akeem              ALSHARIF    Fulbright
Mary Chapin        NICHOLSON   Private 
Nizar              ALTUUBI     Saudi Mission
;
run;

data usc2014;
input last_name $15. @17 first_name $17. @35 College $;
datalines;
Al Atwa         Abdulbakr Saud A. ENGR
Aal-Said        Basen             ENGR
O'Toole         Meghan            CAS
Nicholson       Mary C.           CAS
Grego           John              BUSI
Al Tuubi        Nizar J.          MED
Al-Sharif       Ali               ENGR
;
run; 

proc sql;
select * from epi2012t13;
select * from USC2014;
quit;

*Demonstrate a couple approaches to match merging by name;
*EPI concatenates last names (with one exception--AAL-SAID), while USC does not;
*EPI omits periods for initials, while USC does not;
*Use upper case conversion.
*Concatenate dashes and remove periods;

* Note: compress(last_name,"-.'") removes the dash, period, and apostrophe from the values in last_name;
* NOTE:  scan(VARNAME, 1) returns the first word in VARNAME;
* NOTE:  scan(VARNAME, 2) returns the second word in VARNAME;

*This code removes -,., and ' from first names and last names, and saves a couple;
*different versions of first names for possible matching options;
proc sql;
create table epimerge as select 
       upcase(compress(first_name,"-.'")) as first,
	   upcase(compress(last_name,"-.'")) as last,
       scan(upcase(compress(first_name,"-.'")),1) as first1,      
	   sponsor
       from epi2012t13;

select * from epimerge;
quit;

 
*Similar to above code, but we compress the prefix 'Al' in last names;
*and explore other options for handling last names;
*There may be other prefixes we would need to compress;



proc sql; 
create table uscmerge as select
        upcase(compress(first_name,"-.'")) as first,
		upcase(compress(last_name,"-. '")) as last,
        scan(upcase(compress(first_name,"-.'")),1) as first1,
        scan(upcase(compress(last_name,'-.')),1) as last1,       
        upcase(compress(last_name,"-.'")) as last1wspace,
		scan(upcase(compress(last_name,"-.'")),2) as last2,
        college 
        from usc2014;

select * from uscmerge;
quit;

*The output for uscmerge actually creates any version of last I might need;
*I use a CASE statement below that compresses spaces from last names only when ;
*the last name has a prefix--it's more selective than the previous approach;
*though ultimately unnecessary;

proc sql; 
create table uscmerge as  select upcase(compress(first_name,"-.'")) as first,                
                case
                when scan(upcase(compress(last_name,'-.')),1)='AL' then upcase(compress(last_name,"-. '"))
                else upcase(compress(last_name,"-.'"))
                end as last,
				scan(upcase(compress(first_name,"-.'")),1) as first1,
				scan(upcase(compress(last_name,"-.'")),2) as last2,
				college
      from usc2014;

select * from uscmerge; quit;

*The final match is a left join to track EPI students;
*We match on the first name and the compressed last name while also checking;
*whether we might have missed a prefix.  Note that a one-to-many match is possible though;
*that does not occur here.  Only one EPI record is not found in the USC database;

proc sql;
select e.first, e.last, e.sponsor, u.college from epimerge as e 
left join uscmerge as u
on (e.last=u.last or e.last contains u.last2) and e.first1=u.first1;
quit;