/*---------------------------------------------------------------------
  Perl-regular-expression (PRX) examples on a small phone-number table.

  Steps, in order:
    1. nanumbr    - read name / phone number / country from in-stream,
                    comma-delimited records.
    2. matchphn   - flag numbers written as ddd-ddd-dddd (two variants).
    3. one        - locate street-type words in addresses.
    4. parsephone - locate numbers written as (ddd) ddd-dddd.
    5. redact     - replace ddd-ddd-dddd numbers with a fixed text.
    6. consist    - build a digits-only version of each number.

  Comments that need punctuation such as apostrophes are written in
  block-comment form, because statement-style comments must not contain
  unmatched quotation marks or internal semicolons.
---------------------------------------------------------------------*/

/* The portion of the nanumbr data set given in the textbook. */
data nanumbr;
    infile cards dlm=',';
    /* The colon modifier reads up to the next comma, capped at the informat width. */
    input name :$27. PhoneNumber :$15. country :$14.;
    cards;
Alexander Mcknight,(738) 766-2114, Canada
Alison Campbell, 943.519.8369, United States
Amador Alvaro Luna, 3581599311, Mexico
Amanda Johnson, 362-686-6286, Canada
Amy Williams, 953-246-7733, United States
Ann Keith, (375) 862-7384, Canada
Anne Weaver, 793-199-3925, United States
Arturo Longoria, 203-752-8263, Mexico
Brandon Kerr, 555-677-4102, United States
Camilo Indira Mojica Romero, 718.690.4147, Mexico
;
run;

/* Checking for valid phone number formats.
   loc is the position of the first ddd-ddd-dddd match, or 0 when there is none. */
data matchphn;
    set nanumbr;
    loc = prxmatch('/([2-9]\d\d)-([2-9]\d\d)-(\d{4})/', PhoneNumber);
run;

proc print data=matchphn;
    where loc > 0;
run;

/* Does the same thing, but saves the Perl expression as a SAS variable. */
data matchphn (drop=Exp);
    set nanumbr;
    /* The trailing o option makes SAS compile the expression only once. */
    Exp = '/([2-9]\d\d)-([2-9]\d\d)-(\d{4})/o';
    loc = prxmatch(Exp, PhoneNumber);
run;

proc print data=matchphn;
    where loc > 0;
run;

/* Example about finding specific patterns in addresses.
   Example is from SAS help documentation. */
data one;
    if _N_ = 1 then do;
        retain patternID;
        /* The i option specifies a case insensitive search. */
        pattern = "/ave|avenue|dr|drive|rd|road/i";
        patternID = prxparse(pattern);
    end;

    input street $80.;
    call prxsubstr(patternID, street, position, length);
    if position ^= 0 then do;
        match = substr(street, position, length);
        /* Write the matched text and the full street to the log, both quoted. */
        put match:$QUOTE. "found in " street:$QUOTE.;
    end;
    datalines;
153 First Street
6789 64th Ave
4 Moritz Road
7493 Wilkes Place
;

proc print data=one;
run;

/* Getting phone numbers whose area codes are formatted with parentheses.
   The pattern is compiled once on the first observation and the handle is
   retained, instead of recompiling the variable pattern for every row.
   patternID therefore holds the same handle on every output row. */
data parsephone (drop=Exp1);
    set nanumbr;
    retain patternID;
    if _N_ = 1 then do;
        Exp1 = '/([(][2-9]\d\d[)])\s([2-9]\d\d)-(\d{4})/';
        patternID = prxparse(Exp1);
    end;
    /* position is 0 when the number is not in (ddd) ddd-dddd form. */
    call prxsubstr(patternID, PhoneNumber, position, length);
run;

proc print data=parsephone;
    where position ^= 0;
run;

/* Substituting text for a specific expression. */
data redact;
    /* Declared before SET so the variable is wide enough for the replacement text. */
    length PhoneNumber $24;
    set nanumbr;
    /* -1 replaces every match found in the value. */
    PhoneNumber = prxchange('s/([2-9]\d\d)-([2-9]\d\d)-(\d{4})/*NUMBER REDACTED*/', -1, PhoneNumber);
run;

proc print data=redact;
run;

/* Making the format consistent.
   plain receives the ten digits with the separators removed. Numbers that match
   none of the patterns are copied unchanged. */
data consist;
    set nanumbr;
    PhoneNumber = strip(PhoneNumber);

    Exp  = '/([2-9]\d\d)-([2-9]\d\d)-(\d{4})/o';
    /* Exp0 also accepts numbers that Exp rejects, such as the one recorded
       for Anne Weaver (its second group starts with 1). */
    Exp0 = '/(\d{3})-(\d{3})-(\d{4})/o';
    Exp1 = '/([(][2-9]\d\d[)])\s([2-9]\d\d)-(\d{4})/o';
    /* The dot must be placed in square brackets, otherwise a dot stands
       for "any character". */
    Exp2 = '/([2-9]\d\d)[.]([2-9]\d\d)[.](\d{4})/o';

    /* ddd-ddd-dddd with restricted leading digits. */
    loc = prxmatch(Exp, PhoneNumber);
    if loc > 0 then do;
        patternID = prxparse(Exp);
        call prxsubstr(patternID, PhoneNumber, position, length);
        /* Offsets 0, 4 and 8 skip the two dashes. */
        plain = cat(substr(PhoneNumber, position, 3),
                    substr(PhoneNumber, position+4, 3),
                    substr(PhoneNumber, position+8, 4));
    end;

    /* ddd-ddd-dddd with any digits. */
    loc0 = prxmatch(Exp0, PhoneNumber);
    if loc0 > 0 then do;
        patternID0 = prxparse(Exp0);
        call prxsubstr(patternID0, PhoneNumber, position, length);
        plain = cat(substr(PhoneNumber, position, 3),
                    substr(PhoneNumber, position+4, 3),
                    substr(PhoneNumber, position+8, 4));
    end;

    /* (ddd) ddd-dddd */
    loc1 = prxmatch(Exp1, PhoneNumber);
    if loc1 > 0 then do;
        patternID1 = prxparse(Exp1);
        call prxsubstr(patternID1, PhoneNumber, position, length);
        /* Offsets 1, 6 and 10 skip the parentheses, the blank and the dash. */
        plain = cat(substr(PhoneNumber, position+1, 3),
                    substr(PhoneNumber, position+6, 3),
                    substr(PhoneNumber, position+10, 4));
    end;

    /* ddd.ddd.dddd */
    loc2 = prxmatch(Exp2, PhoneNumber);
    if loc2 > 0 then do;
        patternID2 = prxparse(Exp2);
        call prxsubstr(patternID2, PhoneNumber, position, length);
        plain = cat(substr(PhoneNumber, position, 3),
                    substr(PhoneNumber, position+4, 3),
                    substr(PhoneNumber, position+8, 4));
    end;

    /* No pattern matched, so keep the number as it was entered. */
    if max(loc, loc0, loc1, loc2) = 0 then plain = PhoneNumber;
run;

proc print data=consist;
    var Name PhoneNumber Country plain loc loc0 loc1 loc2;
run;