|
|
Listing of SAS Program for Detecting Duplicate Addresses
Employee/Vendor Address Match
SAS Program
Revise JCL according to your organization's standards.
Revise data element formats according to your vendor file and employee file layouts
000001 //(Job Card)
000002 /*JOBPARM U=NNNN
000003 /*ROUTE PRINT FETCH
000004 //STEP1 EXEC SAS,SORT=15
000005 //SAS.SASLIST DD SYSOUT=(J,,1601)
000006 //INPUTFL1 DD DSN=(Employee Data File Here),DISP=SHR
000007 //INPUTFL2 DD DSN=(Vendor File Here),DISP=SHR
000008 //SYSIN DD *
000009 DATA EMPLADDR(DROP=ZIP AA B C D E F G H I J K CHEK Z); *Temp variables;
000010 INFILE INPUTFL1; *Employee data file;
000011 INPUT @20 NAME $28. @50 Z $1. @50 ADDRESS $28.
000012 @80 CITY $18. @98 ST $2. @100 ZIPCODE $5.;
000013 TYP='EMPLOYEE'; *Type of record;
000014 IF ZIPCODE GE 0 THEN DO; *If zipcode exists;
000015 AA=COMPRESS(ADDRESS,',.''%'); *Characters to be removed from address;
000016 B=INDEX(AA,'-'); *Returns position of any dashes in address;
000017 IF B GT 0 THEN DO;
000018 SUBSTR(AA,B,1)=' '; *Replace dash with space in address;
000019 END;
000020 J=COMPRESS(AA,' '); *Remove spaces from address;
000021 C=INDEX(J,'BOX'); *Returns position of 'BOX' in address (if any);
000022 D=INDEX(J,'DRAWER'); *Returns position of 'DRAWER' in address (if any);
000023 IF C=1 THEN DO; *If position of 'BOX' is 1;
000024 A=SUBSTR(J,C); *Returns address starting at 'BOX';
000025 RETURN; *Input next record;
000026 END;
000027 IF C GT 1 THEN DO; *If position of 'BOX' is > 1;
000028 A=SUBSTR(J,(C-1)); *Returns address starting 1 character before 'BOX';
000029 RETURN; *Input next record;
000030 END;
000031 IF D=1 THEN DO; *If position of 'DRAWER' is 1;
000032 A=SUBSTR(J,D); *Returns address starting at 'DRAWER';
000033 RETURN; *Input next record;
000034 END;
000035 IF D GT 1 THEN DO; *If position of 'DRAWER' is > 1;
000036 A=SUBSTR(J,(D-1)); *Returns address starting 1 character before 'DRAWER';
000037 RETURN; *Input next record;
000038 END;
000039 CHEK='0123456789'; *Numeric check string;
000040 E=VERIFY(Z,CHEK); *Check to see if 1st character of address is number;
000041 IF E NE 0 THEN DO; *If not numeric;
000042 F=SCAN(AA,1); *Returns 1st 'word' string in address (before 1st delimiter, if any);
000043 G=SCAN(AA,2); *Returns 2nd 'word' string in address (if any);
000044 H=LENGTH(F); *Returns length of 1st 'word' in address;
000045 I=LENGTH(G); *Returns length of 2nd 'word' in address;
000046 K=COMPRESS(AA,' ');*Remove spaces from address;
000047 A=SUBSTR(K,H,(I+1)); *Returns string starting with last character in 1st word for length
of 2nd word in address + 1;
000048 RETURN; *Input next record;
000049 END;
000050 ELSE DO; *If numeric;
000051 F=SCAN(AA,1); *Returns 1st 'word' string in address (before 1st delimiter, if any);
000052 G=LENGTH(F); *Returns length of 1st word in address;
000053 K=COMPRESS(AA,' '); *Remove spaces from address;
000054 A=SUBSTR(K,1,(G+1)); *Returns string starting at 1 for length of 1st word in address + 1;
000055 RETURN; *Input next record;
000056 END;
000057 END;
000058 DELETE; *Delete if zipcode = 0, then input next record;
000059 DATA VENDADDR(DROP= AA B C D E F G H I J K CHEK Z ZIP ZIP1);*Temp variables;
000060 FORMAT ZIP1 $5.; *Format zipcode character field, 5 characters;
000061 INFILE INPUTFL2; *Input vendor data file;
000062 INPUT @1 VENDNO 6. @7 NAME $28. @45 ADDRESS $28. @45 Z $1.
000063 @101 CITY $19. @120 ST $2. @122 ZIP PD5.;TYP='VENDOR';
000064 ZIP1=ZIP;
000065 ZIPCODE=SUBSTR(ZIP1,1,5); *Returns first five characters of zipcode;
000066 IF ZIPCODE GE 0 THEN DO; *If zipcode exists;
000067 AA=COMPRESS(ADDRESS,',.''%'); *Characters to be removed from address;
000068 B=INDEX(AA,'-'); *Returns position of any dashes in address;
000069 IF B GT 0 THEN DO;
000070 SUBSTR(AA,B,1)=' '; *Replace dash with space in address;
000071 END;
000072 J=COMPRESS(AA,' '); *Remove spaces from address;
000073 C=INDEX(J,'BOX'); *Returns position of 'BOX' in address (if any);
000074 D=INDEX(J,'DRAWER'); *Returns position of 'DRAWER' in address (if any);
000075 IF C=1 THEN DO; *If position of 'BOX' is 1;
000076 A=SUBSTR(J,C); *Returns address starting at 'BOX';
000077 RETURN; *Input next record;
000078 END;
000079 IF C GT 1 THEN DO; *If position of 'BOX' is > 1;
000080 A=SUBSTR(J,(C-1)); *Returns address starting 1 character before 'BOX';
000081 RETURN; *Input next record;
000082 END;
000083 IF D=1 THEN DO; *If position of 'DRAWER' is 1;
000084 A=SUBSTR(J,D); *Returns address starting at 'DRAWER';
000085 RETURN; *Input next record;
000086 END;
000087 IF D GT 1 THEN DO; *If position of 'DRAWER' is > 1;
000088 A=SUBSTR(J,(D-1)); *Returns address starting 1 character before 'DRAWER';
000089 RETURN; *Input next record;
000090 END;
000091 CHEK='0123456789'; *Numeric check string;
000092 E=VERIFY(Z,CHEK); *Check to see if 1st character of address is number;
000093 IF E NE 0 THEN DO; *If not numeric;
000094 F=SCAN(AA,1); *Returns 1st 'word' string in address (before 1st delimiter, if any);
000095 G=SCAN(AA,2); *Returns 2nd 'word' string in address (if any);
000096 H=LENGTH(F); *Returns length of 1st 'word' in address;
000097 I=LENGTH(G); *Returns length of 2nd 'word' in address;
000098 K=COMPRESS(AA,' '); *Remove spaces from address;
000099 A=SUBSTR(K,H,(I+1)); *Returns string starting with last character in 1st word for length
of 2nd word in address + 1;
000100 RETURN; *Input next record;
000101 END;
000102 ELSE DO; *If numeric;
000103 F=SCAN(AA,1); *Returns 1st 'word' string in address (before 1st delimiter, if any);
000104 G=LENGTH(F); *Returns length of 1st word in address;
000105 K=COMPRESS(AA,' '); *Remove spaces from address;
000106 A=SUBSTR(K,1,(G+1)); *Returns string starting at 1 for length of 1st word in address + 1;
000107 RETURN; *Input next record;
000108 END;
000109 END;
000110 DELETE; *Delete if zipcode = 0, then input next record;
000111 DATA JOIN; *Merge two data sets created above from employee & vendor data files;
000112 SET EMPLADDR VENDADDR;
000113 RETURN; *Input next record;
000114 PROC SORT; *Sort by zipcode, address string, record type;
000115 BY ZIPCODE A TYP;
000116 DATA MATCH1(DROP=ACT FLAG); *First match iteration;
000117 SET JOIN; *Input dataset with sorted, merged records;
000118 BY ZIPCODE A TYP; *Input records as sorted by zipcode, address string, record type;
000119 RETAIN FLAG; *Retain value of flag variable from one iteration of data step to the next (from one input record to the next);
000120 IF FIRST.A THEN DO; *If 1st occurrence of matching address strings (may be only 1);
000121 ACT=0; *Initialize activity variable to 0;
000122 FLAG='E'; *Set flag to E (employee record);
000123 END;
000124 ACT+1; *Increment activity variable by 1 for each matching string;
000125 IF LAST.A THEN DO; *If last occurrence of matching address strings (may be only 1);
000126 IF ACT=1 THEN DO; *If only 1 occurrence;
000127 DELETE; *Delete record since no match, then input next record;
000128 END;
000129 END;
000130 IF FIRST.A THEN DO; *If 1st occurrence of matching address strings (may be only 1);
000131 IF TYP='VENDOR' THEN DO; *If record type = vendor;
000132 FLAG='V'; *Set flag to V (vendor record);
000133 DELETE; *Delete record since no matching employee record, then input next record;
000134 END;
000135 END;
000136 IF FLAG='V' THEN DELETE; *Delete duplicate vendor record (if any), then input next record;
000137 RETURN; *Input next record;
000138 PROC SORT; *Sort by zipcode, address string, type record (vendor first);
000139 BY ZIPCODE A DESCENDING TYP;
000140 DATA MATCH2(DROP=FLAG A); *2nd match iteration;
000141 SET MATCH1; *Input matched records;
000142 BY ZIPCODE A DESCENDING TYP; *Input in sorted order;
000143 RETAIN FLAG; *Retain value of flag variable from one iteration of data step to the next (from one input record to the next);
000144 IF FIRST.A THEN DO; *If 1st occurrence of matching address strings (may be only 1);
000145 FLAG='V'; *Set flag to V for vendor record;
000146 IF TYP='EMPLOYEE' THEN DO; *If employee record type;
000147 FLAG='E'; *Set flag to E for employee record;
000148 DELETE; *Delete record since no matching vendor record, then input next record;
000149 END;
000150 END;
000151 IF FLAG='E' THEN DELETE; *Delete duplicate employee record, then input next record;
000152 RETURN; *Input next record;
000153 PROC PRINT; *Print final matched records (match2);
000154 ID VENDNO TYP; *List vendor number (for both employee and vendor) and record type 1st;
000155 OPTIONS MISSING=' '; *Output a space for any missing variables (instead of SAS default);
000156 FORMAT VENDNO Z6.; *Format vendor number to show leading zeroes;
000157 FORMAT ZIPCODE $5.; *Format zipcode character field 5 characters;
000158 TITLE1 'XXX CORPORATION';
000159 TITLE3 'ACCOUNTS PAYABLE DATA COLLECTION';
000160 TITLE5 'SIMILAR EMPLOYEE AND VENDOR ADDRESSES';
000161 TITLE8;
|