%put Compiling SasPorter1Stemmer macro ... ; /* Porter1 stemmer written in base SAS*/ /* based on algorithm in */ /* and multiple adapted versions in */ /**/ /*part 1: preliminaries*/ /*part 2: auxiliary macros*/ /*part 3: main steps macros*/ /*part 4: suffix lists (by step) */ /*part 5: MAIN MACRO */ /* first read this file, then call up MAIN MACRO with appropriate replacements*/ /* for the four parameters, i.e., : */ /* %sasporter1stemmer(indataset,outdataset,word2stem,stemmedword) ;*/ /* (part 1 of 5) PRELIMINARIES */ %let stemmermaxlength = 100 ; /* (part 2 of 5) AUXILIARY MACROS */ * splits word into stem and sufx after character in "rposition" ; %macro stemsufx(stemwordin,stemposition) ; length stem sufx $ &stemmermaxlength ; stem = "" ; sufx = "" ; if &stemposition lt 1 then sufx = &stemwordin ; else do ; stem = compress( substr( &stemwordin,1,min( &stemposition,length(&stemwordin) ) ) ) ; if &stemposition lt length(&stemwordin) then sufx = compress( substr(&stemwordin,&stemposition+1) ) ; end ; %mend stemsufx ; * "consflag" = 1 if vowel, 0 if not for letter in position "pos" in word "conswordin" ; %macro cons(conswordin,pos,consflag) ; &consflag = 1 ; if indexc(substr(&conswordin,&pos,1),'aeiouy') then &consflag = 0 ; if &pos eq 1 and substr(&conswordin,&pos,1) eq "y" then &consflag = 1 ; else if &pos gt 1 and substr(&conswordin,&pos,1) eq "y" then do ; if indexc(substr(&conswordin,&pos-1,1), 'aeiou') then &consflag = 1 ; end ; %mend cons ; * returns "longwordout" where vowels are v and consonants are c ; %macro cvlong(longwordin,longwordout) ; length &longwordout $ &stemmermaxlength ; &longwordout = "" ; do h = 1 to length(&longwordin) ; %cons(&longwordin,h,wcv) ; if wcv then &longwordout = trim(&longwordout)||"C" ; else &longwordout = trim(&longwordout)||"V" ; end ; &longwordout = upcase(&longwordout) ; drop h wcv ; %mend cvlong ; * returns "shortwordout" where vowel(or groups) are v (similarly for consonants and c) ; %macro cvshort(shortwordin,shortwordout) ; length &shortwordout $ &stemmermaxlength ; &shortwordout = "V" ; do h = 1 to length(&shortwordin) ; %cons(&shortwordin,h,xcv) ; if h eq 1 and xcv then &shortwordout = "C" ; if h gt 1 then do ; cur = substr(&shortwordout,length(&shortwordout),1) ; if cur = "V" and xcv then &shortwordout = trim(&shortwordout)||"C" ; else if cur = "C" and not xcv then &shortwordout = trim(&shortwordout)||"V" ; end ; end ; &shortwordout = upcase(&shortwordout) ; drop h cur xcv ; %mend cvshort ; * returns number of "VC" (or "measure" of word) as "mseq" ; %macro mseq(mseqwordin) ; %cvshort(&mseqwordin,cvwordshort) ; x = length(cvwordshort) ; if substr(cvwordshort,1,1) eq "C" then x = x-1 ; if substr(cvwordshort,length(cvwordshort),1) eq "V" then x = x-1 ; mseq = max(0,x/2) ; drop x cvwordshort ; %mend mseq ; * vowelflag = 1 if at least one vowel in "vowelwordin" ; %macro vowelinstem(vowelwordin,vowelflag) ; &vowelflag = 0 ; %cvlong(&vowelwordin,xout) ; if index(xout,"V") gt 0 then &vowelflag = 1 ; drop xout ; %mend vowelinstem ; * doublec = 1 if last two letters of "dwordin" are consonants and the same ; %macro doublec(dwordin,dpos) ; * see -- dpos >= 2 ; doublec = 0 ; %cons(&dwordin,&dpos,dcv) ; if &dpos ge 2 and dcv and substr(&dwordin,&dpos,1) eq substr(&dwordin,(&dpos-1),1) then doublec = 1 ; drop dcv ; %mend doublec ; /* looks for c-v-c pattern at end of word (and where second c is not w,x or y) */ %macro cvcminuswxy(cvcmwordin,cvcmpos,cvcmflag); &cvcmflag = 0 ; if &cvcmpos ge 3 then do ; xword = compress(substr(&cvcmwordin,(&cvcmpos-2),3)) ; %cvlong(xword,xcvc) ; if compress(xcvc) = "CVC" and indexc(substr(xword,3,1),'wxy') eq 0 then &cvcmflag = 1 ; end; drop xword xcvc ; %mend cvcminuswxy ; /* looks at end of word and tests for match with "estring" */ %macro ends(endswordin,estring,endsflag) ; &endsflag = 0 ; if length(&endswordin) ge length("&estring") then do ; if substr(&endswordin,max(0,length(&endswordin) - length("&estring") + 1)) eq "&estring" then &endsflag = 1 ; end ; %mend ends; * replace end of word with string "instring" ; %macro replaceafter(replacewordin,instring,inpos,replacewordout) ; &replacewordout = compress( substr(&replacewordin,1,&inpos)||"&instring" ) ; %mend replaceafter ; /* (part 3 of 5) MAIN STEPS MACROS */ * step1a macro ; %macro step1a(inword, outword, instring, outstring) ; step1adone = 0 ; %let m=0 ; %do %while(%scan(&instring,&m+1,%str( )) ne %str( )) ; %let instringx = %scan(&instring, &m+1,%str( )) ; %let outstringx = %scan(&outstring,&m+1,%str( )) ; %if &outstringx eq x %then %let outstringx = ; if not step1adone then do ; %ends(&inword,&instringx,ends1a) ; if ends1a then do ; step1adone = 1 ; %replaceafter(&inword,&outstringx,length(&inword)-length("&instringx"),&outword) ; end ; end ; %let m = %eval(&m+1) ; %end ; drop step1adone ends1a ; %mend step1a ; *; * step1b macro ; %macro step1b(inword, outword) ; step1bdone = 0 ; step1bsuccess = 0 ; if not step1bdone then do ; %stemsufx(&inword,length(&inword)-3) ; if sufx = "eed" then do ; step1bdone = 1 ; %mseq(stem) ; if mseq then &outword = compress(stem)||"ee" ; end ; end ; if not step1bdone then do ; %stemsufx(&inword,length(&inword)-3) ; if sufx = "ing" then do ; step1bdone = 1 ; %vowelinstem(stem,stemflag_ing) ; if stemflag_ing then do ; &outword = compress(stem) ; step1bsuccess = 1 ; end ; end ; end ; if not step1bdone then do ; %stemsufx(&inword,length(&inword)-2) ; if sufx = "ed" then do ; step1bdone = 1 ; %vowelinstem(stem,stemflag_ed) ; if stemflag_ed then do ; &outword = compress(stem) ; step1bsuccess = 1 ; end ; end ; end ; if step1bsuccess then do ; substep1b = 0 ; %stemsufx(&outword,length(&outword)-2) ; if sufx = "at" and not substep1b then do ; &outword = compress(stem)||"ate" ; substep1b = 1 ; end ; if sufx = "bl" and not substep1b then do ; &outword = compress(stem)||"ble" ; substep1b = 1 ; end ; if sufx = "iz" and not substep1b then do ; &outword = compress(stem)||"ize" ; substep1b = 1 ; end ; if not substep1b then do ; %doublec(&outword,length(&outword) ) ; if doublec and indexc(substr(&outword,length(&outword)),'lsz') eq 0 then do ; substep1b = 1 ; &outword = substr(&outword,1,length(&outword)-1) ; end ; end ; if not substep1b then do ; %mseq(&outword) ; %cvcminuswxy(&outword,length(&outword),stemflag_cvc); if mseq eq 1 and stemflag_cvc then &outword = compress(&outword)||"e" ; end ; end ; %mend step1b ; * macro for steps 2 to 4 ; %macro steps2to4(inword, outword, instring, outstring, m) ; stepdone = 0 ; %let n=0 ; %do %while(%scan(&instring,&n+1,%str( )) ne %str( )) ; %let instringx = %scan(&instring, &n+1,%str( )) ; %let outstringx = %scan(&outstring,&n+1,%str( )) ; %if &outstringx eq x %then %let outstringx = ; if not stepdone then do ; %stemsufx(&inword,length(&inword) - length("&instringx")) ; if sufx = "&instringx" then do ; stepdone = 1 ; %mseq(stem) ; if mseq gt &m then do ; %replaceafter(&inword,&outstringx,length(&inword)-length("&instringx"),&outword) ; end ; end ; end ; %let n = %eval(&n+1) ; %end ; %mend steps2to4 ; /* (part 4 of 5) LIST OF SUFFIXES MACRO */ %macro suffixlists4stemmer ; %global step1ainlist step1aoutlist step2inlist step2outlist step3inlist step3outlist step4inlist step4outlist ; %*; %let step1ainlist = sses ies ss s ; %let step1aoutlist = ss i ss x ; %let step2inlist = ational fulness iveness ization ousness biliti tional alism aliti ation entli iviti ousli alli anci ator enci izer logi bli eli ; %let step2outlist = ate ful ive ize ous ble tion al al ate ent ive ous al ance ate ence ize log ble e ; %let step3inlist = icate ative alize iciti ical ful ness ; %let step3outlist = ic x al ic ic x x ; %let step4inlist = ement able ance ence ible ment ant ate ent ism iti ive ize ous al er ic ou ; %let step4outlist = x x x x x x x x x x x x x x x x x x ; %mend suffixlists4stemmer ; /****************** (part 5 of 5) MAIN PROGRAM MACRO ***************************/ %macro sasporter1stemmer(indataset,outdataset,word2stem,stemmedword) ; %suffixlists4stemmer ; data &outdataset ; set &indataset ; &stemmedword = lowcase(&word2stem) ; %* ; if length(&stemmedword) gt 2 then do ; %* steps 1a, 1b ; %step1a(&stemmedword, &stemmedword, &step1ainlist, &step1aoutlist) ; %step1b(&stemmedword, &stemmedword) ; %* step 1c ; %stemsufx(&stemmedword,length(&stemmedword)-1) ; %vowelinstem(stem,stemflag_y) ; if sufx eq "y" and stemflag_y then &stemmedword = compress(stem)||"i" ; %* step 2, 3, 4 ; %steps2to4(&stemmedword, &stemmedword, &step2inlist, &step2outlist, 0) ; %steps2to4(&stemmedword, &stemmedword, &step3inlist, &step3outlist, 0) ; %steps2to4(&stemmedword, &stemmedword, &step4inlist, &step4outlist, 1) ; %* step 4: "ion" exception ; if not stepdone then do ; %stemsufx(&stemmedword,length(&stemmedword)-3) ; %mseq(stem) ; if sufx eq "ion" and mseq gt 1 and indexc(substr(stem,length(stem)),'st') gt 0 then &stemmedword = stem ; end ; %* step 5a ; %stemsufx(&stemmedword,length(&stemmedword)-1) ; %mseq(stem) ; if mseq gt 1 and sufx eq "e" then &stemmedword = stem ; if mseq eq 1 then do ; %cvcminuswxy(stem,length(stem),stemflag_5a); if sufx eq "e" and not stemflag_5a then &stemmedword = stem ; end ; %* step 5b ; %stemsufx(&stemmedword,length(&stemmedword)-1) ; %mseq(stem) ; if mseq gt 1 and sufx eq "l" and substr(stem,length(stem),1) eq "l" then &stemmedword = stem ; %*; end ; drop stemflag: mseq stem sufx step1bdone step1bsuccess substep1b doublec stepdone ; run ; %mend sasporter1stemmer ;