* !stem: stemming algorithm by M.F. Porter as a PSPP macro.
* See https://tartarus.org/martin/PorterStemmer/
* Author: Frans Houweling (fhouweling@email.it)
* Closely follows the Tcl implementation by Aris Theodorakos.
* Arguments:
*   1. input variable (english words)
*   2. name for output variable (stems).
* Notes:
*   - Suffixes are compared with lowercase literals, so the input is
*     expected to be lowercase.
*   - The output variable is created by the macro as a string of width 28.
*   - Words shorter than three characters are copied to the output unchanged.
*   - All intermediate values live in scratch variables, so no helper
*     variables are added to the active dataset.
*   - NOTE(review): the STRING declarations presumably fail when the macro is
*     expanded twice before a procedure runs - confirm before chaining calls.

DEFINE !stem (!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
COMPUTE #len = LENGTH(RTRIM(!1)).
STRING !2 (A28).
STRING #stem__ #word__ (A28).
COMPUTE #word__ = !1.
DO IF #len GE 3.

  * Step 1a: plural endings.
  DO IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'sses'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ss').
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ies'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'i').
  ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) NE 'ss'
      AND SUBSTR(#word__, #len, 1) EQ 's'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 1).
    COMPUTE #word__ = #stem__.
  END IF.

  * Step 1b: eed, ed and ing endings.
  * The flag is a scratch variable so that it does not end up in the dataset.
  COMPUTE #goto1b__ = 0.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'eed'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ee').
    END IF.
  ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'ed'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
    !vowelin #stem__ #bool__.
    DO IF #bool__.
      COMPUTE #goto1b__ = 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ing'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !vowelin #stem__ #bool__.
    DO IF #bool__.
      COMPUTE #goto1b__ = 1.
      COMPUTE #word__ = #stem__.
    END IF.
  END IF.

  * Step 1b clean-up, only after an ed or ing ending was removed.
  DO IF #goto1b__.
    COMPUTE #len = LENGTH(RTRIM(#word__)).
    DO IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'at'.
      COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ate').
    ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'bl'.
      COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ble').
    ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'iz'.
      COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ize').
    ELSE.
      !doublec #word__ #bool__.
      DO IF #bool__.
        * Undouble the final consonant unless it is l, s or z.
        DO IF NOT ANY(SUBSTR(#word__, #len, 1), 'l', 's', 'z').
          COMPUTE #word__ = SUBSTR(#word__, 1, #len - 1).
        END IF.
      ELSE.
        !m #word__ #measure__.
        !cvc #word__ #bool__.
        DO IF #measure__ EQ 1 AND #bool__.
          COMPUTE #word__ = CONCAT(RTRIM(#word__), 'e').
        END IF.
      END IF.
    END IF.
  END IF /* 1b */.

  * Step 1c.
  * The length is refreshed here because step 1b may have changed the word.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 2 AND SUBSTR(#word__, #len, 1) EQ 'y'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 1).
    !vowelin #stem__ #bool__.
    DO IF #bool__.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'i').
    END IF.
  END IF.

  * Step 2.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'logi'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'log').
    END IF.
  ELSE IF #len GE 8 AND SUBSTR(#word__, #len - 6, 7) EQ 'ational'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 7).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ate').
    END IF.
  ELSE IF #len GE 7 AND SUBSTR(#word__, #len - 5, 6) EQ 'tional'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 6).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'tion').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'enci'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ence').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'anci'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ance').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'izer'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ize').
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'bli'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ble').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'alli'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'al').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'entli'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ent').
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'eli'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'e').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'ousli'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ous').
    END IF.
  ELSE IF #len GE 8 AND SUBSTR(#word__, #len - 6, 7) EQ 'ization'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 7).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ize').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'ation'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ate').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ator'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ate').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'alism'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'al').
    END IF.
  ELSE IF #len GE 8 AND SUBSTR(#word__, #len - 6, 7) EQ 'iveness'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 7).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ive').
    END IF.
  ELSE IF #len GE 8 AND SUBSTR(#word__, #len - 6, 7) EQ 'fulness'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 7).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ful').
    END IF.
  ELSE IF #len GE 8 AND SUBSTR(#word__, #len - 6, 7) EQ 'ousness'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 7).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ous').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'aliti'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'al').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'iviti'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ive').
    END IF.
  ELSE IF #len GE 7 AND SUBSTR(#word__, #len - 5, 6) EQ 'biliti'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 6).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ble').
    END IF.
  END IF.

  * Step 3.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'icate'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ic').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'ative'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'alize'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'al').
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'iciti'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ic').
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ical'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = CONCAT(RTRIM(#stem__), 'ic').
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ful'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ness'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 0.
      COMPUTE #word__ = #stem__.
    END IF.
  END IF.

  * Step 4: the ending is dropped only when the stem measure exceeds 1.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'al'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ance'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ence'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'er'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'ic'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'able'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ible'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ant'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 6 AND SUBSTR(#word__, #len - 4, 5) EQ 'ement'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 5).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 5 AND SUBSTR(#word__, #len - 3, 4) EQ 'ment'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 4).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ent'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ion'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    * The ion ending is only dropped after a stem ending in s or t.
    DO IF ANY(SUBSTR(#stem__, LENGTH(RTRIM(#stem__)), 1), 's', 't').
      !m #stem__ #measure__.
      DO IF #measure__ GT 1.
        COMPUTE #word__ = #stem__.
      END IF.
    END IF.
  ELSE IF #len GE 3 AND SUBSTR(#word__, #len - 1, 2) EQ 'ou'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 2).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ism'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ate'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'iti'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ous'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ive'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  ELSE IF #len GE 4 AND SUBSTR(#word__, #len - 2, 3) EQ 'ize'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 3).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    END IF.
  END IF.

  * Step 5: final e, then a doubled final l.
  COMPUTE #len = LENGTH(RTRIM(#word__)).
  DO IF #len GE 2 AND SUBSTR(#word__, #len, 1) EQ 'e'.
    COMPUTE #stem__ = SUBSTR(#word__, 1, #len - 1).
    !m #stem__ #measure__.
    DO IF #measure__ GT 1.
      COMPUTE #word__ = #stem__.
    ELSE IF #measure__ EQ 1.
      !cvc #stem__ #bool__.
      DO IF NOT #bool__.
        COMPUTE #word__ = #stem__.
      END IF.
    END IF.
  END IF.
  !m #word__ #measure__.
  DO IF #measure__ GT 1.
    !doublec #word__ #bool__.
    DO IF #bool__.
      DO IF SUBSTR(#word__, LENGTH(RTRIM(#word__)), 1) EQ 'l'.
        COMPUTE #word__ = SUBSTR(#word__, 1, LENGTH(RTRIM(#word__)) - 1).
      END IF.
    END IF.
  END IF.
END IF /* length at least 3 */.
COMPUTE !2 = #word__.
!ENDDEFINE.

* cons: inline boolean expression, true when a character is a consonant.
* Arguments:
*   1. string variable holding the word
*   2. numeric position of the character to test.
* The letters a, e, i, o, u are never consonants. The letter y counts as a
* consonant at position 1 or when the preceding character is one of
* a, e, i, o, u, y. Otherwise y counts as a vowel.
* NOTE(review): a y that follows another y is treated as a consonant here,
* which looks like a simplification of the recursive rule in the reference
* algorithm - confirm.

DEFINE !cons(!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
(NOT ANY(SUBSTR(!1, !2, 1), 'a', 'e', 'i', 'o', 'u')
 AND NOT (SUBSTR(!1, !2, 1) EQ 'y' AND !2 GT 1
          AND NOT ANY(SUBSTR(!1, !2 - 1, 1), 'a', 'e', 'i', 'o', 'u', 'y')))
!ENDDEFINE.

* m: measure of a word, the number of vowel-to-consonant transitions.
* Arguments:
*   1. string variable holding the word or stem
*   2. numeric variable that receives the measure.
* The scan always advances its index, so every loop ends on its own
* condition instead of depending on the MXLOOPS limit. The scratch
* variables carry private names so that the length variable of the calling
* macro is left untouched.

DEFINE !m(!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
COMPUTE #mn__ = 0.
COMPUTE #mi__ = 1.
COMPUTE #mlen__ = LENGTH(RTRIM(!1)).
* Skip the leading run of consonants, which does not count.
LOOP IF #mi__ LE #mlen__ AND !cons !1 #mi__.
  COMPUTE #mi__ = #mi__ + 1.
END LOOP.
* Each pass consumes one run of vowels and the run of consonants after it.
LOOP IF #mi__ LE #mlen__.
  LOOP IF #mi__ LE #mlen__ AND NOT !cons !1 #mi__.
    COMPUTE #mi__ = #mi__ + 1.
  END LOOP.
  * A trailing run of vowels does not add to the measure.
  DO IF #mi__ LE #mlen__.
    COMPUTE #mn__ = #mn__ + 1.
    LOOP IF #mi__ LE #mlen__ AND !cons !1 #mi__.
      COMPUTE #mi__ = #mi__ + 1.
    END LOOP.
  END IF.
END LOOP.
COMPUTE !2 = #mn__.
!ENDDEFINE.

* vowelin: sets the output to 1 when the word contains a vowel, else 0.
* Arguments:
*   1. string variable holding the word or stem
*   2. numeric variable that receives the result.

DEFINE !vowelin(!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
COMPUTE !2 = 0.
LOOP #i = 1 TO LENGTH(RTRIM(!1)).
  DO IF NOT !cons !1 #i.
    COMPUTE !2 = 1.
  END IF.
END LOOP IF !2 EQ 1.
!ENDDEFINE.

* doublec: sets the output to 1 when the word ends in a doubled consonant.
* Arguments:
*   1. string variable holding the word
*   2. numeric variable that receives the result.

DEFINE !doublec(!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
DO IF LENGTH(RTRIM(!1)) LT 2.
  COMPUTE !2 = 0.
ELSE.
  DO IF SUBSTR(!1, LENGTH(RTRIM(!1)), 1) NE SUBSTR(!1, LENGTH(RTRIM(!1)) - 1, 1).
    COMPUTE !2 = 0.
  ELSE.
    COMPUTE #pos = LENGTH(RTRIM(!1)).
    COMPUTE !2 = !cons !1 #pos.
  END IF.
END IF.
!ENDDEFINE.

* cvc: sets the output to 1 when the word ends consonant-vowel-consonant
* and the final consonant is not w, x or y. Otherwise the output is 0.
* Arguments:
*   1. string variable holding the word
*   2. numeric variable that receives the result.

DEFINE !cvc(!POSITIONAL !TOKENS(1) /!POSITIONAL !TOKENS(1))
DO IF LENGTH(RTRIM(!1)) LT 3.
  COMPUTE !2 = 0.
ELSE.
  COMPUTE #pos = LENGTH(RTRIM(!1)).
  DO IF NOT !cons !1 #pos.
    COMPUTE !2 = 0.
  ELSE.
    COMPUTE #pos = #pos - 1.
    DO IF !cons !1 #pos.
      COMPUTE !2 = 0.
    ELSE.
      COMPUTE #pos = #pos - 1.
      DO IF NOT !cons !1 #pos.
        COMPUTE !2 = 0.
      ELSE.
        DO IF ANY(SUBSTR(!1, LENGTH(RTRIM(!1)), 1), 'w', 'x', 'y').
          COMPUTE !2 = 0.
        ELSE.
          COMPUTE !2 = 1.
        END IF.
      END IF.
    END IF.
  END IF.
END IF.
!ENDDEFINE.

**** Test with the sample vocabulary.
*DATA LIST LIST FILE='porter/output.txt' /porter (A28).
*SAVE OUTFILE = 'output.sav'.
**DATA LIST LIST FILE='porter/voc.txt' /token (A28).
*!stem token stemmed.
*EXECUTE.
*MATCH FILES FILE = * /FILE = 'output.sav' /KEEP = token porter stemmed.
*COMPUTE diff = stemmed NE porter.
*FREQ diff /STATISTICS = NONE.
*SELECT IF diff.
*LIST.