/************************************************************************** * FPorterStemmer.d - 29-Mai-2007 * - Filter for perferming word stemming based on the Porter algorithm * * Copyright (c) 2007 Daniel Truemper (daniel-truemper at gmx.de) * **************************************************************************/ /** * Note: * * This implementation was converted from the python implementation * made by Vivake Gupta (v@nano.com) so all credit belongs to him! * * ---- original comment * * This is the Porter stemming algorithm, ported to Python from the * version coded up in ANSI C by the author. It may be be regarded * as canonical, in that it follows the algorithm presented in * * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, * no. 3, pp 130-137, * * only differing from it at the points maked --DEPARTURE-- below. * * See also http://www.tartarus.org/~martin/PorterStemmer * * The algorithm as described in the paper could be exactly replicated * by adjusting the points of DEPARTURE, but this is barely necessary, * because (a) the points of DEPARTURE are definitely improvements, and * (b) no encoding of the Porter stemmer I have seen is anything like * as exact as this version, even with the points of DEPARTURE! * * Vivake Gupta (v@nano.com) * * Release 1: January 2001 * * ---- end of original comment * */ module PorterStemmer; public class PorterStemmer { private { char[] m_b; // buffer for the word int m_k = 0; int m_k0 = 0; int m_j = 0; // offset within the string } /** * constructor */ public this() { } /** * Destructor */ public ~this() { } /** * cons returns true, if b[i] is a consonant */ private bool cons( int i ) { if( m_b[i] == 'a' || m_b[i] == 'e' || m_b[i] == 'i' || m_b[i] == 'o' || m_b[i] == 'u' ) return false; if( m_b[i] == 'y' ) { if( i == m_k0 ) { return true; } else { return !cons( i - 1 ); } } return true; } /** * measures the number of consonant sequences between k0 and j. * if c is a consonant sequence and v a vowel sequence, and <..> * indicates arbitrary presence, * * gives 0 * vc gives 1 * vcvc gives 2 * vcvcvc gives 3 * */ private int m() { int n = 0; int i = m_k0; while( true ) { if( i > m_j ) { return n; } if( !cons( i ) ) { break; } i++; } i++; while( true ) { while( true ) { if( i > m_j ) { return n; } if( cons( i ) ) { break; } i++; } i++; n++; while( true ) { if( i > m_j ) { return n; } if( !cons( i ) ) { break; } i++; } i++; } } /** * returns true if k0...j contains a vowel */ private bool vowelinstem() { for( int i = m_k0; i < m_j + 1; i++ ) { if( !cons( i ) ) return true; } return false; } /** * returns true if j, j-1 contains a double consonant */ private bool doublec( int j ) { if( j < ( m_k0 + 1 ) ) return false; if( m_b[j] != m_b[j-1] ) return false; return cons( j ); } /** * is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant * and also if the second c is not w,x or y. this is used when trying to * restore an e at the end of a short e.g. * * cav(e), lov(e), hop(e), crim(e), but * snow, box, tray. * */ private bool cvc( int i ) { if( i < ( m_k0 + 2 ) || !cons( i ) || cons( i-1 ) || !cons( i-2 ) ) return false; if( m_b[i] == 'w' || m_b[i] == 'x' || m_b[i] == 'y' ) return false; return true; } /** * ends(s) is TRUE <=> k0,...k ends with the string s. */ private bool ends( char[] s ) { int len = s.length; if( s[ len - 1 ] != m_b[ m_k ] ) return false; if( len > ( m_k - m_k0 + 1 ) ) return false; int a = m_k - len + 1; int b = m_k + 1; if( m_b[a..b] != s ) { return false; } m_j = m_k - len; return true; } /** * setto(s) sets (j+1),...k to the characters in the string s, readjusting k. */ private void setto( char[] s ) { m_b = m_b[0..m_j+1] ~ s ~ m_b[ m_j + s.length + 1 .. m_b.length ]; m_k = m_j + s.length; } /** * used further down */ private void r( char[] s ) { if( m() > 0 ) setto( s ); } /** * step1ab() gets rid of plurals and -ed or -ing. e.g. * * caresses -> caress * ponies -> poni * ties -> ti * caress -> caress * cats -> cat * * feed -> feed * agreed -> agree * disabled -> disable * * matting -> mat * mating -> mate * meeting -> meet * milling -> mill * messing -> mess * * meetings -> meet * */ private void step1ab() { if( m_b[ m_k ] == 's' ) if( ends( "sses" ) ) m_k = m_k - 2; else if( ends( "ies" ) ) setto( "i" ); else if( m_b[ m_k - 1 ] != 's' ) m_k--; if( ends( "eed" ) ) { if( m() > 0 ) m_k--; } else if( ( ends( "ed" ) || ends( "ing" ) ) && vowelinstem() ) { m_k = m_j; if( ends( "at" ) ) { setto( "ate" ); } else if( ends( "bl" ) ) { setto( "ble" ); } else if( ends( "iz" ) ) { setto( "ize" ); } else if( doublec( m_k ) ) { m_k--; if( m_b[ m_k ] == 'l' || m_b[ m_k ] == 's' || m_b[ m_k ] == 'z' ) m_k++; } else if( m() == 1 && cvc( m_k ) ) { setto( "e" ); } } } /** * step1c() turns terminal y to i when there is another vowel in the stem. */ private void step1c() { if( ends( "y" ) && vowelinstem() ) { m_b = m_b[0..m_k] ~ 'i' ~ m_b[ m_k+1 .. m_b.length ]; } } /** * step2() maps double suffices to single ones. * so -ization ( = -ize plus -ation) maps to -ize etc. note that the * string before the suffix must give m() > 0.* */ private void step2() { if( m_b[ m_k - 1 ] == 'a' ) { if( ends( "ational" ) ) r( "ate" ); else if( ends( "tional" ) ) r( "tion" ); } else if( m_b[ m_k - 1 ] == 'c' ) { if( ends( "enci" ) ) r( "ence" ); else if( ends( "anci" ) ) r( "ance" ); } else if( m_b[ m_k - 1 ] == 'e' ) { if( ends( "izer" ) ) r( "ize" ); } else if( m_b[ m_k - 1 ] == 'l' ) { if( ends( "bli" ) ) r( "ble" ); /* --DEPARTURE-- * To match the published algorithm, replace this phrase with * if( ends( "abli" ) ) * r( "able" ); */ else if( ends( "alli" ) ) r( "al" ); else if( ends( "entli" ) ) r( "ent" ); else if( ends( "eli" ) ) r( "e" ); else if( ends( "ousli" ) ) r( "ous" ); } else if( m_b[ m_k - 1 ] == 'o' ) { if( ends( "ization" ) ) r( "ize" ); else if( ends( "ation" ) || ends( "ator" ) ) r( "ate" ); } else if( m_b[ m_k - 1 ] == 's' ) { if( ends( "alism" ) ) r( "al" ); else if( ends( "iveness" ) ) r( "ive" ); else if( ends( "fulness" ) ) r( "ful" ); else if( ends( "ousness" ) ) r( "ous" ); } else if( m_b[ m_k - 1 ] == 't' ) { if( ends( "aliti" ) ) r( "al" ); else if( ends( "iviti" ) ) r( "ive" ); else if( ends( "biliti" ) ) r( "ble" ); } else if( m_b[ m_k - 1 ] == 'g' ) { /** * --DEPARTURE-- * To match the published algorithm, delete this phrase */ if( ends( "logi" ) ) r( "log" ); } } /** * step3() dels with -ic-, -full, -ness etc. similar strategy to step2. */ private void step3() { if( m_b[ m_k ] == 'e' ) { if( ends( "icate" ) ) r( "ic" ); else if( ends( "ative" ) ) r( "" ); else if( ends( "alize" ) ) r( "al" ); }else if( m_b[ m_k ] == 'i' ) { if( ends( "iciti" ) ) r( "ic" ); }else if( m_b[ m_k ] == 'l' ) { if( ends( "ical" ) ) r( "ic" ); else if( ends( "ful" ) ) r( "" ); }else if( m_b[ m_k ] == 's' ) { if( ends( "ness" ) ) r( "" ); } } /** * step4() takes off -ant, -ence etc., in context vcvc. */ private void step4() { /* fixes bug 1 */ if( m_k == 0 ) return; switch( m_b[ m_k - 1 ] ) { case 'a': if( ends( "al" ) ) break; return; case 'c': if( ends( "ance" ) || ends( "ence" ) ) break; return; case 'e': if( ends( "er" ) ) break; return; case 'i': if( ends( "ic" ) ) break; return; case 'l': if( ends( "able" ) || ends( "ible" ) ) break; return; case 'n': if( ends( "ant" ) || ends( "ement" ) || ends( "ment" ) || ends( "ent" ) ) break; return; case 'o': if( ends( "ion" ) && m_j >= 0 && ( m_b[ m_j ] == 's' || m_b[ m_j ] == 't' ) ) { /* m_j >= 0 fixes bug 2 */ break; } if( ends( "ou" ) ) break; return; case 's': if( ends( "ism" ) ) break; return; case 't': if( ends( "ate" ) || ends( "iti" ) ) break; return; case 'u': if( ends( "ous" ) ) break; return; case 'v': if( ends( "ive" ) ) break; return; case 'z': if( ends( "ize" ) ) break; return; default: return; } if( m() > 1 ) m_k = m_j; } /** * step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1. */ private void step5() { m_j = m_k; if( m_b[ m_k ] == 'e' ) { auto a = m(); if( a > 1 || (a == 1 && !cvc( m_k - 1 ) ) ) m_k--; } if( m_b[ m_k ] == 'l' && doublec( m_k ) && m() > 1 ) m_k--; } /** * In stem(p,i,j), p is a char pointer, and the string to be stemmed * is from p[i] to p[j] inclusive. Typically i is zero and j is the * offset to the last character of a string, (p[j+1] == '\0'). The * stemmer adjusts the characters p[i] ... p[j] and returns the new * end-point of the string, k. Stemming never increases word length, so * i <= k <= j. To turn the stemmer into a module, declare 'stem' as * extern, and delete the remainder of this file. */ public char[] stem( char[] p, int i, int j ) { m_b = p; m_k = j; m_k0 = 0; /** * --DEPARTURE-- * * With this line, strings of length 1 or 2 don't go through the * stemming process, although no mention is made of this in the * published algorithm. Remove the line to match the published * algorithm. * */ if( m_k <= m_k0 + 1 ) return m_b; step1ab(); step1c(); step2(); step3(); step4(); step5(); return m_b[ m_k0 .. m_k + 1 ]; } } int main( char[][]args ) { // usage: PorterStemmer stemmer = new PorterStemmer(); char[] toStem = ""; char[] stemmed = stemmer.stem( toStem, 0, toStem.length - 1 ); // do many interesting things with the stemmed word delete stemmer; return 0; }