using System; namespace Poseidon.Analysis { /// /// The Stemmer class transforms a word into its root form. /// Implementing the Porter Stemming Algorithm /// /// /// Modified from: http://tartarus.org/martin/PorterStemmer/csharp2.txt /// /// /// var stemmer = new PorterStemmer(); /// var stem = stemmer.StemWord(word); /// public class PorterStemmer { // The passed in word turned into a char array. // Quicker to use to rebuilding strings each time a change is made. private char[] wordArray; // Current index to the end of the word in the character array. This will // change as the end of the string gets modified. private int endIndex; // Index of the (potential) end of the stem word in the char array. private int stemIndex; /// /// Stem the passed in word. /// /// Word to evaluate /// public string StemWord(string word) { // Do nothing for empty strings or short words. if (string.IsNullOrWhiteSpace(word) || word.Length <= 2) return word; wordArray = word.ToCharArray(); stemIndex = 0; endIndex = word.Length - 1; Step1(); Step2(); Step3(); Step4(); Step5(); Step6(); var length = endIndex + 1; return new String(wordArray, 0, length); } // Step1() gets rid of plurals and -ed or -ing. /* Examples: caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat feed -> feed agreed -> agree disabled -> disable matting -> mat mating -> mate meeting -> meet milling -> mill messing -> mess meetings -> meet */ private void Step1() { // If the word ends with s take that off if (wordArray[endIndex] == 's') { if (EndsWith("sses")) { endIndex -= 2; } else if (EndsWith("ies")) { SetEnd("i"); } else if (wordArray[endIndex - 1] != 's') { endIndex--; } } if (EndsWith("eed")) { if (MeasureConsontantSequence() > 0) endIndex--; } else if ((EndsWith("ed") || EndsWith("ing")) && VowelInStem()) { endIndex = stemIndex; if (EndsWith("at")) SetEnd("ate"); else if (EndsWith("bl")) SetEnd("ble"); else if (EndsWith("iz")) SetEnd("ize"); else if (IsDoubleConsontant(endIndex)) { endIndex--; int ch = wordArray[endIndex]; if (ch == 'l' || ch == 's' || ch == 'z') endIndex++; } else if (MeasureConsontantSequence() == 1 && IsCVC(endIndex)) SetEnd("e"); } } // Step2() turns terminal y to i when there is another vowel in the stem. private void Step2() { if (EndsWith("y") && VowelInStem()) wordArray[endIndex] = 'i'; } // Step3() maps double suffices to single ones. so -ization ( = -ize plus // -ation) maps to -ize etc. note that the string before the suffix must give m() > 0. private void Step3() { if (endIndex == 0) return; /* For Bug 1 */ switch (wordArray[endIndex - 1]) { case 'a': if (EndsWith("ational")) { ReplaceEnd("ate"); break; } if (EndsWith("tional")) { ReplaceEnd("tion"); } break; case 'c': if (EndsWith("enci")) { ReplaceEnd("ence"); break; } if (EndsWith("anci")) { ReplaceEnd("ance"); } break; case 'e': if (EndsWith("izer")) { ReplaceEnd("ize"); } break; case 'l': if (EndsWith("bli")) { ReplaceEnd("ble"); break; } if (EndsWith("alli")) { ReplaceEnd("al"); break; } if (EndsWith("entli")) { ReplaceEnd("ent"); break; } if (EndsWith("eli")) { ReplaceEnd("e"); break; } if (EndsWith("ousli")) { ReplaceEnd("ous"); } break; case 'o': if (EndsWith("ization")) { ReplaceEnd("ize"); break; } if (EndsWith("ation")) { ReplaceEnd("ate"); break; } if (EndsWith("ator")) { ReplaceEnd("ate"); } break; case 's': if (EndsWith("alism")) { ReplaceEnd("al"); break; } if (EndsWith("iveness")) { ReplaceEnd("ive"); break; } if (EndsWith("fulness")) { ReplaceEnd("ful"); break; } if (EndsWith("ousness")) { ReplaceEnd("ous"); } break; case 't': if (EndsWith("aliti")) { ReplaceEnd("al"); break; } if (EndsWith("iviti")) { ReplaceEnd("ive"); break; } if (EndsWith("biliti")) { ReplaceEnd("ble"); } break; case 'g': if (EndsWith("logi")) { ReplaceEnd("log"); } break; } } /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ private void Step4() { switch (wordArray[endIndex]) { case 'e': if (EndsWith("icate")) { ReplaceEnd("ic"); break; } if (EndsWith("ative")) { ReplaceEnd(""); break; } if (EndsWith("alize")) { ReplaceEnd("al"); } break; case 'i': if (EndsWith("iciti")) { ReplaceEnd("ic"); } break; case 'l': if (EndsWith("ical")) { ReplaceEnd("ic"); break; } if (EndsWith("ful")) { ReplaceEnd(""); } break; case 's': if (EndsWith("ness")) { ReplaceEnd(""); } break; } } /* step5() takes off -ant, -ence etc., in context vcvc. */ private void Step5() { if (endIndex == 0) return; switch (wordArray[endIndex - 1]) { case 'a': if (EndsWith("al")) break; return; case 'c': if (EndsWith("ance")) break; if (EndsWith("ence")) break; return; case 'e': if (EndsWith("er")) break; return; case 'i': if (EndsWith("ic")) break; return; case 'l': if (EndsWith("able")) break; if (EndsWith("ible")) break; return; case 'n': if (EndsWith("ant")) break; if (EndsWith("ement")) break; if (EndsWith("ment")) break; /* element etc. not stripped before the m */ if (EndsWith("ent")) break; return; case 'o': if (EndsWith("ion") && stemIndex >= 0 && (wordArray[stemIndex] == 's' || wordArray[stemIndex] == 't')) break; /* j >= 0 fixes Bug 2 */ if (EndsWith("ou")) break; return; /* takes care of -ous */ case 's': if (EndsWith("ism")) break; return; case 't': if (EndsWith("ate")) break; if (EndsWith("iti")) break; return; case 'u': if (EndsWith("ous")) break; return; case 'v': if (EndsWith("ive")) break; return; case 'z': if (EndsWith("ize")) break; return; default: return; } if (MeasureConsontantSequence() > 1) endIndex = stemIndex; } /* step6() removes a final -e if m() > 1. */ private void Step6() { stemIndex = endIndex; if (wordArray[endIndex] == 'e') { var a = MeasureConsontantSequence(); if (a > 1 || a == 1 && !IsCVC(endIndex - 1)) endIndex--; } if (wordArray[endIndex] == 'l' && IsDoubleConsontant(endIndex) && MeasureConsontantSequence() > 1) endIndex--; } // Returns true if the character at the specified index is a consonant. // With special handling for 'y'. private bool IsConsonant(int index) { var c = wordArray[index]; if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') return false; return c != 'y' || (index == 0 || !IsConsonant(index - 1)); } /* m() measures the number of consonant sequences between 0 and j. if c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence, gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 .... */ private int MeasureConsontantSequence() { var n = 0; var index = 0; while (true) { if (index > stemIndex) return n; if (!IsConsonant(index)) break; index++; } index++; while (true) { while (true) { if (index > stemIndex) return n; if (IsConsonant(index)) break; index++; } index++; n++; while (true) { if (index > stemIndex) return n; if (!IsConsonant(index)) break; index++; } index++; } } // Return true if there is a vowel in the current stem (0 ... stemIndex) private bool VowelInStem() { int i; for (i = 0; i <= stemIndex; i++) { if (!IsConsonant(i)) return true; } return false; } // Returns true if the char at the specified index and the one preceeding it are the same consonants. private bool IsDoubleConsontant(int index) { if (index < 1) return false; return wordArray[index] == wordArray[index - 1] && IsConsonant(index); } /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant and also if the second c is not w,x or y. this is used when trying to restore an e at the end of a short word. e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray. */ private bool IsCVC(int index) { if (index < 2 || !IsConsonant(index) || IsConsonant(index - 1) || !IsConsonant(index - 2)) return false; var c = wordArray[index]; return c != 'w' && c != 'x' && c != 'y'; } // Does the current word array end with the specified string. private bool EndsWith(string s) { var length = s.Length; var index = endIndex - length + 1; if (index < 0) return false; for (var i = 0; i < length; i++) { if (wordArray[index + i] != s[i]) return false; } stemIndex = endIndex - length; return true; } // Set the end of the word to s. // Starting at the current stem pointer and readjusting the end pointer. private void SetEnd(string s) { var length = s.Length; var index = stemIndex + 1; for (var i = 0; i < length; i++) { wordArray[index + i] = s[i]; } // Set the end pointer to the new end of the word. endIndex = stemIndex + length; } // Conditionally replace the end of the word private void ReplaceEnd(string s) { if (MeasureConsontantSequence() > 0) SetEnd(s); } } }