// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original // paper, in // // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, // no. 3, pp 130-137, // // see also http://www.tartarus.org/~martin/PorterStemmer // Release 1 be 'andargor', Jul 2004 // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009 var stemmer = (function(){ var step2list = { "ational" : "ate", "tional" : "tion", "enci" : "ence", "anci" : "ance", "izer" : "ize", "bli" : "ble", "alli" : "al", "entli" : "ent", "eli" : "e", "ousli" : "ous", "ization" : "ize", "ation" : "ate", "ator" : "ate", "alism" : "al", "iveness" : "ive", "fulness" : "ful", "ousness" : "ous", "aliti" : "al", "iviti" : "ive", "biliti" : "ble", "logi" : "log" }, step3list = { "icate" : "ic", "ative" : "", "alize" : "al", "iciti" : "ic", "ical" : "ic", "ful" : "", "ness" : "" }, c = "[^aeiou]", // consonant v = "[aeiouy]", // vowel C = c + "[^aeiouy]*", // consonant sequence V = v + "[aeiou]*", // vowel sequence mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 s_v = "^(" + C + ")?" + v; // vowel in stem return function (w) { var stem, suffix, firstch, re, re2, re3, re4, origword = w; if (w.length < 3) { return w; } firstch = w.substr(0,1); if (firstch == "y") { w = firstch.toUpperCase() + w.substr(1); } // Step 1a re = /^(.+?)(ss|i)es$/; re2 = /^(.+?)([^s])s$/; if (re.test(w)) { w = w.replace(re,"$1$2"); } else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } // Step 1b re = /^(.+?)eed$/; re2 = /^(.+?)(ed|ing)$/; if (re.test(w)) { var fp = re.exec(w); re = new RegExp(mgr0); if (re.test(fp[1])) { re = /.$/; w = w.replace(re,""); } } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1]; re2 = new RegExp(s_v); if (re2.test(stem)) { w = stem; re2 = /(at|bl|iz)$/; re3 = new RegExp("([^aeiouylsz])\\1$"); re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re2.test(w)) { w = w + "e"; } else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } else if (re4.test(w)) { w = w + "e"; } } } // Step 1c re = /^(.+?)y$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(s_v); if (re.test(stem)) { w = stem + "i"; } } // Step 2 re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) { w = stem + step2list[suffix]; } } // Step 3 re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) { w = stem + step3list[suffix]; } } // Step 4 re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; re2 = /^(.+?)(s|t)(ion)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); if (re.test(stem)) { w = stem; } } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1] + fp[2]; re2 = new RegExp(mgr1); if (re2.test(stem)) { w = stem; } } // Step 5 re = /^(.+?)e$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); re2 = new RegExp(meq1); re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { w = stem; } } re = /ll$/; re2 = new RegExp(mgr1); if (re.test(w) && re2.test(w)) { re = /.$/; w = w.replace(re,""); } // and turn initial Y back to y if (firstch == "y") { w = firstch.toLowerCase() + w.substr(1); } return w; } })();