## PORTER'S STEMMING ALGORITHM implemented in TCL ## ## Author: Aris Theodorakos ## E-mail: artheo@iit.demokritos.gr ## Date: 6/11/2006 ## ## Based on the modified version of Porter's stemming algorithm posted at: ## http://www.tartarus.org/martin/PorterStemmer/ ## This code has been tested with the sample vocabulary found in the above site ## and has given the same output results for all words. ## ## To stem a single word, use the procedure stem ## ex. puts stdout [stem tanned] ## To stem a text file, call the procedure stemfile ## ex. stemfile voc.txt output.txt ## ## You can send me by e-mail any comments, queries or detected bugs. ## ## NOTE: ## ===== ## This implementation does NOT use global variables in order to have ## namespace independent procedures and simplicity in algorithm design. #################################################################################################### ## Proc: cons #################################################################################################### ## Syntax: cons ## Returns 1 if the letter of the at position is a consonant, ## 0 if it is a vowel. counts from 0... proc cons { word i } { switch -- [string index $word $i] { A - a - E - e - I - i - O - o - U - u { return 0 } Y - y { if { $i == 0 } { return 1 } else { incr i -1 return [expr {![cons $word $i]}] } } default { return 1 } } };# cons #################################################################################################### ## Proc: m #################################################################################################### ## Syntax: m ## Returns the measure of the , which is a natural number including 0. ## For definition of measure, see: http://www.tartarus.org/martin/PorterStemmer/def.txt proc m { word } { set n 0 set i 0 set len [string length $word] incr len -1 while { [cons $word $i] } { if { $i == $len } { return 0 } incr i } while { 1 } { while { ![cons $word $i] } { if { $i == $len } { return $n } incr i } incr n while { [cons $word $i] } { if { $i == $len } { return $n } incr i } } };# m #################################################################################################### ## Proc: vowelin #################################################################################################### ## Syntax: vowelin ## Returns 1 if the contains at least one vowel, 0 otherwise. proc vowelin { word } { set len [string length $word] for { set i 0 } { $i < $len } { incr i } { if { ![cons $word $i] } { return 1 } } return 0; };# vowelin #################################################################################################### ## Proc: doublec #################################################################################################### ## Syntax: doublec ## Returns 1 if the ends with double consonant, 0 otherwise. proc doublec { word } { set wordlen [string length $word] if { $wordlen < 2 } { return 0 } if { [string index $word end] != [string index $word end-1] } { return 0 } else { return [cons $word [expr {$wordlen-1}]] } };# doublec #################################################################################################### ## Proc: cvc #################################################################################################### ## Syntax: cvc ## Returns 1 if the ends with the combination: Consonant-Vowel-Consonant, ## where the last consonant is not w, x or y. Otherwise it returns 0 proc cvc { word } { set pos [string length $word] if { $pos < 3 } { return 0 } incr pos -1 if { ![cons $word $pos] || [cons $word [expr {$pos-1}]] \ || ![cons $word [expr {$pos-2}]]} { return 0 } switch -- [string index $word $pos] { w - x - y { return 0 } default { return 1 } } };# cvc #################################################################################################### ## Proc: step1ab #################################################################################################### ## Syntax: step1ab ## Returns the converted after the steps 1a and 1b of Porter's algorithm. proc step1ab { word } { switch -glob -- $word { *?sses { set stem [string range $word 0 end-4]; set word ${stem}ss } *?ies { set stem [string range $word 0 end-3]; set word ${stem}i } *?ss { } *??s { set stem [string range $word 0 end-1]; set word ${stem} } default { } } set goto1b 0 switch -glob -- $word { *?eed { set stem [string range $word 0 end-3] if { [m $stem] } { set word ${stem}ee } } *?ed { set stem [string range $word 0 end-2] if { [vowelin $stem] } { set goto1b 1 set word ${stem} } } *?ing { set stem [string range $word 0 end-3] if { [vowelin $stem] } { set goto1b 1 set word ${stem} } } default { } } if { $goto1b == 1 } { switch -glob -- $word { *?at { set stem [string range $word 0 end-2]; set word ${stem}ate } *?bl { set stem [string range $word 0 end-2]; set word ${stem}ble } *?iz { set stem [string range $word 0 end-2]; set word ${stem}ize } default { if { [doublec $word] } { switch -glob -- $word { *l - *s - *z { } default { set word [string range $word 0 end-1] } } } elseif { [m $word] == 1 && [cvc $word] } { append word e } } } } return $word };# step1ab #################################################################################################### ## Proc: step1c #################################################################################################### ## Syntax: step1c ## Returns the converted after the step 1c of Porter's algorithm. proc step1c { word } { if { [string match *?y $word] } { set stem [string range $word 0 end-1] if { [vowelin $stem] } { set word ${stem}i } } return $word };# step1c #################################################################################################### ## Proc: step2 #################################################################################################### ## Syntax: step2 ## Returns the converted after the step 2 of Porter's algorithm. proc step2 { word } { switch -glob -- $word { *?logi { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}log } } *?ational { set stem [string range $word 0 end-7]; if { [m $stem] } { set word ${stem}ate } } *?tional { set stem [string range $word 0 end-6]; if { [m $stem] } { set word ${stem}tion } } *?enci { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}ence } } *?anci { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}ance } } *?izer { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}ize } } *?bli { set stem [string range $word 0 end-3]; if { [m $stem] } { set word ${stem}ble } } *?alli { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}al } } *?entli { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ent } } *?eli { set stem [string range $word 0 end-3]; if { [m $stem] } { set word ${stem}e } } *?ousli { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ous } } *?ization { set stem [string range $word 0 end-7]; if { [m $stem] } { set word ${stem}ize } } *?ation { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ate } } *?ator { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}ate } } *?alism { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}al } } *?iveness { set stem [string range $word 0 end-7]; if { [m $stem] } { set word ${stem}ive } } *?fulness { set stem [string range $word 0 end-7]; if { [m $stem] } { set word ${stem}ful } } *?ousness { set stem [string range $word 0 end-7]; if { [m $stem] } { set word ${stem}ous } } *?aliti { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}al } } *?iviti { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ive } } *?biliti { set stem [string range $word 0 end-6]; if { [m $stem] } { set word ${stem}ble } } default { } } return $word };# step2 #################################################################################################### ## Proc: step3 #################################################################################################### ## Syntax: step3 ## Returns the converted after the step 3 of Porter's algorithm. proc step3 { word } { switch -glob -- $word { *?icate { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ic } } *?ative { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem} } } *?alize { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}al } } *?iciti { set stem [string range $word 0 end-5]; if { [m $stem] } { set word ${stem}ic } } *?ical { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem}ic } } *?ful { set stem [string range $word 0 end-3]; if { [m $stem] } { set word ${stem} } } *?ness { set stem [string range $word 0 end-4]; if { [m $stem] } { set word ${stem} } } default { } } return $word };# step3 #################################################################################################### ## Proc: step4 #################################################################################################### ## Syntax: step4 ## Returns the converted after the step 4 of Porter's algorithm. proc step4 { word } { switch -glob -- $word { *?al { set stem [string range $word 0 end-2]; if { [m $stem]>1 } { set word ${stem} } } *?ance { set stem [string range $word 0 end-4]; if { [m $stem]>1 } { set word ${stem} } } *?ence { set stem [string range $word 0 end-4]; if { [m $stem]>1 } { set word ${stem} } } *?er { set stem [string range $word 0 end-2]; if { [m $stem]>1 } { set word ${stem} } } *?ic { set stem [string range $word 0 end-2]; if { [m $stem]>1 } { set word ${stem} } } *?able { set stem [string range $word 0 end-4]; if { [m $stem]>1 } { set word ${stem} } } *?ible { set stem [string range $word 0 end-4]; if { [m $stem]>1 } { set word ${stem} } } *?ant { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ement { set stem [string range $word 0 end-5]; if { [m $stem]>1 } { set word ${stem} } } *?ment { set stem [string range $word 0 end-4]; if { [m $stem]>1 } { set word ${stem} } } *?ent { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ion { set stem [string range $word 0 end-3] if { [string match *s $stem] || [string match *t $stem] } { if { [m $stem] > 1 } { set word ${stem} } } } *?ou { set stem [string range $word 0 end-2]; if { [m $stem]>1 } { set word ${stem} } } *?ism { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ate { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?iti { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ous { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ive { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } *?ize { set stem [string range $word 0 end-3]; if { [m $stem]>1 } { set word ${stem} } } default { } } return $word };# step4 #################################################################################################### ## Proc: step5 #################################################################################################### ## Syntax: step5 ## Returns the converted after the step 5 of Porter's algorithm. proc step5 { word } { if { [string match *?e $word] } { set stem [string range $word 0 end-1] if { [m $stem] > 1 } { set word $stem } \ elseif { [m $stem] == 1 && ![cvc $stem] } { set word $stem } } if { [m $word] > 1 && [doublec $word] } { if { [string match *l $word] } { set word [string range $word 0 end-1] } } return $word };# step5 #################################################################################################### ## Proc: stem #################################################################################################### ## Syntax: stem ## It "passes" the through all the above steps and finally returns its stem. ## It is at the user's responsibility to convert the to lowercase before ## calling this procedure. proc stem { word } { if { [string length $word] < 3 } { return $word } return [step5 [step4 [step3 [step2 [step1c [step1ab $word]]]]]] };# stem #################################################################################################### ## Proc: stemfile #################################################################################################### ## Syntax: stemfile ## It copies the contents of to ## after replacing every english word by its stem. proc stemfile { input_file output_file } { if [catch {open $input_file r} filein] { error "Cannot open file \"$input_file\"!" } if [catch {open $output_file w} fileout] { error "Cannot open file \"$output_file\" for writing!" } set inword 0 while { ![eof $filein] } { set ch [read $filein 1] if { $inword == 0 } { if { [string match {[a-zA-Z]} $ch] } { set word $ch set inword 1 } else { puts -nonewline $fileout $ch } } \ else { if { [string match {[a-zA-Z]} $ch] } { append word $ch } else { puts -nonewline $fileout [stem [string tolower $word]]$ch set inword 0 } } } if { $inword == 1 } { puts -nonewline $fileout [stem $word] } close $filein close $fileout };# stemfile