// Package stemmer implements the Porter stemming algorithm for English.
// This is the original 1980 algorithm; I chose it over Porter2 because it
// is simpler and the TIL corpus is small enough that recall differences
// don't matter in practice.
package stemmer
import "strings"
// Stem returns the Porter stem of word. Input is assumed lowercase.
func Stem(word string) string {
if len(word) <= 2 {
return word
}
w := []byte(word)
w = step1a(w)
w = step1b(w)
w = step1c(w)
w = step2(w)
w = step3(w)
w = step4(w)
w = step5a(w)
w = step5b(w)
return string(w)
}
// StemAll returns the stems for a slice of words.
func StemAll(words []string) []string {
out := make([]string, len(words))
for i, w := range words {
out[i] = Stem(w)
}
return out
}
func isVowel(b byte) bool {
switch b {
case 'a', 'e', 'i', 'o', 'u':
return true
}
return false
}
// consonant reports whether w[i] is a consonant, treating 'y' as a consonant
// when preceded by a vowel (per the Porter rules).
func consonant(w []byte, i int) bool {
if isVowel(w[i]) {
return false
}
if w[i] == 'y' {
if i == 0 {
return true
}
return !consonant(w, i-1)
}
return true
}
// measure counts the number of (VC) patterns in stem.
func measure(stem []byte) int {
n := 0
i := 0
for i < len(stem) && consonant(stem, i) {
i++
}
for i < len(stem) {
for i < len(stem) && !consonant(stem, i) {
i++
}
if i == len(stem) {
break
}
n++
for i < len(stem) && consonant(stem, i) {
i++
}
}
return n
}
func hasVowel(stem []byte) bool {
for i := range stem {
if !consonant(stem, i) {
return true
}
}
return false
}
func endsWith(w []byte, suffix string) bool {
return strings.HasSuffix(string(w), suffix)
}
func replace(w []byte, suf, repl string) []byte {
return append(w[:len(w)-len(suf)], repl...)
}
func step1a(w []byte) []byte {
switch {
case endsWith(w, "sses"):
return replace(w, "sses", "ss")
case endsWith(w, "ies"):
return replace(w, "ies", "i")
case endsWith(w, "ss"):
return w
case endsWith(w, "s"):
return w[:len(w)-1]
}
return w
}
func step1b(w []byte) []byte {
if endsWith(w, "eed") {
if measure(w[:len(w)-3]) > 0 {
return w[:len(w)-1]
}
return w
}
for _, suf := range []string{"ed", "ing"} {
if endsWith(w, suf) {
stem := w[:len(w)-len(suf)]
if hasVowel(stem) {
w = stem
return step1bFix(w)
}
}
}
return w
}
func step1bFix(w []byte) []byte {
switch {
case endsWith(w, "at"):
return replace(w, "at", "ate")
case endsWith(w, "bl"):
return replace(w, "bl", "ble")
case endsWith(w, "iz"):
return replace(w, "iz", "ize")
}
n := len(w)
if n >= 2 && w[n-1] == w[n-2] && !isVowel(w[n-1]) && w[n-1] != 'l' && w[n-1] != 's' && w[n-1] != 'z' {
return w[:n-1]
}
if measure(w) == 1 && n >= 3 && consonant(w, n-3) && !consonant(w, n-2) && consonant(w, n-1) {
last := w[n-1]
if last != 'w' && last != 'x' && last != 'y' {
return append(w, 'e')
}
}
return w
}
func step1c(w []byte) []byte {
if len(w) > 2 && w[len(w)-1] == 'y' && hasVowel(w[:len(w)-1]) {
w[len(w)-1] = 'i'
}
return w
}
var step2Table = [][2]string{
{"ational", "ate"}, {"tional", "tion"},
{"enci", "ence"}, {"anci", "ance"},
{"izer", "ize"}, {"abli", "able"},
{"alli", "al"}, {"entli", "ent"},
{"eli", "e"}, {"ousli", "ous"},
{"ization", "ize"}, {"ation", "ate"},
{"ator", "ate"}, {"alism", "al"},
{"iveness", "ive"}, {"fulness", "ful"},
{"ousness", "ous"}, {"aliti", "al"},
{"iviti", "ive"}, {"biliti", "ble"},
}
func step2(w []byte) []byte {
for _, pair := range step2Table {
if endsWith(w, pair[0]) {
stem := w[:len(w)-len(pair[0])]
if measure(stem) > 0 {
return replace(w, pair[0], pair[1])
}
break
}
}
return w
}
var step3Table = [][2]string{
{"icate", "ic"}, {"ative", ""},
{"alize", "al"}, {"iciti", "ic"},
{"ical", "ic"}, {"ful", ""},
{"ness", ""},
}
func step3(w []byte) []byte {
for _, pair := range step3Table {
if endsWith(w, pair[0]) {
stem := w[:len(w)-len(pair[0])]
if measure(stem) > 0 {
return replace(w, pair[0], pair[1])
}
break
}
}
return w
}
var step4Table = []string{
"al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
"ment", "ent", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
}
func step4(w []byte) []byte {
for _, suf := range step4Table {
if endsWith(w, suf) {
stem := w[:len(w)-len(suf)]
if measure(stem) > 1 {
if suf == "ion" && len(stem) > 0 && stem[len(stem)-1] != 's' && stem[len(stem)-1] != 't' {
return w
}
return stem
}
break
}
}
return w
}
func step5a(w []byte) []byte {
if len(w) < 2 || w[len(w)-1] != 'e' {
return w
}
stem := w[:len(w)-1]
m := measure(stem)
if m > 1 || (m == 1 && !isCVC(stem)) {
return stem
}
return w
}
func step5b(w []byte) []byte {
n := len(w)
if n >= 2 && w[n-1] == 'l' && w[n-2] == 'l' && measure(w) > 1 {
return w[:n-1]
}
return w
}
func isCVC(w []byte) bool {
n := len(w)
if n < 3 {
return false
}
if !consonant(w, n-1) || consonant(w, n-2) || !consonant(w, n-3) {
return false
}
last := w[n-1]
return last != 'w' && last != 'x' && last != 'y'
}