internal/index/stemmer/porter.go

// Package stemmer implements the Porter stemming algorithm for English.
// This is the original 1980 algorithm; I chose it over Porter2 because it
// is simpler and the TIL corpus is small enough that recall differences
// don't matter in practice.
package stemmer

import "strings"

// Stem returns the Porter stem of word. Input is assumed lowercase.
func Stem(word string) string {
	if len(word) <= 2 {
		return word
	}
	w := []byte(word)
	w = step1a(w)
	w = step1b(w)
	w = step1c(w)
	w = step2(w)
	w = step3(w)
	w = step4(w)
	w = step5a(w)
	w = step5b(w)
	return string(w)
}

// StemAll returns the stems for a slice of words.
func StemAll(words []string) []string {
	out := make([]string, len(words))
	for i, w := range words {
		out[i] = Stem(w)
	}
	return out
}

func isVowel(b byte) bool {
	switch b {
	case 'a', 'e', 'i', 'o', 'u':
		return true
	}
	return false
}

// consonant reports whether w[i] is a consonant, treating 'y' as a consonant
// when preceded by a vowel (per the Porter rules).
func consonant(w []byte, i int) bool {
	if isVowel(w[i]) {
		return false
	}
	if w[i] == 'y' {
		if i == 0 {
			return true
		}
		return !consonant(w, i-1)
	}
	return true
}

// measure counts the number of (VC) patterns in stem.
func measure(stem []byte) int {
	n := 0
	i := 0
	for i < len(stem) && consonant(stem, i) {
		i++
	}
	for i < len(stem) {
		for i < len(stem) && !consonant(stem, i) {
			i++
		}
		if i == len(stem) {
			break
		}
		n++
		for i < len(stem) && consonant(stem, i) {
			i++
		}
	}
	return n
}

func hasVowel(stem []byte) bool {
	for i := range stem {
		if !consonant(stem, i) {
			return true
		}
	}
	return false
}

func endsWith(w []byte, suffix string) bool {
	return strings.HasSuffix(string(w), suffix)
}

func replace(w []byte, suf, repl string) []byte {
	return append(w[:len(w)-len(suf)], repl...)
}

func step1a(w []byte) []byte {
	switch {
	case endsWith(w, "sses"):
		return replace(w, "sses", "ss")
	case endsWith(w, "ies"):
		return replace(w, "ies", "i")
	case endsWith(w, "ss"):
		return w
	case endsWith(w, "s"):
		return w[:len(w)-1]
	}
	return w
}

func step1b(w []byte) []byte {
	if endsWith(w, "eed") {
		if measure(w[:len(w)-3]) > 0 {
			return w[:len(w)-1]
		}
		return w
	}
	for _, suf := range []string{"ed", "ing"} {
		if endsWith(w, suf) {
			stem := w[:len(w)-len(suf)]
			if hasVowel(stem) {
				w = stem
				return step1bFix(w)
			}
		}
	}
	return w
}

func step1bFix(w []byte) []byte {
	switch {
	case endsWith(w, "at"):
		return replace(w, "at", "ate")
	case endsWith(w, "bl"):
		return replace(w, "bl", "ble")
	case endsWith(w, "iz"):
		return replace(w, "iz", "ize")
	}
	n := len(w)
	if n >= 2 && w[n-1] == w[n-2] && !isVowel(w[n-1]) && w[n-1] != 'l' && w[n-1] != 's' && w[n-1] != 'z' {
		return w[:n-1]
	}
	if measure(w) == 1 && n >= 3 && consonant(w, n-3) && !consonant(w, n-2) && consonant(w, n-1) {
		last := w[n-1]
		if last != 'w' && last != 'x' && last != 'y' {
			return append(w, 'e')
		}
	}
	return w
}

func step1c(w []byte) []byte {
	if len(w) > 2 && w[len(w)-1] == 'y' && hasVowel(w[:len(w)-1]) {
		w[len(w)-1] = 'i'
	}
	return w
}

var step2Table = [][2]string{
	{"ational", "ate"}, {"tional", "tion"},
	{"enci", "ence"}, {"anci", "ance"},
	{"izer", "ize"}, {"abli", "able"},
	{"alli", "al"}, {"entli", "ent"},
	{"eli", "e"}, {"ousli", "ous"},
	{"ization", "ize"}, {"ation", "ate"},
	{"ator", "ate"}, {"alism", "al"},
	{"iveness", "ive"}, {"fulness", "ful"},
	{"ousness", "ous"}, {"aliti", "al"},
	{"iviti", "ive"}, {"biliti", "ble"},
}

func step2(w []byte) []byte {
	for _, pair := range step2Table {
		if endsWith(w, pair[0]) {
			stem := w[:len(w)-len(pair[0])]
			if measure(stem) > 0 {
				return replace(w, pair[0], pair[1])
			}
			break
		}
	}
	return w
}

var step3Table = [][2]string{
	{"icate", "ic"}, {"ative", ""},
	{"alize", "al"}, {"iciti", "ic"},
	{"ical", "ic"}, {"ful", ""},
	{"ness", ""},
}

func step3(w []byte) []byte {
	for _, pair := range step3Table {
		if endsWith(w, pair[0]) {
			stem := w[:len(w)-len(pair[0])]
			if measure(stem) > 0 {
				return replace(w, pair[0], pair[1])
			}
			break
		}
	}
	return w
}

var step4Table = []string{
	"al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
	"ment", "ent", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
}

func step4(w []byte) []byte {
	for _, suf := range step4Table {
		if endsWith(w, suf) {
			stem := w[:len(w)-len(suf)]
			if measure(stem) > 1 {
				if suf == "ion" && len(stem) > 0 && stem[len(stem)-1] != 's' && stem[len(stem)-1] != 't' {
					return w
				}
				return stem
			}
			break
		}
	}
	return w
}

func step5a(w []byte) []byte {
	if len(w) < 2 || w[len(w)-1] != 'e' {
		return w
	}
	stem := w[:len(w)-1]
	m := measure(stem)
	if m > 1 || (m == 1 && !isCVC(stem)) {
		return stem
	}
	return w
}

func step5b(w []byte) []byte {
	n := len(w)
	if n >= 2 && w[n-1] == 'l' && w[n-2] == 'l' && measure(w) > 1 {
		return w[:n-1]
	}
	return w
}

func isCVC(w []byte) bool {
	n := len(w)
	if n < 3 {
		return false
	}
	if !consonant(w, n-1) || consonant(w, n-2) || !consonant(w, n-3) {
		return false
	}
	last := w[n-1]
	return last != 'w' && last != 'x' && last != 'y'
}