// Package tokenizer splits input text into tokens that the inverted index
// can store. This file holds the English-language implementation with a
// short stopword list. Tokenization is intentionally plain: unicode word
// boundaries, lowercase, drop punctuation and stopwords. Anything fancier
// goes in a separate file behind a language tag.
package tokenizer
import (
	"strings"
	"unicode"
	"unicode/utf8"
)
// Token is a single lexical unit, carrying its lowercased form and the
// byte offset within the original source. Offsets are useful for
// highlighting search hits.
type Token struct {
Term string // lowercased term text as produced by Tokenize
Offset int // byte offset of the term's first byte within the original src
}
// English tokenizes English text. The zero value is usable: Tokenize
// treats a MinLen below 1 as 1, and stopwords are dropped unless
// KeepStopwords is set.
type English struct {
KeepStopwords bool // keep stopwords in the output instead of filtering them
MinLen int // minimum term length; values < 1 are treated as 1 by Tokenize
}
// DefaultEnglish returns a tokenizer with the conservative defaults used
// by the build pipeline: stopwords filtered and single-character terms
// dropped (MinLen of 2).
func DefaultEnglish() *English {
	tok := &English{}
	tok.MinLen = 2
	return tok
}
// Tokenize splits src into tokens. Maximal runs of unicode letters and
// digits become candidate terms; every other rune is a separator. Each
// term is lowercased, then dropped if it is shorter than MinLen (measured
// in runes) or, unless KeepStopwords is set, if it is a stopword. Each
// returned Token carries the byte offset of the term's start within src.
func (e *English) Tokenize(src string) []Token {
	minLen := e.MinLen
	if minLen < 1 {
		minLen = 1 // treat unset/negative MinLen as "no minimum"
	}
	var tokens []Token
	// emit lowercases src[start:end] and appends it as a token unless it
	// is too short or a filtered stopword. Shared by the mid-string and
	// end-of-string flush paths below.
	emit := func(start, end int) {
		term := strings.ToLower(src[start:end])
		// Measure length in runes, not bytes: a byte count would let a
		// single multi-byte letter (e.g. "é", 2 bytes) satisfy MinLen == 2.
		if utf8.RuneCountInString(term) < minLen {
			return
		}
		if !e.KeepStopwords && IsStopword(term) {
			return
		}
		tokens = append(tokens, Token{Term: term, Offset: start})
	}
	start := -1 // byte offset where the current term began; -1 when outside a term
	for i, r := range src {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			emit(start, i)
			start = -1
		}
	}
	// Flush a term that runs to the end of src.
	if start >= 0 {
		emit(start, len(src))
	}
	return tokens
}
// Terms returns just the term strings, for callers that don't care about
// offsets (e.g. index builders).
func (e *English) Terms(src string) []string {
	tokens := e.Tokenize(src)
	terms := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		terms = append(terms, tok.Term)
	}
	return terms
}
// Stopwords is the short list of English function words we drop by default.
// Keeping this small on purpose; aggressive filtering hurts recall on a
// TIL-sized corpus.
//
// NOTE: the set is a plain map that Add mutates without locking, so
// concurrent Add and IsStopword calls are not synchronized.
var stopwords = defaultStopwords()

// defaultStopwords builds the curated default set.
func defaultStopwords() map[string]struct{} {
	words := []string{
		"a", "an", "and", "are", "as", "at",
		"be", "by", "for", "from", "has", "he",
		"in", "is", "it", "its", "of", "on",
		"that", "the", "to", "was", "were", "will",
		"with", "i", "you", "we", "they", "this",
		"or", "but", "not", "so", "if", "do",
		"does", "did", "can", "could", "should", "would",
	}
	set := make(map[string]struct{}, len(words))
	for _, w := range words {
		set[w] = struct{}{}
	}
	return set
}

// IsStopword reports whether term is in the curated English stopword set.
// The check is case-insensitive.
func IsStopword(term string) bool {
	_, found := stopwords[strings.ToLower(term)]
	return found
}

// Add lets a caller extend the stopword set at runtime. Intended for tests
// and site configuration. Words are lowercased before insertion.
func Add(words ...string) {
	for _, word := range words {
		stopwords[strings.ToLower(word)] = struct{}{}
	}
}

// Size returns the current stopword count.
func Size() int { return len(stopwords) }