// internal/index/tokenizer/english.go

// Package tokenizer splits input text into tokens that the inverted index
// can store. This file holds the English-language implementation with a
// short stopword list. Tokenization is intentionally plain: unicode word
// boundaries, lowercase, drop punctuation and stopwords. Anything fancier
// goes in a separate file behind a language tag.
package tokenizer

import (
	"strings"
	"unicode"
)

// Token is a single lexical unit, carrying its lowercased form and the
// byte offset within the original source. Offsets are useful for
// highlighting search hits.
// Token is a single lexical unit, carrying its lowercased form and the
// byte offset within the original source. Offsets are useful for
// highlighting search hits.
type Token struct {
	Term   string // lowercased token text
	Offset int    // byte offset of the token's first byte in the source string
}

// English tokenizes English text.
// English tokenizes English text.
type English struct {
	KeepStopwords bool // when true, Tokenize does not filter stopwords
	MinLen        int  // minimum token length in bytes; values < 1 are treated as 1
}

// DefaultEnglish returns a tokenizer with the conservative defaults used
// by the build pipeline: stopwords are dropped and single-byte tokens are
// discarded (MinLen = 2).
func DefaultEnglish() *English {
	var e English
	e.MinLen = 2
	return &e
}

// Tokenize splits src into tokens.
// Tokenize splits src into tokens.
//
// A token is a maximal run of Unicode letters and digits. Each run is
// lowercased, filtered against MinLen and (unless KeepStopwords is set)
// the stopword list, and reported with the byte offset at which it began
// in src. Note that the length filter measures bytes, not runes — for the
// ASCII-dominated English corpus this is the same thing.
func (e *English) Tokenize(src string) []Token {
	minLen := e.MinLen
	if minLen < 1 {
		minLen = 1
	}

	var out []Token
	// emit lowercases src[from:to] and appends it if it survives the
	// length and stopword filters.
	emit := func(from, to int) {
		word := strings.ToLower(src[from:to])
		if len(word) < minLen {
			return
		}
		if !e.KeepStopwords && IsStopword(word) {
			return
		}
		out = append(out, Token{Term: word, Offset: from})
	}

	wordStart := -1 // byte index of the current word, or -1 when between words
	for idx, r := range src {
		switch {
		case unicode.IsLetter(r) || unicode.IsDigit(r):
			if wordStart < 0 {
				wordStart = idx
			}
		case wordStart >= 0:
			emit(wordStart, idx)
			wordStart = -1
		}
	}
	// Flush a word that runs to the end of the input.
	if wordStart >= 0 {
		emit(wordStart, len(src))
	}
	return out
}

// Terms returns just the term strings, for callers that don't care about
// offsets (e.g. index builders).
// Terms returns just the term strings, for callers that don't care about
// offsets (e.g. index builders).
func (e *English) Terms(src string) []string {
	tokens := e.Tokenize(src)
	terms := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		terms = append(terms, tok.Term)
	}
	return terms
}

// Stopwords is the short list of English function words we drop by default.
// Keeping this small on purpose; aggressive filtering hurts recall on a
// TIL-sized corpus.
// Stopwords is the short list of English function words we drop by default.
// Keeping this small on purpose; aggressive filtering hurts recall on a
// TIL-sized corpus.
//
// Invariant: every key is lowercase. Add preserves this by lowercasing
// before insertion, and IsStopword lowercases before lookup.
var stopwords = map[string]struct{}{
	"a": {}, "an": {}, "and": {}, "are": {}, "as": {}, "at": {},
	"be": {}, "by": {}, "for": {}, "from": {}, "has": {}, "he": {},
	"in": {}, "is": {}, "it": {}, "its": {}, "of": {}, "on": {},
	"that": {}, "the": {}, "to": {}, "was": {}, "were": {}, "will": {},
	"with": {}, "i": {}, "you": {}, "we": {}, "they": {}, "this": {},
	"or": {}, "but": {}, "not": {}, "so": {}, "if": {}, "do": {},
	"does": {}, "did": {}, "can": {}, "could": {}, "should": {}, "would": {},
}

// IsStopword reports whether term is in the curated English stopword set.
// IsStopword reports whether term is in the curated English stopword set.
// The comparison is case-insensitive.
func IsStopword(term string) bool {
	key := strings.ToLower(term)
	_, found := stopwords[key]
	return found
}

// Add lets a caller extend the stopword set at runtime. Intended for tests
// and site configuration.
func Add(words ...string) {
	for _, w := range words {
		stopwords[strings.ToLower(w)] = struct{}{}
	}
}

// Size returns the current stopword count.
// Size returns the current stopword count.
func Size() int {
	return len(stopwords)
}