package tokenizer

import (
	"fmt"
	"reflect"
	"testing"
)
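
// TestTokenizeBasic is a table-driven check that Terms lower-cases input,
// strips punctuation and stopwords, and keeps Unicode letters intact.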
func TestTokenizeBasic(t *testing.T) {
	t.Parallel()
	cases := []struct {
		name string
		in   string
		want []string
	}{
		{"empty", "", nil},
		{"simple", "hello world", []string{"hello", "world"}},
		{"with_stopwords", "the quick brown fox", []string{"quick", "brown", "fox"}},
		{"punctuation", "cats, dogs; birds!", []string{"cats", "dogs", "birds"}},
		{"mixed_case", "Hello World", []string{"hello", "world"}},
		{"digits", "go 1.21 release", []string{"go", "21", "release"}},
		{"unicode_letters", "café résumé", []string{"café", "résumé"}},
	}
	e := DefaultEnglish()
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			got := e.Terms(tc.in)
			// Treat nil and an empty slice as equal, so the "empty" case
			// passes whether Terms returns nil or []string{}.
			if len(got) == 0 && len(tc.want) == 0 {
				return
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("Terms(%q) = %v, want %v", tc.in, got, tc.want)
			}
		})
	}
}
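
// ExampleDefaultEnglish is a minimal usage sketch; the expected output mirrors
// the "mixed_case" table case above and assumes Terms returns a []string.
func ExampleDefaultEnglish() {
	e := DefaultEnglish()
	fmt.Println(e.Terms("Hello World"))
	// Output: [hello world]
}

// TestTokenizeOffsets verifies that Tokenize records each token's offset in
// the original input string.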
func TestTokenizeOffsets(t *testing.T) {
	t.Parallel()
	e := DefaultEnglish()
	toks := e.Tokenize("hello, world")
	if len(toks) != 2 {
		t.Fatalf("expected 2 tokens, got %d", len(toks))
	}
	if toks[0].Offset != 0 {
		t.Errorf("first offset = %d, want 0", toks[0].Offset)
	}
	// "world" begins at index 7 of "hello, world".
	if toks[1].Offset != 7 {
		t.Errorf("second offset = %d, want 7", toks[1].Offset)
	}
}
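
// TestIsStopword spot-checks membership in the default stopword set,
// including a mixed-case lookup.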
func TestIsStopword(t *testing.T) {
	t.Parallel()
	cases := map[string]bool{
		"the":    true,
		"AND":    true, // the lookup is expected to be case-insensitive
		"house":  false,
		"python": false,
	}
	for w, want := range cases {
		w, want := w, want
		t.Run(w, func(t *testing.T) {
			t.Parallel()
			if got := IsStopword(w); got != want {
				t.Errorf("IsStopword(%q) = %v, want %v", w, got, want)
			}
		})
	}
}
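
// TestAddExtendsStopwords checks that Add registers a new word and that Size
// grows by one. Cleanup reaches into the package-internal stopwords map,
// which this white-box test (same package) can access directly.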
func TestAddExtendsStopwords(t *testing.T) {
	// Deliberately not parallel: Add mutates the package-level stopword set,
	// which the parallel tests above read concurrently.
	before := Size()
	Add("xyzzy")
	t.Cleanup(func() { delete(stopwords, "xyzzy") })
	if !IsStopword("xyzzy") {
		t.Error("Add did not register the word")
	}
	if got := Size(); got != before+1 {
		t.Errorf("Size() = %d after Add, want %d", got, before+1)
	}
}
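
// BenchmarkTerms is a rough throughput sketch; the sample sentence is
// arbitrary, and it assumes DefaultEnglish is cheap enough to construct once
// outside the loop.
func BenchmarkTerms(b *testing.B) {
	e := DefaultEnglish()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		e.Terms("the quick brown fox jumps over the lazy dog")
	}
}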