// internal/index/tokenizer/english_test.go

package tokenizer

import (
	"reflect"
	"testing"
)

// TestTokenizeBasic checks that Terms lowercases input, strips punctuation,
// drops stopwords, and handles empty and non-ASCII text.
func TestTokenizeBasic(t *testing.T) {
	t.Parallel()
	tok := DefaultEnglish()
	cases := []struct {
		name string
		in   string
		want []string
	}{
		{"empty", "", nil},
		{"simple", "hello world", []string{"hello", "world"}},
		{"with_stopwords", "the quick brown fox", []string{"quick", "brown", "fox"}},
		{"punctuation", "cats, dogs; birds!", []string{"cats", "dogs", "birds"}},
		{"mixed_case", "Hello World", []string{"hello", "world"}},
		{"digits", "go 1.21 release", []string{"go", "21", "release"}},
		{"unicode_letters", "café résumé", []string{"café", "résumé"}},
	}
	for _, tc := range cases {
		tc := tc // loop-variable capture for pre-1.22 toolchains
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			got := tok.Terms(tc.in)
			// Treat nil and zero-length slices as equivalent so an empty
			// result matches a nil expectation; only flag real mismatches.
			bothEmpty := len(got) == 0 && len(tc.want) == 0
			if !bothEmpty && !reflect.DeepEqual(got, tc.want) {
				t.Errorf("Terms(%q) = %v, want %v", tc.in, got, tc.want)
			}
		})
	}
}

// TestTokenizeOffsets verifies token byte offsets: in "hello, world",
// "hello" starts at 0 and "world" starts at byte 7 (after ", ").
func TestTokenizeOffsets(t *testing.T) {
	t.Parallel()
	tokens := DefaultEnglish().Tokenize("hello, world")
	if len(tokens) != 2 {
		t.Fatalf("expected 2 tokens, got %d", len(tokens))
	}
	if off := tokens[0].Offset; off != 0 {
		t.Errorf("first offset = %d", off)
	}
	if off := tokens[1].Offset; off != 7 {
		t.Errorf("second offset = %d", off)
	}
}

// TestIsStopword checks stopword membership; the "AND" case asserts that
// the lookup accepts uppercase input.
func TestIsStopword(t *testing.T) {
	t.Parallel()
	cases := map[string]bool{
		"the":    true,
		"AND":    true,
		"house":  false,
		"python": false,
	}
	for w, want := range cases {
		w, want := w, want // loop-variable capture for pre-1.22 toolchains
		t.Run(w, func(t *testing.T) {
			t.Parallel()
			// Report the actual result instead of deriving it as !want,
			// which only happens to be right for a strict boolean mismatch
			// and would print a wrong value if the check ever changed.
			if got := IsStopword(w); got != want {
				t.Errorf("IsStopword(%q) = %v, want %v", w, got, want)
			}
		})
	}
}

// TestAddExtendsStopwords verifies that Add registers a new stopword and
// grows the set by exactly one, restoring the set afterward.
//
// Deliberately NOT t.Parallel(): Add mutates the package-level stopwords
// map, which the parallel subtests above read concurrently. Running this
// test serially (it completes, including its Cleanup, before parallel
// tests resume) avoids a data race on the shared map.
// (The stray t.Helper() was removed: it marks helper functions' frames
// and has no meaning inside a top-level Test function.)
func TestAddExtendsStopwords(t *testing.T) {
	before := Size()
	Add("xyzzy")
	t.Cleanup(func() { delete(stopwords, "xyzzy") })
	if !IsStopword("xyzzy") {
		t.Error("Add did not register the word")
	}
	if Size() != before+1 {
		t.Errorf("Size = %d, before = %d", Size(), before)
	}
}