// internal/index/invert/build.go

// Package invert builds an in-memory inverted index from tokenized posts.
// The index is small enough for the whole TIL corpus to fit comfortably
// in a browser's memory after serialization, so I don't bother with block
// compression here. See mercemay.top/src/tilstream/ for the bigger
// picture of how the search pipeline works.
package invert

import (
	"sort"
	"sync"

	"mercemay.top/src/tilstream/internal/index/stemmer"
	"mercemay.top/src/tilstream/internal/index/tokenizer"
)

// Doc is a post waiting to be indexed.
// Doc is a post waiting to be indexed.
type Doc struct {
	ID    string   // unique document identifier; becomes the key into Index.Docs
	Title string   // indexed together with Body (see Builder.Add)
	Body  string   // main text to tokenize
	Tags  []string // appended as extra tokens and copied into DocMeta.Tags
	URL   string   // carried through to DocMeta; not indexed
}

// Posting is a single term->document entry with position info.
// Posting is a single term->document entry with position info.
type Posting struct {
	DocID     string // document this posting belongs to
	Frequency int    // occurrence count of the term in the document
	Positions []int  // zero-based token offsets, in increasing order of appearance
}

// Index is the final immutable result.
// Index is the final immutable result.
type Index struct {
	Terms map[string][]Posting // term -> postings, sorted by DocID (see Build)
	Docs  map[string]DocMeta   // docID -> per-document metadata
}

// DocMeta holds per-document metadata not stored in the postings.
// DocMeta holds per-document metadata not stored in the postings.
type DocMeta struct {
	Title string
	URL   string
	Tags  []string // defensive copy of Doc.Tags (see Builder.Add)
	Len   int      // total token count, including the tag-derived tokens
}

// Builder accumulates postings and returns a finalized Index.
// Builder accumulates postings and returns a finalized Index.
type Builder struct {
	tokenizer *tokenizer.English
	stem      bool       // apply Porter stemming during Add; see DisableStemming
	mu        sync.Mutex // guards docs and postings; Add and Build lock it
	docs      map[string]DocMeta
	postings  map[string]map[string]*Posting // term -> docID -> posting
}

// NewBuilder returns a Builder with the default English tokenizer and
// Porter stemming enabled.
// NewBuilder returns a Builder with the default English tokenizer and
// Porter stemming enabled.
func NewBuilder() *Builder {
	b := &Builder{
		docs:     map[string]DocMeta{},
		postings: map[string]map[string]*Posting{},
	}
	b.tokenizer = tokenizer.DefaultEnglish()
	b.stem = true
	return b
}

// DisableStemming skips the Porter pass. Used in tests where deterministic
// token output is easier to reason about.
// DisableStemming skips the Porter pass. Used in tests where deterministic
// token output is easier to reason about.
//
// The write to stem is done under the builder lock: Add reads b.stem while
// holding b.mu, so an unsynchronized write here would be a data race if
// DisableStemming were ever called concurrently with Add.
func (b *Builder) DisableStemming() *Builder {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.stem = false
	return b
}

// Add indexes a single document.
// Add indexes a single document: it tokenizes Title+Body, optionally stems,
// appends one token per tag, records DocMeta, and updates the postings map
// with frequency and position for every token. Safe for concurrent use.
//
// Bug fix: tags previously went through stemmer.Stem unconditionally, even
// after DisableStemming — contradicting its "deterministic token output"
// contract. Tag stemming now respects b.stem like body tokens do.
func (b *Builder) Add(d Doc) {
	b.mu.Lock()
	defer b.mu.Unlock()
	tokens := b.tokenizer.Tokenize(d.Title + " " + d.Body)
	if b.stem {
		for i, t := range tokens {
			tokens[i].Term = stemmer.Stem(t.Term)
		}
	}
	for _, tag := range d.Tags {
		term := tag
		if b.stem {
			term = stemmer.Stem(term)
		}
		tokens = append(tokens, tokenizer.Token{Term: term})
	}
	// Tags count toward Len because they were appended to tokens above.
	b.docs[d.ID] = DocMeta{
		Title: d.Title,
		URL:   d.URL,
		Tags:  append([]string(nil), d.Tags...), // defensive copy; caller may reuse the slice
		Len:   len(tokens),
	}
	for pos, tok := range tokens {
		byDoc, ok := b.postings[tok.Term]
		if !ok {
			byDoc = make(map[string]*Posting)
			b.postings[tok.Term] = byDoc
		}
		p, ok := byDoc[d.ID]
		if !ok {
			p = &Posting{DocID: d.ID}
			byDoc[d.ID] = p
		}
		p.Frequency++
		p.Positions = append(p.Positions, pos)
	}
}

// Build finalizes the index and returns an immutable snapshot.
// Build finalizes the index and returns an immutable snapshot.
//
// Each term's nested docID map is flattened into a slice of Posting values
// (copied, so the snapshot does not alias the builder's pointers) and sorted
// by DocID for deterministic lookups. Document metadata is likewise copied
// into a fresh map so later Adds cannot mutate the snapshot.
func (b *Builder) Build() *Index {
	b.mu.Lock()
	defer b.mu.Unlock()

	terms := make(map[string][]Posting, len(b.postings))
	for term, byDoc := range b.postings {
		flat := make([]Posting, 0, len(byDoc))
		for _, posting := range byDoc {
			flat = append(flat, *posting)
		}
		sort.Slice(flat, func(a, z int) bool {
			return flat[a].DocID < flat[z].DocID
		})
		terms[term] = flat
	}

	docs := make(map[string]DocMeta, len(b.docs))
	for id, meta := range b.docs {
		docs[id] = meta
	}

	return &Index{Terms: terms, Docs: docs}
}

// Search returns docIDs matching any of the given terms.
// Search returns docIDs matching any of the given terms, ordered by total
// matched term frequency, highest first. Query terms are stemmed so they
// line up with the stemmed keys produced at build time.
//
// NOTE(review): query terms are stemmed unconditionally, so an Index built
// with DisableStemming will not match stemmed queries — confirm that path
// is test-only as intended.
//
// Bug fix: the previous code used sort.SliceStable over a slice populated
// from map iteration, whose order is random — so equal-frequency docs came
// back in a nondeterministic order (stability preserves a random base
// order). Ties are now broken by DocID for fully deterministic results.
func (idx *Index) Search(terms []string) []string {
	hits := make(map[string]int)
	for _, term := range terms {
		stemmed := stemmer.Stem(term)
		for _, p := range idx.Terms[stemmed] {
			hits[p.DocID] += p.Frequency
		}
	}
	out := make([]string, 0, len(hits))
	for id := range hits {
		out = append(out, id)
	}
	sort.Slice(out, func(i, j int) bool {
		if hits[out[i]] != hits[out[j]] {
			return hits[out[i]] > hits[out[j]]
		}
		return out[i] < out[j]
	})
	return out
}