// Package invert builds an in-memory inverted index from tokenized posts.
// The index is small enough for the whole TIL corpus to fit comfortably
// in a browser's memory after serialization, so I don't bother with block
// compression here. See mercemay.top/src/tilstream/ for the bigger
// picture of how the search pipeline works.
package invert
import (
"sort"
"sync"
"mercemay.top/src/tilstream/internal/index/stemmer"
"mercemay.top/src/tilstream/internal/index/tokenizer"
)
// Doc is a post waiting to be indexed.
//
// Title and Body are concatenated and tokenized together; Tags are
// indexed as additional terms. Title, URL, and Tags are also carried
// into the per-document metadata (DocMeta).
type Doc struct {
	ID    string   // unique document key used in postings and metadata maps
	Title string   // indexed with Body and stored in DocMeta
	Body  string   // indexed but not stored
	Tags  []string // indexed as extra tokens and stored (copied) in DocMeta
	URL   string   // not indexed; stored in DocMeta for result display
}
// Posting is a single term->document entry with position info.
type Posting struct {
	DocID     string // ID of the document containing the term
	Frequency int    // occurrence count of the term within the document
	Positions []int  // token offsets of each occurrence (title+body tokens, then tags)
}
// Index is the final immutable result.
//
// Terms maps an indexed term (stemmed when the builder had stemming
// enabled) to its postings list, sorted by DocID. Docs maps document
// ID to its display metadata.
type Index struct {
	Terms map[string][]Posting
	Docs  map[string]DocMeta
}
// DocMeta holds per-document metadata not stored in the postings.
type DocMeta struct {
	Title string
	URL   string
	Tags  []string // defensive copy of Doc.Tags made at Add time
	Len   int      // total token count: title+body tokens plus one per tag
}
// Builder accumulates postings and returns a finalized Index.
//
// mu guards docs and postings; Add and Build take it. NOTE(review):
// DisableStemming writes stem without the lock — fine if configuration
// happens before any Add, racy otherwise; confirm intended usage.
type Builder struct {
	tokenizer *tokenizer.English
	stem      bool // apply the Porter pass to content tokens
	mu        sync.Mutex
	docs      map[string]DocMeta
	postings  map[string]map[string]*Posting // term -> docID -> posting
}
// NewBuilder returns a Builder with the default English tokenizer and
// Porter stemming enabled.
func NewBuilder() *Builder {
	b := &Builder{
		tokenizer: tokenizer.DefaultEnglish(),
		stem:      true,
	}
	b.docs = make(map[string]DocMeta)
	b.postings = make(map[string]map[string]*Posting)
	return b
}
// DisableStemming skips the Porter pass. Used in tests where deterministic
// token output is easier to reason about.
//
// The write is taken under the builder's mutex for consistency with Add
// and Build, so toggling concurrently with indexing is not a data race.
// It returns the receiver to allow chaining: NewBuilder().DisableStemming().
func (b *Builder) DisableStemming() *Builder {
	b.mu.Lock()
	b.stem = false
	b.mu.Unlock()
	return b
}
// Add indexes a single document.
//
// The title and body are concatenated and tokenized; tags are appended
// as extra tokens so they are searchable like any other term. The Porter
// stemmer is applied to both content tokens and tags unless stemming has
// been disabled via DisableStemming.
func (b *Builder) Add(d Doc) {
	b.mu.Lock()
	defer b.mu.Unlock()

	tokens := b.tokenizer.Tokenize(d.Title + " " + d.Body)
	if b.stem {
		for i, t := range tokens {
			tokens[i].Term = stemmer.Stem(t.Term)
		}
	}
	for _, tag := range d.Tags {
		term := tag
		// Fix: tags were previously stemmed unconditionally, so an index
		// built with DisableStemming held stemmed tag terms next to raw
		// body terms — tags whose stem differs from their surface form
		// could never be matched. Stem tags only when stemming is on.
		if b.stem {
			term = stemmer.Stem(tag)
		}
		tokens = append(tokens, tokenizer.Token{Term: term})
	}

	// Record metadata; Tags is copied so later mutation of d.Tags by the
	// caller cannot leak into the index.
	b.docs[d.ID] = DocMeta{
		Title: d.Title,
		URL:   d.URL,
		Tags:  append([]string(nil), d.Tags...),
		Len:   len(tokens),
	}

	// Merge every token into the term -> docID -> posting map.
	for pos, tok := range tokens {
		byDoc, ok := b.postings[tok.Term]
		if !ok {
			byDoc = make(map[string]*Posting)
			b.postings[tok.Term] = byDoc
		}
		p, ok := byDoc[d.ID]
		if !ok {
			p = &Posting{DocID: d.ID}
			byDoc[d.ID] = p
		}
		p.Frequency++
		p.Positions = append(p.Positions, pos)
	}
}
// Build finalizes the index and returns an immutable snapshot.
//
// Postings lists are sorted by DocID so lookups are deterministic. Each
// Positions slice is cloned: the previous shallow copy (*p) shared the
// builder's backing arrays, so the "immutable" snapshot could alias
// state that a caller (or a subsequent Add on the builder) mutates.
func (b *Builder) Build() *Index {
	b.mu.Lock()
	defer b.mu.Unlock()

	idx := &Index{
		Terms: make(map[string][]Posting, len(b.postings)),
		Docs:  make(map[string]DocMeta, len(b.docs)),
	}
	for term, byDoc := range b.postings {
		list := make([]Posting, 0, len(byDoc))
		for _, p := range byDoc {
			cp := *p
			cp.Positions = append([]int(nil), p.Positions...)
			list = append(list, cp)
		}
		sort.Slice(list, func(i, j int) bool { return list[i].DocID < list[j].DocID })
		idx.Terms[term] = list
	}
	for id, meta := range b.docs {
		idx.Docs[id] = meta
	}
	return idx
}
// Search returns docIDs matching any of the given terms, most frequent
// first. Query terms are stemmed before lookup to match how the index
// was built (NOTE(review): assumes the index was built with stemming
// enabled — an index built via DisableStemming will miss terms whose
// stem differs; the Index carries no flag to detect this).
//
// Ties in total frequency are broken by ascending DocID. Previously the
// slice was filled from random map iteration order and sorted only by
// frequency, so equally-ranked documents came back in a different order
// on every call.
func (idx *Index) Search(terms []string) []string {
	hits := make(map[string]int)
	for _, term := range terms {
		for _, p := range idx.Terms[stemmer.Stem(term)] {
			hits[p.DocID] += p.Frequency
		}
	}
	out := make([]string, 0, len(hits))
	for id := range hits {
		out = append(out, id)
	}
	sort.Slice(out, func(i, j int) bool {
		if hits[out[i]] != hits[out[j]] {
			return hits[out[i]] > hits[out[j]]
		}
		return out[i] < out[j]
	})
	return out
}