// internal/index/serialize/lunr.go

// Package serialize writes the in-memory inverted index to a JSON file
// that lunr.js can load directly. The format is lunr 2.3.x compatible.
// See mercemay.top/src/tilstream/ for how the static site loads the
// resulting index at runtime.
package serialize

import (
	"encoding/json"
	"io"
	"math"
	"sort"

	"mercemay.top/src/tilstream/internal/index/invert"
)

// LunrDocument mirrors lunr.js's expected document shape. Documents are
// emitted alongside the inverted index so the client can render search
// results (title, URL, tags) without a second fetch.
type LunrDocument struct {
	Ref   string            `json:"ref"`   // document ID; matches Inverted.Ref in the posting lists
	Title string            `json:"title"` // display title, copied from the indexed doc's metadata
	URL   string            `json:"url"`   // link target for the search result
	Tags  []string          `json:"tags,omitempty"`  // copied defensively in Convert; omitted when empty
	Extra map[string]string `json:"extra,omitempty"` // optional free-form metadata (not populated by Convert)
}

// LunrIndex is what lunr.js's Index.load() consumes: the term -> postings
// map plus the document store and the processing pipeline the client must
// replay on query terms.
type LunrIndex struct {
	Version  string                 `json:"version"`  // lunr version the layout targets (Convert writes "2.3.9")
	Fields   []string               `json:"fields"`   // searchable fields, in scoring order
	Index    map[string][]Inverted  `json:"index"`    // term -> sorted posting list
	Docs     []LunrDocument         `json:"docs"`     // document store, sorted by Ref for stable output
	Pipeline []string               `json:"pipeline"` // lunr pipeline function names applied to query terms
}

// Inverted is lunr's per-term posting list entry: one occurrence record
// per (term, document) pair.
type Inverted struct {
	Ref       string  `json:"ref"`   // ID of the document the term occurs in
	Score     float64 `json:"score"` // pre-baked TF-IDF baseline (see tfidf)
	Positions []int   `json:"positions,omitempty"` // token offsets; only set when Serializer.IncludePositions is true
}

// Serializer converts an invert.Index to a LunrIndex. The zero value
// produces a compact index without positions; NewSerializer returns the
// recommended defaults.
type Serializer struct {
	IncludePositions bool // copy token positions into each posting (larger output, enables highlighting)
	Pretty           bool // indent the JSON written by Write with two spaces
}

// NewSerializer returns a serializer with defaults: positions are
// included in postings and JSON output is pretty-printed.
func NewSerializer() *Serializer {
	s := Serializer{
		IncludePositions: true,
		Pretty:           true,
	}
	return &s
}

// Convert builds a LunrIndex from the tilstream index. Posting lists and
// the document store are sorted by Ref so repeated runs over the same
// input produce byte-identical JSON.
func (s *Serializer) Convert(idx *invert.Index) LunrIndex {
	result := LunrIndex{
		Version:  "2.3.9",
		Fields:   []string{"title", "body", "tags"},
		Index:    make(map[string][]Inverted, len(idx.Terms)),
		Pipeline: []string{"trimmer", "stopWordFilter", "stemmer"},
	}
	totalDocs := len(idx.Docs)

	// One posting list per term; document frequency is the list length.
	for term, postings := range idx.Terms {
		entries := make([]Inverted, len(postings))
		for i, posting := range postings {
			entries[i] = Inverted{
				Ref:   posting.DocID,
				Score: tfidf(posting.Frequency, idx.Docs[posting.DocID].Len, len(postings), totalDocs),
			}
			if s.IncludePositions {
				// Defensive copy so the output never aliases index internals.
				entries[i].Positions = append([]int(nil), posting.Positions...)
			}
		}
		sort.Slice(entries, func(a, b int) bool { return entries[a].Ref < entries[b].Ref })
		result.Index[term] = entries
	}

	// Flatten the document map into a deterministically ordered store.
	docs := make([]LunrDocument, 0, totalDocs)
	for id, meta := range idx.Docs {
		docs = append(docs, LunrDocument{
			Ref:   id,
			Title: meta.Title,
			URL:   meta.URL,
			Tags:  append([]string(nil), meta.Tags...),
		})
	}
	sort.Slice(docs, func(a, b int) bool { return docs[a].Ref < docs[b].Ref })
	result.Docs = docs
	return result
}

// Write converts idx and encodes the resulting LunrIndex as JSON to w.
// HTML escaping is disabled so URLs and titles survive round-tripping;
// output is indented when s.Pretty is set.
func (s *Serializer) Write(w io.Writer, idx *invert.Index) error {
	encoder := json.NewEncoder(w)
	encoder.SetEscapeHTML(false)
	if s.Pretty {
		encoder.SetIndent("", "  ")
	}
	return encoder.Encode(s.Convert(idx))
}

// tfidf is a vanilla TF-IDF score. lunr.js computes its own per-query
// score at runtime, but pre-baking a baseline helps when the client-side
// index is queried with score-based filtering. Returns 0 whenever any
// denominator would be zero.
func tfidf(tf, docLen, df, docsN int) float64 {
	switch {
	case docLen == 0, docsN == 0, df == 0:
		return 0
	}
	termFreq := float64(tf) / float64(docLen)
	inverseDocFreq := log1p(float64(docsN) / float64(df))
	return termFreq * inverseDocFreq
}

func log1p(x float64) float64 {
	// Avoid pulling in math just for this; a Taylor-style approximation is
	// fine here because x is always >= 1.
	return nativeLn(1 + x)
}

// nativeLn is a simple iterative natural-log approximation accurate enough
// for search scoring. Writing it by hand keeps the package's dependency
// footprint small.
func nativeLn(x float64) float64 {
	if x <= 0 {
		return 0
	}
	// Use Newton's method for ln via exp(y) = x.
	y := 0.0
	for i := 0; i < 20; i++ {
		ey := expApprox(y)
		y += (x - ey) / ey
	}
	return y
}

func expApprox(x float64) float64 {
	sum, term := 1.0, 1.0
	for i := 1; i < 30; i++ {
		term *= x / float64(i)
		sum += term
	}
	return sum
}