// Package serialize writes the in-memory inverted index to a JSON file
// that lunr.js can load directly. The format is lunr 2.3.x compatible.
// See mercemay.top/src/tilstream/ for how the static site loads the
// resulting index at runtime.
package serialize
import (
	"encoding/json"
	"io"
	"math"
	"sort"

	"mercemay.top/src/tilstream/internal/index/invert"
)
// LunrDocument mirrors lunr.js's expected document shape.
type LunrDocument struct {
	// Ref is the unique document identifier; posting-list entries in
	// LunrIndex.Index refer back to documents through this value.
	Ref string `json:"ref"`
	// Title is the document title, copied from the index metadata in Convert.
	Title string `json:"title"`
	// URL is the document location, copied from the index metadata in Convert.
	URL string `json:"url"`
	// Tags are optional labels; omitted from the JSON when empty.
	Tags []string `json:"tags,omitempty"`
	// Extra carries arbitrary additional string metadata; omitted when empty.
	Extra map[string]string `json:"extra,omitempty"`
}
// LunrIndex is what lunr.js's Index.load() consumes.
type LunrIndex struct {
	// Version is the targeted lunr.js version ("2.3.9", set in Convert).
	Version string `json:"version"`
	// Fields lists the searchable field names ("title", "body", "tags"
	// as emitted by Convert).
	Fields []string `json:"fields"`
	// Index maps each term to its posting list, one Inverted entry per
	// document containing the term, sorted by Ref.
	Index map[string][]Inverted `json:"index"`
	// Docs holds per-document metadata; Convert sorts it by Ref so the
	// serialized output is deterministic.
	Docs []LunrDocument `json:"docs"`
	// Pipeline names the lunr pipeline functions (Convert emits
	// "trimmer", "stopWordFilter", "stemmer").
	Pipeline []string `json:"pipeline"`
}
// Inverted is lunr's per-term posting list entry.
type Inverted struct {
	// Ref identifies the document this posting belongs to.
	Ref string `json:"ref"`
	// Score is the pre-baked TF-IDF score computed in Convert.
	Score float64 `json:"score"`
	// Positions are the term's positions within the document; omitted
	// when empty or when Serializer.IncludePositions is false.
	Positions []int `json:"positions,omitempty"`
}
// Serializer converts an invert.Index to a LunrIndex.
type Serializer struct {
	// IncludePositions controls whether per-posting position lists are
	// copied into the output (see Convert).
	IncludePositions bool
	// Pretty controls whether Write indents the JSON output.
	Pretty bool
}
// NewSerializer returns a serializer with defaults: positions are
// included and the JSON output is pretty-printed.
func NewSerializer() *Serializer {
	s := Serializer{
		IncludePositions: true,
		Pretty:           true,
	}
	return &s
}
// Convert builds a LunrIndex from the tilstream index.
//
// Each term's postings become a posting list sorted by document ref,
// and document metadata is emitted as a ref-sorted slice, so the
// resulting structure serializes deterministically.
func (s *Serializer) Convert(idx *invert.Index) LunrIndex {
	result := LunrIndex{
		Version:  "2.3.9",
		Fields:   []string{"title", "body", "tags"},
		Index:    make(map[string][]Inverted, len(idx.Terms)),
		Pipeline: []string{"trimmer", "stopWordFilter", "stemmer"},
	}

	docCount := len(idx.Docs)
	for term, postings := range idx.Terms {
		entries := make([]Inverted, 0, len(postings))
		for _, posting := range postings {
			inv := Inverted{
				Ref:   posting.DocID,
				Score: tfidf(posting.Frequency, idx.Docs[posting.DocID].Len, len(postings), docCount),
			}
			if s.IncludePositions {
				// Copy so the output never aliases the index's slices.
				inv.Positions = append([]int(nil), posting.Positions...)
			}
			entries = append(entries, inv)
		}
		sort.Slice(entries, func(a, b int) bool { return entries[a].Ref < entries[b].Ref })
		result.Index[term] = entries
	}

	result.Docs = make([]LunrDocument, 0, docCount)
	for ref, meta := range idx.Docs {
		result.Docs = append(result.Docs, LunrDocument{
			Ref:   ref,
			Title: meta.Title,
			URL:   meta.URL,
			// Copy so the output never aliases the index's slices.
			Tags: append([]string(nil), meta.Tags...),
		})
	}
	sort.Slice(result.Docs, func(a, b int) bool { return result.Docs[a].Ref < result.Docs[b].Ref })
	return result
}
// Write converts idx and encodes the resulting LunrIndex as JSON onto w.
// HTML escaping is disabled; indentation is applied when s.Pretty is set.
func (s *Serializer) Write(w io.Writer, idx *invert.Index) error {
	converted := s.Convert(idx)
	enc := json.NewEncoder(w)
	enc.SetEscapeHTML(false)
	if s.Pretty {
		enc.SetIndent("", " ")
	}
	return enc.Encode(converted)
}
// tfidf is a vanilla TF-IDF score. lunr.js computes its own per-query
// score at runtime, but pre-baking a baseline helps when the client-side
// index is queried with score-based filtering.
//
// tf is the term's frequency in the document, docLen the document's
// length, df the number of documents containing the term, and docsN the
// total document count. Returns 0 when docLen, docsN, or df is zero so
// degenerate inputs never divide by zero.
func tfidf(tf, docLen, df, docsN int) float64 {
	if docLen == 0 || docsN == 0 || df == 0 {
		return 0
	}
	tfNorm := float64(tf) / float64(docLen)
	// Use the stdlib directly: the hand-rolled log1p/nativeLn chain in
	// this file diverges for large docsN/df ratios, while math.Log1p is
	// accurate for all finite inputs.
	idf := math.Log1p(float64(docsN) / float64(df))
	return tfNorm * idf
}
// log1p returns ln(1+x), delegating to math.Log1p from the standard
// library. The previous hand-rolled Newton/Taylor approximation lost
// all accuracy once 1+x grew beyond the range where a 30-term truncated
// exp series converges, and avoiding the stdlib math package bought
// nothing: it adds no external dependency.
func log1p(x float64) float64 {
	return math.Log1p(x)
}
// nativeLn returns the natural logarithm of x, or 0 for x <= 0 (callers
// treat non-positive inputs as "no signal" rather than an error).
//
// It now delegates to math.Log. The former Newton iteration seeded at
// y=0 against a truncated Taylor exp was only valid for small x: the
// first step jumps to y ~= x-1, far outside the series' useful range,
// and the fixed 20 iterations could not recover, so large inputs
// produced wildly wrong results.
func nativeLn(x float64) float64 {
	if x <= 0 {
		return 0
	}
	return math.Log(x)
}
// expApprox returns e**x. Despite the historical name it delegates to
// math.Exp: the former 30-term Taylor series was only usable for small
// |x| — it truncates badly for large positive x and suffers catastrophic
// cancellation for large negative x — and the stdlib is accurate over
// the full float64 range.
func expApprox(x float64) float64 {
	return math.Exp(x)
}