// Package index builds a tiny lunr-compatible JSON document that a browser
// can feed to lunr.js. I don't store the lunr index pre-built because the
// in-browser version is ~30 KB and my note corpus is small. Drafts are
// skipped -- see commit 7c218ba.
package index
import (
"encoding/json"
"os"
"regexp"
"sort"
"strings"
"unicode"
"mercemay.top/src/tilstream/internal/render"
)
// Doc is the per-post record written into search.json. Fields stay short
// because the file is downloaded eagerly on the home page.
//
// Build fills one Doc per non-draft post; the abbreviated JSON keys in the
// struct tags are the names that appear in the emitted file.
type Doc struct {
// ID is the post's slug (Build copies it from the post's Slug); emitted as "id".
ID string `json:"id"`
// Title is the post title, copied as-is by Build; emitted as "t".
Title string `json:"t"`
// Tags are the post's tags; emitted as "k" and omitted from the JSON when empty.
Tags []string `json:"k,omitempty"`
// Body is the post body after Build has passed it through normalise; emitted as "b".
Body string `json:"b"`
}
// Build turns posts into the Docs destined for search.json.
//
// Posts flagged as drafts are left out, so unpublished slugs never reach the
// search file. For every remaining post the slug, title and tags are copied
// across and the body is run through normalise. The result is ordered by ID
// (the slug) ascending and is never nil, even when posts is empty.
func Build(posts []render.Post) []Doc {
	docs := make([]Doc, 0, len(posts))
	for i := range posts {
		post := posts[i]
		if !post.Draft {
			docs = append(docs, Doc{
				Body:  normalise(post.Body),
				Tags:  post.Tags,
				Title: post.Title,
				ID:    post.Slug,
			})
		}
	}

	// Alphabetical by slug keeps the emitted file stable between runs.
	byID := func(a, b int) bool { return docs[a].ID < docs[b].ID }
	sort.Slice(docs, byID)
	return docs
}
// WriteJSON marshals docs (normally Build's output) to dest as compact JSON
// with no indentation -- the file is meant for machines, not humans.
//
// A nil slice is written as "[]" rather than "null": encoding/json encodes a
// nil slice as the JSON null value, and a consumer that expects an array of
// documents cannot iterate null. dest is created, or truncated if it already
// exists, with permission bits 0644. Any error from marshalling or from
// writing the file is returned unchanged.
func WriteJSON(dest string, docs []Doc) error {
	if docs == nil {
		// Force an empty JSON array instead of null.
		docs = []Doc{}
	}
	data, err := json.Marshal(docs)
	if err != nil {
		return err
	}
	return os.WriteFile(dest, data, 0o644)
}
// Patterns normalise uses to strip markup from post bodies. They are compiled
// once when the package is initialised.
var (
	// reCodeFence matches a whole ```-fenced block. (?s) lets '.' cross
	// newlines, and the non-greedy body keeps neighbouring fences separate.
	reCodeFence = regexp.MustCompile("(?s)```.*?```")

	// reInlineMD matches the single-character inline markers: backtick,
	// asterisk, underscore and tilde.
	reInlineMD = regexp.MustCompile("[`*_~]")

	// reLink matches an inline link or image and captures its text (or alt
	// text) as $1. The optional leading '!' is consumed so an image does not
	// leave a stray bang in front of its alt text.
	reLink = regexp.MustCompile(`!?\[([^\]]+)\]\([^)]+\)`)

	// reHTMLTag matches HTML comments and anything shaped like a tag: '<',
	// an optional '/', '!' or '?', then an ASCII letter, up to the next '>'.
	// Requiring the letter right after the opener keeps prose such as
	// "a < b" intact; a bare "<[^>]+>" would swallow everything from that
	// '<' to the next '>' (for example a later blockquote marker), newlines
	// included. Text like "x<y ... >" with no space is still treated as a tag.
	reHTMLTag = regexp.MustCompile(`(?s)<!--.*?-->|<[/!?]?[A-Za-z][^>]*>`)

	// reSpaces matches any run of whitespace.
	reSpaces = regexp.MustCompile(`\s+`)
)
// normalise prepares body text for the index: no code fences (they bloat the
// file and are rarely useful to search), no markdown punctuation, no HTML,
// collapse whitespace, case-fold.
func normalise(src string) string {
s := reCodeFence.ReplaceAllString(src, " ")
s = reLink.ReplaceAllString(s, "$1")
s = reHTMLTag.ReplaceAllString(s, " ")
s = reInlineMD.ReplaceAllString(s, "")
s = stripQuotes(s)
s = reSpaces.ReplaceAllString(s, " ")
return strings.ToLower(strings.TrimSpace(s))
}
// stripQuotes returns s with each straight double quote, each curly double or
// single quote, and each Unicode control character replaced by one space.
// Every other rune, the ASCII apostrophe included, is passed through as-is.
func stripQuotes(s string) string {
	blank := func(r rune) rune {
		if unicode.IsControl(r) {
			return ' '
		}
		if strings.ContainsRune("\"“”‘’", r) {
			return ' '
		}
		return r
	}
	return strings.Map(blank, s)
}
// TagCounts tallies how many times each tag appears across the non-draft
// posts and returns the result keyed by tag. It feeds the tags page template
// and lives here so post-walking code stays in one package. The returned map
// is never nil; a tag listed twice on one post is counted twice.
func TagCounts(posts []render.Post) map[string]int {
	tally := make(map[string]int)
	for i := range posts {
		post := posts[i]
		if !post.Draft {
			for _, tag := range post.Tags {
				tally[tag]++
			}
		}
	}
	return tally
}
// Related picks up to n posts that share at least one tag with seed, sorted
// by overlap count desc, then date desc, then slug asc.
//
// The seed itself (matched by slug) and drafts are never returned. The overlap
// count is the number of entries in a post's Tags that also appear in
// seed.Tags, so a tag repeated within one post counts once per occurrence.
// The slug tie-break makes the order independent of how all happens to be
// ordered: without it, posts with equal overlap and equal date would come out
// in whatever order the unstable sort left them.
//
// Returns nil when n <= 0, otherwise a (possibly empty) slice of at most n
// posts.
func Related(seed render.Post, all []render.Post, n int) []render.Post {
	if n <= 0 {
		return nil
	}

	// Set of the seed's tags for O(1) membership checks.
	seedTags := make(map[string]bool, len(seed.Tags))
	for _, t := range seed.Tags {
		seedTags[t] = true
	}

	type scored struct {
		p     render.Post
		score int
	}
	var cand []scored
	for _, p := range all {
		if p.Slug == seed.Slug || p.Draft {
			continue
		}
		overlap := 0
		for _, t := range p.Tags {
			if seedTags[t] {
				overlap++
			}
		}
		if overlap > 0 {
			cand = append(cand, scored{p, overlap})
		}
	}

	sort.SliceStable(cand, func(i, j int) bool {
		a, b := cand[i], cand[j]
		if a.score != b.score {
			return a.score > b.score
		}
		// Newer first; only After is used, so equal dates fall through.
		if a.p.Date.After(b.p.Date) {
			return true
		}
		if b.p.Date.After(a.p.Date) {
			return false
		}
		return a.p.Slug < b.p.Slug
	})

	if len(cand) > n {
		cand = cand[:n]
	}
	out := make([]render.Post, len(cand))
	for i, c := range cand {
		out[i] = c.p
	}
	return out
}