internal/index/search.go

3.7 KB · 146 lines · 2024-12-28 · 7c218ba
// Package index builds a tiny lunr-compatible JSON document that a browser
// can feed to lunr.js. I don't store the lunr index pre-built because the
// in-browser version is ~30 KB and my note corpus is small. Drafts are
// skipped -- see commit 7c218ba.
package index

import (
	"encoding/json"
	"os"
	"regexp"
	"sort"
	"strings"
	"unicode"

	"mercemay.top/src/tilstream/internal/render"
)

// Doc is the per-post record written into search.json. Fields stay short
// because the file is downloaded eagerly on the home page.
//
// Build fills one Doc per non-draft post: ID from the post's slug, Title and
// Tags copied as-is, and Body from the normalised post body.
type Doc struct {
	// ID is serialised under the "id" key.
	ID    string   `json:"id"`
	// Title is serialised under the abbreviated "t" key.
	Title string   `json:"t"`
	// Tags is serialised under "k" and left out of the JSON entirely when
	// the slice is empty (omitempty).
	Tags  []string `json:"k,omitempty"`
	// Body is serialised under the abbreviated "b" key.
	Body  string   `json:"b"`
}

// Build converts the published posts into search Docs, ordered by ascending
// ID (the post slug) so the output is deterministic. Draft posts are filtered
// out at this point, which keeps unpublished slugs from leaking through
// search.json. Each Doc's body is run through normalise; title and tags are
// carried over untouched.
func Build(posts []render.Post) []Doc {
	docs := make([]Doc, 0, len(posts))
	for _, post := range posts {
		if !post.Draft {
			doc := Doc{
				ID:    post.Slug,
				Title: post.Title,
				Tags:  post.Tags,
				Body:  normalise(post.Body),
			}
			docs = append(docs, doc)
		}
	}
	byID := func(a, b int) bool {
		return docs[a].ID < docs[b].ID
	}
	sort.Slice(docs, byID)
	return docs
}

// WriteJSON marshals docs (normally Build()'s output) to dest with no
// indentation -- the file is meant for machines, not humans.
//
// A nil docs slice is written as an empty JSON array ("[]") rather than
// "null": encoding/json encodes nil slices as null, and a consumer that
// expects to iterate a list of documents would choke on that.
//
// The file is written with os.WriteFile using permission 0o644. Any marshal
// or write error is returned to the caller unchanged.
func WriteJSON(dest string, docs []Doc) error {
	if docs == nil {
		// Force "[]" instead of "null" for the no-documents case.
		docs = []Doc{}
	}
	data, err := json.Marshal(docs)
	if err != nil {
		return err
	}
	return os.WriteFile(dest, data, 0o644)
}

// Patterns used by normalise, compiled once at package initialisation.
var (
	// reCodeFence matches from a triple backtick to the next triple
	// backtick; (?s) lets "." span newlines and ".*?" keeps the match
	// non-greedy so each fenced block is matched separately.
	reCodeFence = regexp.MustCompile("(?s)```.*?```")
	// reInlineMD matches any single backtick, asterisk, underscore or tilde.
	reInlineMD  = regexp.MustCompile("[`*_~]")
	// reLink matches "[text](target)" and captures the text as group 1.
	reLink      = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`)
	// reHTMLTag matches "<", one or more characters other than ">", then ">".
	reHTMLTag   = regexp.MustCompile(`<[^>]+>`)
	// reSpaces matches a run of one or more whitespace characters.
	reSpaces    = regexp.MustCompile(`\s+`)
)

// normalise turns a post body into the plain text stored in the index. In
// order, it: replaces fenced code blocks with a space (they bloat the file
// and are rarely useful to search), reduces markdown links to their link
// text, replaces HTML tags with a space, deletes inline markdown punctuation,
// blanks out quotes and control characters via stripQuotes, collapses
// whitespace runs to one space, trims the ends and lower-cases the result.
func normalise(src string) string {
	// Fences go first so nothing inside them reaches the later passes.
	withoutCode := reCodeFence.ReplaceAllString(src, " ")
	linkText := reLink.ReplaceAllString(withoutCode, "$1")
	withoutTags := reHTMLTag.ReplaceAllString(linkText, " ")
	plain := stripQuotes(reInlineMD.ReplaceAllString(withoutTags, ""))

	collapsed := reSpaces.ReplaceAllString(plain, " ")
	collapsed = strings.TrimSpace(collapsed)
	return strings.ToLower(collapsed)
}

// stripQuotes returns s with every ASCII double quote, every curly quote
// (“ ” ‘ ’) and every control character replaced by a single space. The
// ASCII apostrophe is not in the set and passes through unchanged.
func stripQuotes(s string) string {
	const quoteRunes = "\"“”‘’"
	blank := func(r rune) rune {
		if strings.ContainsRune(quoteRunes, r) || unicode.IsControl(r) {
			return ' '
		}
		return r
	}
	return strings.Map(blank, s)
}

// TagCounts tallies how many non-draft posts carry each tag and returns the
// result as tag -> count. The tags page template consumes it; it lives in
// this package because the index already walks every post, which avoids a
// second pass elsewhere.
func TagCounts(posts []render.Post) map[string]int {
	tally := make(map[string]int)
	for i := range posts {
		if posts[i].Draft {
			continue
		}
		for _, tag := range posts[i].Tags {
			tally[tag] += 1
		}
	}
	return tally
}

// Related picks up to n posts from all that share at least one tag with
// seed. Drafts and the seed itself (matched by slug) are never returned.
//
// Results are ordered by overlap count (descending), then date (newest
// first), then slug (ascending). The slug tie-breaker makes the ordering
// total: sort.Slice is not stable, so without it posts with the same overlap
// and date would come out in an order that depends on the input order and on
// the sort implementation.
//
// It returns nil when n <= 0, and an empty slice when nothing matches.
func Related(seed render.Post, all []render.Post, n int) []render.Post {
	if n <= 0 {
		return nil
	}
	type scored struct {
		p     render.Post
		score int
	}

	// Set of the seed's tags for constant-time membership checks.
	seedTags := make(map[string]bool, len(seed.Tags))
	for _, t := range seed.Tags {
		seedTags[t] = true
	}

	var cand []scored
	for _, p := range all {
		if p.Slug == seed.Slug || p.Draft {
			continue
		}
		overlap := 0
		for _, t := range p.Tags {
			if seedTags[t] {
				overlap++
			}
		}
		if overlap > 0 {
			cand = append(cand, scored{p, overlap})
		}
	}

	sort.Slice(cand, func(i, j int) bool {
		a, b := cand[i], cand[j]
		if a.score != b.score {
			return a.score > b.score
		}
		if a.p.Date.After(b.p.Date) {
			return true
		}
		if b.p.Date.After(a.p.Date) {
			return false
		}
		// Same score and same date: fall back to the slug so the result is
		// reproducible between builds.
		return a.p.Slug < b.p.Slug
	})

	if len(cand) > n {
		cand = cand[:n]
	}
	out := make([]render.Post, len(cand))
	for i, c := range cand {
		out[i] = c.p
	}
	return out
}