// internal/pipeline/stage/hash.go

package stage

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"sort"
	"strings"

	"mercemay.top/src/tilstream/internal/pipeline"
)

// Hash computes a deterministic content hash for each post and stores it
// on Post.Hash. The hash is the hex-encoded SHA-256 of the byte stream
// assembled by ContentHash:
//
//	raw markdown body || 0x00 || "key=value\n" per front-matter key, keys sorted
//
// I use it as a cache key for the content-addressed output directory; when
// a post's hash doesn't change between builds, the write stage can skip
// it entirely.
type Hash struct {
	// Short, when true, makes Run store only the first 12 hex characters
	// of the hash instead of the full 64-character digest.
	Short bool
}

// Name reports the identifier of this stage, which is always "hash".
func (*Hash) Name() string {
	return "hash"
}

// Run walks st.Posts and assigns each post's Hash from ContentHash of its
// Raw bytes and Meta map. When h.Short is set, only the first 12 hex
// characters are stored. Cancellation is checked before every post; if ctx
// is already done, Run stops and returns ctx.Err(), leaving later posts
// untouched.
func (h *Hash) Run(ctx context.Context, st *pipeline.State) error {
	for idx := range st.Posts {
		// Bail out between posts so a cancelled build stops promptly.
		if err := ctx.Err(); err != nil {
			return err
		}

		digest := ContentHash(st.Posts[idx].Raw, st.Posts[idx].Meta)
		if h.Short {
			st.Posts[idx].Hash = digest[:12]
			continue
		}
		st.Posts[idx].Hash = digest
	}
	return nil
}

// ContentHash returns the lowercase hex SHA-256 digest that identifies a
// post's content. The hashed byte stream is raw, followed by a single 0x00
// separator, followed by one "key=value\n" record per meta entry with the
// keys in sorted order, so the result does not depend on map iteration
// order. It is exported so other callers (e.g. the dev server cache) can
// derive the same key.
//
// NOTE(review): keys and values are written unescaped, so a value that
// contains "\n" can yield the same stream as two separate entries.
func ContentHash(raw []byte, meta map[string]string) string {
	// Collect and order the keys first; the hash must be stable across runs.
	names := make([]string, 0, len(meta))
	for name := range meta {
		names = append(names, name)
	}
	sort.Strings(names)

	hasher := sha256.New()
	hasher.Write(raw)
	hasher.Write([]byte{0}) // separates the body from the metadata records
	for _, name := range names {
		hasher.Write([]byte(name + "=" + meta[name] + "\n"))
	}
	return hex.EncodeToString(hasher.Sum(nil))
}

// ShortHash is the abbreviated form of ContentHash: it computes the full
// hex digest for raw and meta and keeps only its first 12 characters.
func ShortHash(raw []byte, meta map[string]string) string {
	full := ContentHash(raw, meta)
	return full[:12]
}

// Verify recomputes the content hash of every post in st and compares it
// with the hash already stored on the post. Posts with an empty stored hash
// are skipped. A stored hash of exactly 12 characters is treated as a short
// hash and compared against the first 12 characters of the recomputed one.
// All mismatches are collected and reported together in a single error; nil
// is returned when every stored hash matches. Used by the doctor command.
func Verify(st *pipeline.State) error {
	var bad []string
	for _, post := range st.Posts {
		if post.Hash == "" {
			// Nothing was stored for this post, so there is nothing to check.
			continue
		}

		expected := ContentHash(post.Raw, post.Meta)
		if len(post.Hash) == 12 {
			expected = expected[:12]
		}
		if post.Hash == expected {
			continue
		}
		bad = append(bad, fmt.Sprintf("%s: stored=%s got=%s", post.Path, post.Hash, expected))
	}

	if len(bad) == 0 {
		return nil
	}
	return fmt.Errorf("hash mismatch:\n  %s", strings.Join(bad, "\n  "))
}

// Manifest renders one "path hash" line per post in st, sorts the lines
// lexicographically, and returns them separated by newlines with a single
// trailing newline, suitable for writing to an on-disk manifest file. With
// no posts the result is just "\n".
func Manifest(st *pipeline.State) string {
	lines := make([]string, 0, len(st.Posts))
	for _, post := range st.Posts {
		line := fmt.Sprintf("%s %s", post.Path, post.Hash)
		lines = append(lines, line)
	}
	sort.Strings(lines)

	var out strings.Builder
	for i, line := range lines {
		if i > 0 {
			out.WriteByte('\n')
		}
		out.WriteString(line)
	}
	out.WriteByte('\n') // trailing newline, present even for an empty list
	return out.String()
}