// internal/render/markdown/html/sanitizer.go

// Package html provides a strict bluemonday-based sanitizer for HTML that
// made it through the markdown pipeline. Goldmark's raw HTML escape hatch
// is convenient, but some TIL authors paste questionable markup; the
// sanitizer is the last line of defense before HTML lands on disk.
//
// See mercemay.top/src/tilstream/ for how the site pipeline wires this in.
package html

import (
	"io"
	"net/url"
	"regexp"
	"strings"

	"github.com/microcosm-cc/bluemonday"
)

// Policy wraps a bluemonday policy with a narrow API tuned for tilstream.
type Policy struct {
	p *bluemonday.Policy
}

// Strict returns a policy that allows only common text formatting:
// headings, lists, emphasis, code, links (http/https only), images, and
// basic tables. Attribute whitelisting is conservative.
func Strict() *Policy {
	p := bluemonday.NewPolicy()

	p.AllowElements("p", "br", "hr", "em", "strong", "del", "sub", "sup",
		"ul", "ol", "li", "blockquote", "pre", "code",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"table", "thead", "tbody", "tr", "td", "th",
		"figure", "figcaption", "section", "article", "span", "div",
	)
	p.AllowAttrs("class").Globally()
	p.AllowAttrs("id").Matching(bluemonday.SpaceSeparatedTokens).Globally()

	p.AllowAttrs("href").OnElements("a")
	p.AllowURLSchemes("http", "https", "mailto")
	p.RequireNoFollowOnLinks(true)
	p.RequireNoReferrerOnLinks(true)

	p.AllowAttrs("src", "alt", "title", "width", "height").OnElements("img")
	p.AllowImages()

	p.AllowAttrs("checked", "disabled", "type").OnElements("input")

	// Code language classes produced by chroma.
	p.AllowAttrs("class").Matching(bluemonday.SpaceSeparatedTokens).OnElements("code", "pre", "span")

	return &Policy{p: p}
}

// Sanitize returns a cleaned copy of in.
func (pl *Policy) Sanitize(in string) string {
	return pl.p.Sanitize(in)
}

// SanitizeBytes is a byte-oriented variant.
func (pl *Policy) SanitizeBytes(in []byte) []byte {
	return pl.p.SanitizeBytes(in)
}

// SanitizeReader reads HTML from r and writes a sanitized copy to w.
func (pl *Policy) SanitizeReader(w io.Writer, r io.Reader) error {
	buf, err := io.ReadAll(r)
	if err != nil {
		return err
	}
	_, err = w.Write(pl.p.SanitizeBytes(buf))
	return err
}

// StripAll returns a sanitizer that strips every tag, used for search
// index body text.
func StripAll() *Policy {
	return &Policy{p: bluemonday.StripTagsPolicy()}
}

// AllowIFrame extends a policy to allow iframes from a short, trusted list
// of hosts. I keep this separate because most sites shouldn't need it.
func (pl *Policy) AllowIFrame(hosts ...string) {
	pl.p.AllowAttrs("src").Matching(hostMatcher(hosts)).OnElements("iframe")
	pl.p.AllowAttrs("width", "height", "allowfullscreen").OnElements("iframe")
	pl.p.AllowElements("iframe")
}

// hostList matches iframe src URLs against a fixed allowlist of hosts.
type hostList struct{ hosts []string }

// MatchString reports whether s is an https URL whose host is exactly
// one of h.hosts. Parsing with net/url instead of a raw string-prefix
// check accepts bare "https://host" URLs (no trailing slash) and
// compares hosts case-insensitively, while still rejecting look-alike
// hosts ("trusted.com.evil.example"), other schemes, URLs with embedded
// credentials, and unparseable input.
func (h hostList) MatchString(s string) bool {
	u, err := url.Parse(s)
	if err != nil || u.Scheme != "https" || u.User != nil {
		return false
	}
	for _, host := range h.hosts {
		if strings.EqualFold(u.Host, host) {
			return true
		}
	}
	return false
}

// hostMatcher wraps a host allowlist in a hostList value exposing
// MatchString.
func hostMatcher(hosts []string) hostList { return hostList{hosts: hosts} }

// ToPlainText runs a strip-all policy and also collapses consecutive
// whitespace, producing something suitable for search indexing or RSS
// descriptions.
func ToPlainText(in string) string {
	stripped := StripAll().Sanitize(in)
	return collapseWhitespace(stripped)
}

// collapseWhitespace replaces each run of whitespace in s with a single
// space and trims leading/trailing whitespace. strings.Fields splits on
// unicode.IsSpace, so unlike the previous byte-wise loop this also
// collapses Unicode spaces (e.g. NBSP) that can survive HTML stripping.
func collapseWhitespace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}