// Package html provides a strict bluemonday-based sanitizer for HTML that
// made it through the markdown pipeline. Goldmark's raw HTML escape hatch
// is convenient, but some TIL authors paste questionable markup; the
// sanitizer is the last line of defense before HTML lands on disk.
//
// See mercemay.top/src/tilstream/ for how the site pipeline wires this in.
package html
import (
	"io"
	"regexp"
	"strings"

	"github.com/microcosm-cc/bluemonday"
)
// Policy wraps a bluemonday policy with a narrow API tuned for tilstream.
// The zero value is not usable; obtain one from Strict or StripAll.
type Policy struct {
	// p is the underlying bluemonday policy that performs the actual
	// sanitization; every method on Policy delegates to it.
	p *bluemonday.Policy
}
// Strict returns a policy that allows only common text formatting:
// headings, lists, emphasis, code, links (http/https only), images, and
// basic tables. Attribute whitelisting is conservative.
func Strict() *Policy {
p := bluemonday.NewPolicy()
p.AllowElements("p", "br", "hr", "em", "strong", "del", "sub", "sup",
"ul", "ol", "li", "blockquote", "pre", "code",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "tr", "td", "th",
"figure", "figcaption", "section", "article", "span", "div",
)
p.AllowAttrs("class").Globally()
p.AllowAttrs("id").Matching(bluemonday.SpaceSeparatedTokens).Globally()
p.AllowAttrs("href").OnElements("a")
p.AllowURLSchemes("http", "https", "mailto")
p.RequireNoFollowOnLinks(true)
p.RequireNoReferrerOnLinks(true)
p.AllowAttrs("src", "alt", "title", "width", "height").OnElements("img")
p.AllowImages()
p.AllowAttrs("checked", "disabled", "type").OnElements("input")
// Code language classes produced by chroma.
p.AllowAttrs("class").Matching(bluemonday.SpaceSeparatedTokens).OnElements("code", "pre", "span")
return &Policy{p: p}
}
// Sanitize applies the policy to in and returns the cleaned result as a
// new string; in itself is never modified.
func (pl *Policy) Sanitize(in string) string {
	cleaned := pl.p.Sanitize(in)
	return cleaned
}
// SanitizeBytes applies the policy to in and returns the cleaned result
// as a new byte slice. It is the []byte counterpart of Sanitize.
func (pl *Policy) SanitizeBytes(in []byte) []byte {
	cleaned := pl.p.SanitizeBytes(in)
	return cleaned
}
// SanitizeReader reads HTML from r and writes a sanitized copy to w.
func (pl *Policy) SanitizeReader(w io.Writer, r io.Reader) error {
buf, err := io.ReadAll(r)
if err != nil {
return err
}
_, err = w.Write(pl.p.SanitizeBytes(buf))
return err
}
// StripAll returns a policy whose sanitizer removes every HTML tag and
// keeps only text content. It backs the search-index body text.
func StripAll() *Policy {
	stripper := bluemonday.StripTagsPolicy()
	return &Policy{p: stripper}
}
// AllowIFrame extends a policy to allow iframes from a short, trusted list
// of hosts. I keep this separate because most sites shouldn't need it.
func (pl *Policy) AllowIFrame(hosts ...string) {
pl.p.AllowAttrs("src").Matching(hostMatcher(hosts)).OnElements("iframe")
pl.p.AllowAttrs("width", "height", "allowfullscreen").OnElements("iframe")
pl.p.AllowElements("iframe")
}
// hostList is a MatchString-style allow-list: it accepts only strings
// that are https URLs rooted at one of the configured hosts.
type hostList struct{ hosts []string }

// MatchString reports whether s begins with "https://", one of the
// allowed hosts, and a "/". Demanding the slash right after the host
// prevents "trusted.com.evil.net"-style suffix tricks.
func (h hostList) MatchString(s string) bool {
	const scheme = "https://"
	if !strings.HasPrefix(s, scheme) {
		return false
	}
	rest := s[len(scheme):]
	for _, allowed := range h.hosts {
		if strings.HasPrefix(rest, allowed+"/") {
			return true
		}
	}
	return false
}

// hostMatcher wraps hosts in a hostList matcher.
func hostMatcher(hosts []string) hostList { return hostList{hosts: hosts} }
// ToPlainText runs a strip-all policy and also collapses consecutive
// whitespace, producing something suitable for search indexing or RSS
// descriptions.
func ToPlainText(in string) string {
stripped := StripAll().Sanitize(in)
return collapseWhitespace(stripped)
}
// collapseWhitespace squeezes each run of ASCII whitespace (space, tab,
// CR, LF) down to a single space and trims leading/trailing whitespace.
// Other Unicode whitespace such as NBSP passes through untouched.
func collapseWhitespace(s string) string {
	isASCIISpace := func(r rune) bool {
		return r == ' ' || r == '\t' || r == '\n' || r == '\r'
	}
	words := strings.FieldsFunc(s, isASCIISpace)
	return strings.Join(words, " ")
}