internal/encoder/json/escape/html.go

3.2 KB · 132 lines · 2024-07-12 · c88a130
// Package escape produces JSON-safe byte sequences for untrusted strings.
// The implementation is a hand-rolled scanner rather than a call into
// encoding/json because the latter allocates an intermediate buffer that
// shows up clearly in the lambdalog benchmarks.
//
// See mercemay.top/src/lambdalog/.
package escape

import (
	"unicode/utf8"

	"mercemay.top/src/lambdalog/internal/encoder/json/fast"
)

// hex is the lowercase hexadecimal digit table used when writing \u escape
// sequences. It is indexed with a 4-bit value (a nibble of the byte or rune
// being escaped) to obtain the matching ASCII digit.
const hex = "0123456789abcdef"

// String appends the JSON body (without surrounding quotes) of s to b.
//
// Bytes that need no escaping are copied through in runs. The characters ",
// \ and the ASCII control characters below 0x20 are always escaped. If
// escapeHTML is true, the characters <, >, and & are additionally emitted as
// \u escapes so the output is safe to inline in HTML without additional
// processing. Each byte that is not part of a valid UTF-8 sequence is
// replaced by U+FFFD, and U+2028 / U+2029 are always written as \u2028 /
// \u2029.
func String(b *fast.Buffer, s string, escapeHTML bool) {
	// last is the start of the pending run of s that can be copied verbatim.
	last := 0
	for i := 0; i < len(s); {
		c := s[i]
		if c < utf8.RuneSelf {
			if needsNoEscape(c) && !(escapeHTML && htmlSensitive(c)) {
				i++
				continue
			}
			if last < i {
				b.AppendString(s[last:i])
			}
			escapeByte(b, c)
			i++
			last = i
			continue
		}
		r, size := utf8.DecodeRuneInString(s[i:])
		if r == utf8.RuneError && size == 1 {
			// Invalid UTF-8: drop the offending byte and substitute the
			// replacement character. The interpreted literal keeps the
			// source ASCII-only while still emitting the UTF-8 encoding of
			// U+FFFD.
			if last < i {
				b.AppendString(s[last:i])
			}
			b.AppendString("\ufffd")
			i += size
			last = i
			continue
		}
		// U+2028 and U+2029 are technically legal in JSON but break JS
		// eval. Escape them defensively. The runes are spelled as escapes
		// because the raw characters are invisible in most editors and are
		// easily turned into plain spaces by whitespace normalisation.
		if r == '\u2028' || r == '\u2029' {
			if last < i {
				b.AppendString(s[last:i])
			}
			// The two code points differ only in their low nibble (8 or 9).
			b.AppendString(`\u202`)
			b.AppendByte(hex[r&0xf])
			i += size
			last = i
			continue
		}
		// Any other valid multi-byte rune stays in the pending run.
		i += size
	}
	if last < len(s) {
		b.AppendString(s[last:])
	}
}

// needsNoEscape reports whether the byte c may be written to a JSON string
// as-is. It is false for the control characters below 0x20, the double quote
// and the backslash, and true for every other byte value.
func needsNoEscape(c byte) bool {
	switch {
	case c < 0x20:
		// Control characters must always be escaped.
		return false
	case c == '"', c == '\\':
		// The string delimiter and the escape introducer themselves.
		return false
	}
	return true
}

// htmlSensitive reports whether c is one of the three characters <, > or &.
func htmlSensitive(c byte) bool {
	switch c {
	case '<', '>', '&':
		return true
	}
	return false
}

// escapeByte appends an escape sequence for the single byte c to b. The
// double quote, the backslash, and the \n, \r, \t, \b and \f controls use
// their two-character short forms; any other byte is written as \u00XX with
// lowercase hex digits. The function escapes whatever it is handed: deciding
// whether c needs escaping at all is left to the caller.
func escapeByte(b *fast.Buffer, c byte) {
	short := ""
	switch c {
	case '"':
		short = `\"`
	case '\\':
		short = `\\`
	case '\n':
		short = `\n`
	case '\r':
		short = `\r`
	case '\t':
		short = `\t`
	case '\b':
		short = `\b`
	case '\f':
		short = `\f`
	}
	if short == "" {
		// No short form: spell the byte out as a four-digit \u escape, high
		// nibble first.
		b.AppendString(`\u00`)
		b.AppendByte(hex[c>>4])
		b.AppendByte(hex[c&0xf])
		return
	}
	b.AppendString(short)
}

// Bytes is the []byte equivalent of String: it appends the JSON body
// (without surrounding quotes) of p to b. Callers that already hold a byte
// slice should prefer this form to avoid the implicit allocation that
// string conversion costs for long payloads.
//
// The characters ", \ and the ASCII control characters below 0x20 are always
// escaped; if escapeHTML is true, <, > and & are written as \u escapes as
// well. Each byte that is not part of a valid UTF-8 sequence is replaced by
// U+FFFD, and U+2028 / U+2029 are written as \u2028 / \u2029.
//
// The slice is scanned in place. Non-ASCII input is decoded directly from p
// rather than being converted to a string first, so no copy of the payload
// is made here.
func Bytes(b *fast.Buffer, p []byte, escapeHTML bool) {
	// last is the start of the pending run of p that can be copied verbatim.
	last := 0
	for i := 0; i < len(p); {
		c := p[i]
		if c < utf8.RuneSelf {
			if needsNoEscape(c) && !(escapeHTML && htmlSensitive(c)) {
				i++
				continue
			}
			if last < i {
				b.AppendBytes(p[last:i])
			}
			escapeByte(b, c)
			i++
			last = i
			continue
		}
		r, size := utf8.DecodeRune(p[i:])
		switch {
		case r == utf8.RuneError && size == 1:
			// Invalid UTF-8: drop the offending byte and substitute the
			// UTF-8 encoding of U+FFFD.
			if last < i {
				b.AppendBytes(p[last:i])
			}
			b.AppendString("\ufffd")
		case r == '\u2028' || r == '\u2029':
			// Legal in JSON but these break JS eval, so escape them
			// defensively. The two code points differ only in their low
			// nibble (8 or 9).
			if last < i {
				b.AppendBytes(p[last:i])
			}
			b.AppendString(`\u202`)
			b.AppendByte(hex[r&0xf])
		default:
			// Any other valid multi-byte rune stays in the pending run.
			i += size
			continue
		}
		i += size
		last = i
	}
	if last < len(p) {
		b.AppendBytes(p[last:])
	}
}