// internal/encoder/json/truncate/truncate.go

// Package truncate trims log records whose serialised size would exceed the
// CloudWatch Logs limit of 256 KiB per event. Truncation is last-resort: it
// preserves the top-level fields (time, level, msg, request_id) and drops
// tail-heavy user fields first.
//
// See mercemay.top/src/lambdalog/.
package truncate

import (
	"unicode/utf8"

	"mercemay.top/src/lambdalog/internal/encoder"
)

// MaxBytes is CloudWatch's per-event ceiling (256 KiB, in bytes). Exposed as
// a var so tests can shrink it without rewriting the production value.
var MaxBytes = 256 * 1024

// FieldMaxBytes caps any individual field value so a single oversized field
// cannot monopolise the record. NOTE(review): Apply's slice arithmetic
// assumes this stays larger than len(Ellipsis).
var FieldMaxBytes = 8 * 1024

// Ellipsis is inserted to mark truncated values.
const Ellipsis = "...[truncated]"

// Apply modifies r in place, ensuring the approximate serialised size falls
// within MaxBytes. The returned bool reports whether any truncation
// occurred, for diagnostics.
// Apply modifies r in place, ensuring the approximate serialised size falls
// within MaxBytes. The returned bool reports whether any truncation
// occurred, for diagnostics.
//
// Truncation proceeds in three passes of increasing severity: cap each
// string field at FieldMaxBytes, drop whole fields from the tail, and only
// then shorten the message itself.
func Apply(r *encoder.Record) bool {
	changed := false
	// Pass 1: cap individual string field values.
	for i := range r.Fields {
		s, ok := r.Fields[i].Value.(string)
		if !ok || len(s) <= FieldMaxBytes {
			continue
		}
		r.Fields[i].Value = truncateString(s, FieldMaxBytes)
		changed = true
	}
	size := estimate(*r)
	if size <= MaxBytes {
		return changed
	}
	// Pass 2: drop fields from the tail until we fit. The caller-supplied
	// ordering tends to place the most important fields first.
	for size > MaxBytes && len(r.Fields) > 0 {
		r.Fields = r.Fields[:len(r.Fields)-1]
		size = estimate(*r)
		changed = true
	}
	// Pass 3: if the message itself is oversized, truncate it last. This is
	// preferable to dropping the record entirely.
	if size > MaxBytes && len(r.Message) > 0 {
		// Budget left for the message once the fixed envelope is paid for.
		budget := MaxBytes - estimate(encoder.Record{
			Time: r.Time, Level: r.Level, RequestID: r.RequestID,
		})
		if budget < 0 {
			budget = 0
		}
		// Only cut when the message genuinely exceeds the budget; a message
		// that already fits is left intact.
		if budget < len(r.Message) {
			r.Message = truncateString(r.Message, budget)
			changed = true
		}
	}
	return changed
}

// truncateString shortens s so the result, including the Ellipsis marker,
// occupies at most max bytes. If max is too small to hold the marker at all
// the string is hard-cut to max bytes with no marker — this avoids the
// negative-slice panic the old code hit when tests shrank FieldMaxBytes
// below len(Ellipsis).
func truncateString(s string, max int) string {
	if len(s) <= max {
		return s
	}
	keep := max - len(Ellipsis)
	if keep <= 0 {
		return safeCut(s, max)
	}
	return safeCut(s, keep) + Ellipsis
}

// safeCut returns at most n leading bytes of s, backing up so a multi-byte
// UTF-8 rune is never split; an invalid partial rune would be mangled by
// the JSON encoder downstream.
func safeCut(s string, n int) string {
	if n >= len(s) {
		return s
	}
	if n < 0 {
		n = 0
	}
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n]
}

// estimate returns an upper bound on the JSON body length of r. The formula
// is deliberately coarse: it counts raw field sizes plus a constant per-
// field overhead for quotes, colon, and comma.
// estimate returns a rough upper bound on the serialised JSON length of r.
// Precision is deliberately sacrificed for speed: raw field sizes are summed
// with a flat per-field allowance for quotes, colon, and comma.
func estimate(r encoder.Record) int {
	// Flat allowance per field: quotes, colon, comma, small fudge.
	const fieldOverhead = 8
	// Envelope first: level, message, request id, plus a fixed 64-byte
	// allowance covering timestamps, braces, and the well-known key names.
	total := 64 + len(r.Level) + len(r.Message) + len(r.RequestID)
	for i := range r.Fields {
		total += len(r.Fields[i].Key) + fieldOverhead + valueSize(r.Fields[i].Value)
	}
	return total
}

// valueSize returns a coarse upper bound on the JSON-encoded length of a
// single field value, in bytes, for use by estimate.
func valueSize(v any) int {
	switch val := v.(type) {
	case nil:
		return 4 // "null"
	case string:
		// Quotes included; may undercount when escaping expands the value,
		// which is acceptable for a coarse guide.
		return len(val) + 2
	case bool:
		return 5 // "false", the longer literal
	case []byte:
		// encoding/json emits []byte as base64: 4 output bytes per 3 input
		// bytes (rounded up), plus the quotes. The previous len+2 figure
		// undercounted the real encoding by a third.
		return (len(val)+2)/3*4 + 2
	case error:
		// A nil interface is caught by `case nil` above, so the old
		// `val == nil` guard here was dead code. A typed-nil pointer boxed
		// as error still reaches Error(), exactly as before.
		return len(val.Error()) + 2
	default:
		return 16 // numbers and everything else: a generous flat guess
	}
}

// Summary describes what was removed, for emission alongside the record.
// Summary describes what was removed, for emission alongside the record.
type Summary struct {
	// DroppedFields is the number of fields present in the "before" record
	// but absent from the "after" record.
	DroppedFields int
	// TruncatedMsg reports whether the message became shorter.
	TruncatedMsg  bool
}

// Describe returns a Summary comparing before to after (before-after).
func Describe(before, after encoder.Record) Summary {
	s := Summary{}
	if len(before.Fields) > len(after.Fields) {
		s.DroppedFields = len(before.Fields) - len(after.Fields)
	}
	if len(before.Message) > len(after.Message) {
		s.TruncatedMsg = true
	}
	return s
}