// gnoma/internal/security/safelist.go

package security

import "regexp"

// safelistEntry pairs a user-facing pattern name (the TOML knob value) with
// its compiled regex. The name flows through to log fields so operators can
// measure per-pattern FP-rate deltas — the data F-2's go/no-go decision
// depends on.
type safelistEntry struct {
	// name is the safelist key the entry was resolved from; it is copied onto
	// every span the pattern produces.
	name string
	// re is the compiled pattern used to locate known-safe shapes in content.
	re   *regexp.Regexp
}

// safelistSpan marks one region of the scanned content that matched a
// user-enabled safelist pattern (UUID, hash, URL, timestamp). The region is
// the half-open byte range [start, end). Tokens that fall entirely inside a
// span are meant to be skipped by scanEntropy before they reach the entropy
// scorer, so they cannot produce false positives under lowered thresholds or
// redact_high_entropy = true.
type safelistSpan struct {
	// start and end are byte offsets into the content; end is exclusive.
	start, end int
	// name is the safelist pattern that produced this span.
	name string
}

// Precompiled safelist regexes. They are compiled once at package
// initialisation rather than on every defaultSafelistPatterns call; a
// *regexp.Regexp is safe for concurrent use, so sharing them is fine.
var (
	// UUID v1–5: 8-4-4-4-12 hex with hyphens. Case-insensitive.
	safelistUUIDRe = regexp.MustCompile(`(?i)\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b`)

	// SHA-1 / SHA-256 / SHA-384 / SHA-512 hex digests (40/64/96/128 hex chars).
	safelistSHAHexRe = regexp.MustCompile(`(?i)\b(?:[0-9a-f]{40}|[0-9a-f]{64}|[0-9a-f]{96}|[0-9a-f]{128})\b`)

	// ISO-8601 timestamp (date + time, optional fractional seconds, optional zone).
	safelistISO8601Re = regexp.MustCompile(`\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?\b`)

	// RFC-3986-ish HTTP(S) URL. Greedy up to whitespace or quoting.
	safelistURLRe = regexp.MustCompile(`\bhttps?://[^\s'"<>` + "`" + `]+`)
)

// defaultSafelistPatterns returns the curated allow-list of known-safe shapes,
// keyed by the user-facing name accepted in [security].entropy_safelist.
//
// Each call returns a fresh map, so callers may add or delete keys without
// affecting other callers; the *regexp.Regexp values are shared, precompiled
// instances.
//
// Adding a key here exposes a new opt-in name to user configs. Removing or
// renaming a key is a breaking change.
func defaultSafelistPatterns() map[string]*regexp.Regexp {
	return map[string]*regexp.Regexp{
		"uuid":    safelistUUIDRe,
		"sha_hex": safelistSHAHexRe,
		"iso8601": safelistISO8601Re,
		"url":     safelistURLRe,
	}
}

// splitSafelistNames resolves user-supplied safelist names against the
// default pattern table. Recognised names come back as entries, in input
// order; names with no matching pattern come back in unknown so the caller
// can report them — a typo like "uid" instead of "uuid" should not silently
// disable the safelist. An empty input yields (nil, nil).
func splitSafelistNames(names []string) (entries []safelistEntry, unknown []string) {
	if len(names) == 0 {
		return nil, nil
	}
	known := defaultSafelistPatterns()
	for _, candidate := range names {
		pattern, found := known[candidate]
		if !found {
			unknown = append(unknown, candidate)
			continue
		}
		entries = append(entries, safelistEntry{name: candidate, re: pattern})
	}
	return entries, unknown
}

// buildSafelist is the lenient variant of splitSafelistNames: it returns only
// the entries for recognised names. Unknown names are deliberately discarded,
// which suits callers that have no need to report typos (e.g. test setup).
func buildSafelist(names []string) []safelistEntry {
	resolved, _ := splitSafelistNames(names) // unknown names intentionally ignored
	return resolved
}

// safelistSpansFor collects every match of every entry's regex in content and
// returns them as spans tagged with the originating pattern name. Spans are
// grouped by entry, in entry order, and may overlap one another; containment
// is checked per-token in scanEntropy. The result is nil when there are no
// entries or nothing matches.
func safelistSpansFor(content string, entries []safelistEntry) []safelistSpan {
	var found []safelistSpan
	for i := range entries {
		entry := entries[i]
		matches := entry.re.FindAllStringIndex(content, -1)
		for _, m := range matches {
			// m is the [start, end) byte-offset pair of one match.
			found = append(found, safelistSpan{start: m[0], end: m[1], name: entry.name})
		}
	}
	return found
}

// inAnySpan checks whether the half-open range [start, end) is fully covered
// by one of spans. On a hit it returns the name of the first covering span
// (in slice order) and true, so the skip can be logged for FP-rate telemetry;
// otherwise it returns "" and false.
func inAnySpan(spans []safelistSpan, start, end int) (string, bool) {
	for i := range spans {
		candidate := spans[i]
		// Not contained if the range begins before the span or runs past it.
		if start < candidate.start || end > candidate.end {
			continue
		}
		return candidate.name, true
	}
	return "", false
}