Files
vikingowl 49d80cf847 feat(security): format-aware entropy safelist (Phase F-1)
Add a deterministic pre-extractor that skips known-safe token shapes
before they reach the entropy scorer. Targets the false-positive
regime that bites under lowered entropy_threshold or
redact_high_entropy = true — UUIDs (~3.4 bits), SHA hex digests
(~3.9 bits), ISO-8601 timestamps, and HTTP(S) URLs.

Config knob lives under the existing security section to match
entropy_threshold / redact_high_entropy convention:

  [security]
  entropy_safelist = ["uuid", "sha_hex", "iso8601", "url"]

Empty / unset preserves pre-F-1 behaviour exactly — users opt in.

Per-pattern Debug telemetry fires on every skip (pattern name +
token length, never the token bytes). This is the data F-2's
go/no-go gate depends on; the plan literally specifies it.

NewFirewall validates names at the config boundary and emits a
Warn for unknown entries so a typo like "uid" instead of "uuid"
surfaces loudly instead of silently disabling FP reduction.

Tests cover: UUID/SHA-1/SHA-256 skipped at lowered threshold,
mixed payload (safe shape + real secret) preserves the secret,
secret-adjacent-to-UUID regression guard, empty safelist preserves
pre-F-1 behaviour, unknown name silently dropped at scanner level
but warned at firewall level, end-to-end FirewallConfig wiring,
and the skip-telemetry log line.

F-2 remains gated on real-workload FP-rate observations.
2026-05-22 12:39:10 +02:00

98 lines
3.5 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package security
import "regexp"
// safelistEntry binds one safelist pattern name to the compiled regex that
// recognises that shape. Keeping the name next to the regex lets every skip
// be attributed to a pattern in the logs, which is how operators measure
// per-pattern FP-rate deltas — the data F-2's go/no-go decision depends on.
type safelistEntry struct {
	// name is the user-facing identifier: the value written in the TOML
	// entropy_safelist knob, carried through to log fields.
	name string
	// re is the compiled pattern matching that known-safe shape.
	re *regexp.Regexp
}
// safelistSpan marks one region of the scanned content that matched a
// user-enabled known-safe shape (UUID, hash, URL, timestamp). The region is
// the half-open byte range [start, end). Tokens that fall entirely inside a
// span are skipped by scanEntropy before entropy scoring, so they cannot
// become false positives under a lowered threshold or
// redact_high_entropy = true.
type safelistSpan struct {
	// start is the inclusive and end the exclusive byte offset of the match.
	start, end int
	// name is the safelist pattern that produced this span.
	name string
}
// The safelist regexes are compiled exactly once, at package initialisation.
// The pattern sources are constants, so recompiling them on every
// defaultSafelistPatterns call would be wasted work; a compiled
// *regexp.Regexp can be shared freely between callers.
var (
	// UUID v1-v5: 8-4-4-4-12 hex with hyphens. Case-insensitive. The version
	// nibble is not checked, so any string of that shape matches.
	safelistUUIDPattern = regexp.MustCompile(`(?i)\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b`)
	// SHA-1 / SHA-256 / SHA-384 / SHA-512 hex digests (40/64/96/128 hex chars).
	safelistSHAHexPattern = regexp.MustCompile(`(?i)\b(?:[0-9a-f]{40}|[0-9a-f]{64}|[0-9a-f]{96}|[0-9a-f]{128})\b`)
	// ISO-8601 timestamp (date + time, optional fractional seconds, optional zone).
	safelistISO8601Pattern = regexp.MustCompile(`\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?\b`)
	// RFC-3986-ish HTTP(S) URL. Greedy up to whitespace or quoting.
	safelistURLPattern = regexp.MustCompile(`\bhttps?://[^\s'"<>` + "`" + `]+`)
)

// defaultSafelistPatterns returns the curated allow-list of known-safe shapes,
// keyed by the user-facing name accepted in [security].entropy_safelist.
//
// Each call returns a fresh map, so a caller that adds or deletes keys cannot
// affect other callers. The regex values themselves are shared, precompiled
// instances.
//
// Adding a key here exposes a new opt-in name to user configs. Removing or
// renaming a key is a breaking change.
func defaultSafelistPatterns() map[string]*regexp.Regexp {
	return map[string]*regexp.Regexp{
		"uuid":    safelistUUIDPattern,
		"sha_hex": safelistSHAHexPattern,
		"iso8601": safelistISO8601Pattern,
		"url":     safelistURLPattern,
	}
}
// splitSafelistNames sorts the user-supplied pattern names into two lists:
// entries holds a safelistEntry for each name found in
// defaultSafelistPatterns, and unknown holds every name that was not found.
// Both lists keep the input order, and both are nil when names is empty.
//
// Callers (NewFirewall) surface the unknown names so a typo like "uid"
// instead of "uuid" doesn't silently disable the safelist.
func splitSafelistNames(names []string) (entries []safelistEntry, unknown []string) {
	if len(names) == 0 {
		return nil, nil
	}

	known := defaultSafelistPatterns()
	for _, candidate := range names {
		pattern, found := known[candidate]
		if !found {
			unknown = append(unknown, candidate)
			continue
		}
		entries = append(entries, safelistEntry{name: candidate, re: pattern})
	}
	return entries, unknown
}
// buildSafelist resolves names to safelist entries via splitSafelistNames and
// returns only the resolved ones. Unrecognised names are dropped without any
// report, so use it only where typos need not be surfaced (e.g. test setup).
func buildSafelist(names []string) []safelistEntry {
	resolved, _ := splitSafelistNames(names) // unknown names are discarded on purpose
	return resolved
}
// safelistSpansFor runs every entry's regex over content and collects each
// match as a safelistSpan tagged with the name of the pattern that produced
// it. Spans are grouped by entry, in the order the entries are given, and may
// overlap one another; containment is checked per-token in scanEntropy.
//
// The result is nil when entries is empty or nothing matches.
func safelistSpansFor(content string, entries []safelistEntry) []safelistSpan {
	var spans []safelistSpan
	for i := range entries {
		entry := entries[i]
		matches := entry.re.FindAllStringIndex(content, -1)
		for _, m := range matches {
			// Each match is a [begin, finish) pair of byte offsets.
			begin, finish := m[0], m[1]
			spans = append(spans, safelistSpan{start: begin, end: finish, name: entry.name})
		}
	}
	return spans
}
// inAnySpan checks whether the half-open range [start, end) is completely
// covered by a single safelist span. On a hit it returns the name of the
// first covering span in slice order together with true, so the skip can be
// logged for FP-rate telemetry — the data F-2 gates on. Otherwise it returns
// "" and false.
func inAnySpan(spans []safelistSpan, start, end int) (string, bool) {
	for i := range spans {
		span := &spans[i]
		// Not contained if the range sticks out on either side of the span.
		if start < span.start || end > span.end {
			continue
		}
		return span.name, true
	}
	return "", false
}