Files
vikingowl 49d80cf847 feat(security): format-aware entropy safelist (Phase F-1)
Add a deterministic pre-extractor that skips known-safe token shapes
before they reach the entropy scorer. Targets the false-positive
regime that bites under lowered entropy_threshold or
redact_high_entropy = true — UUIDs (~3.4 bits), SHA hex digests
(~3.9 bits), ISO-8601 timestamps, and HTTP(S) URLs.

Config knob lives under the existing security section to match
entropy_threshold / redact_high_entropy convention:

  [security]
  entropy_safelist = ["uuid", "sha_hex", "iso8601", "url"]

Empty / unset preserves pre-F-1 behaviour exactly — users opt in.

Per-pattern Debug telemetry fires on every skip (pattern name +
token length, never the token bytes). This is the data F-2's
go/no-go gate depends on; the plan literally specifies it.

NewFirewall validates names at the config boundary and emits a
Warn for unknown entries so a typo like "uid" instead of "uuid"
surfaces loudly instead of silently disabling FP reduction.

Tests cover: UUID/SHA-1/SHA-256 skipped at lowered threshold,
mixed payload (safe shape + real secret) preserves the secret,
secret-adjacent-to-UUID regression guard, empty safelist preserves
pre-F-1 behaviour, unknown name silently dropped at scanner level
but warned at firewall level, end-to-end FirewallConfig wiring,
and the skip-telemetry log line.

F-2 remains gated on real-workload FP-rate observations.
2026-05-22 12:39:10 +02:00

295 lines
9.4 KiB
Go

package security
import (
	"bytes"
	"log/slog"
	"slices"
	"strings"
	"testing"
)
// Shared fixtures for the safelist tests.
const (
	// secretToken plays the role of the "real secret" in mixed-payload
	// tests. It is a random base64-ish string of 32 distinct characters,
	// long enough (>= 20 chars) to enter scanEntropy and confirmed to
	// score >= 4.5 with the default alphabet; see
	// TestSafelist_SecretTokenIsHighEntropy for the drift guard.
	secretToken = "x9KqLm2pNvBz3RtYwH7Xj4QsDc8Fa6Vu"

	// loweredThreshold is deliberately below typical UUID/hash entropy
	// (UUID v4 ≈ 3.4, SHA hex ≈ 3.9). The plan calls out this regime —
	// a lowered threshold or redact_high_entropy = true — as the one
	// where false positives bite, and F-1 has to eliminate them.
	loweredThreshold = 3.0
)
// TestSafelist_UUIDIsSkipped checks that a UUID embedded in a log-style line
// yields no high_entropy match once "uuid" is safelisted, even though the
// scanner runs at loweredThreshold.
func TestSafelist_UUIDIsSkipped(t *testing.T) {
	const line = "trace_id=550e8400-e29b-41d4-a716-446655440000 done"

	scanner := NewScanner(loweredThreshold, true)
	scanner.SetSafelist([]string{"uuid"})

	for _, match := range scanner.Scan(line) {
		if match.Pattern != "high_entropy" {
			continue
		}
		t.Errorf("UUID should not be flagged as high_entropy: %+v", match)
	}
}
// TestSafelist_SHA256IsSkipped verifies that a 64-character SHA-256 hex
// digest yields no high_entropy match when "sha_hex" is safelisted.
//
// The scanner runs at loweredThreshold rather than 4.5: a hex string draws on
// only 16 symbols, so its Shannon entropy cannot exceed 4 bits per character.
// At a 4.5 threshold the digest would presumably never be flagged with or
// without the safelist, leaving this test unable to fail.
func TestSafelist_SHA256IsSkipped(t *testing.T) {
	s := NewScanner(loweredThreshold, true)
	s.SetSafelist([]string{"sha_hex"})
	sha256 := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
	matches := s.Scan("commit " + sha256)
	for _, m := range matches {
		if m.Pattern == "high_entropy" {
			t.Errorf("SHA-256 should not be flagged as high_entropy: %+v", m)
		}
	}
}
// TestSafelist_SHA1IsSkipped verifies that a 40-character SHA-1 hex digest
// yields no high_entropy match when "sha_hex" is safelisted.
//
// The scanner runs at loweredThreshold rather than 4.5: a hex string draws on
// only 16 symbols, so its Shannon entropy cannot exceed 4 bits per character.
// At a 4.5 threshold the digest would presumably never be flagged with or
// without the safelist, leaving this test unable to fail.
func TestSafelist_SHA1IsSkipped(t *testing.T) {
	s := NewScanner(loweredThreshold, true)
	s.SetSafelist([]string{"sha_hex"})
	sha1 := "356a192b7913b04c54574d18c28d46e6395428ab"
	matches := s.Scan("blob " + sha1)
	for _, m := range matches {
		if m.Pattern == "high_entropy" {
			t.Errorf("SHA-1 should not be flagged as high_entropy: %+v", m)
		}
	}
}
func TestSafelist_MixedPayload_SecretStillCaught(t *testing.T) {
s := NewScanner(loweredThreshold, true)
s.SetSafelist([]string{"uuid", "sha_hex"})
uuid := "550e8400-e29b-41d4-a716-446655440000"
content := "id=" + uuid + " secret=" + secretToken
matches := s.Scan(content)
var entropyHits []SecretMatch
for _, m := range matches {
if m.Pattern == "high_entropy" {
entropyHits = append(entropyHits, m)
}
}
if len(entropyHits) != 1 {
t.Fatalf("want 1 entropy hit (the actual secret), got %d: %+v", len(entropyHits), entropyHits)
}
// Confirm the hit covers the secret, not the UUID.
hit := content[entropyHits[0].Start:entropyHits[0].End]
if hit != secretToken {
t.Errorf("entropy hit covered %q, want %q", hit, secretToken)
}
}
// TestSafelist_EmptyPreservesCurrentBehavior locks in the pre-F-1 baseline:
// with no safelist configured and a lowered threshold, a bare UUID must still
// produce at least one high_entropy match. That false positive is what the
// safelist exists to remove, so it has to remain when the user has not opted in.
func TestSafelist_EmptyPreservesCurrentBehavior(t *testing.T) {
	const uuid = "550e8400-e29b-41d4-a716-446655440000"

	// SetSafelist is deliberately never called on this scanner.
	scanner := NewScanner(loweredThreshold, true)

	flagged := 0
	for _, match := range scanner.Scan(uuid) {
		if match.Pattern == "high_entropy" {
			flagged++
		}
	}
	if flagged < 1 {
		t.Error("with no safelist + lowered threshold, UUID should still trigger entropy (pre-F-1 baseline)")
	}
}
// TestSafelist_UnknownNameIgnored passes a safelist containing an unrecognised
// entry ("made_up") and checks that the scanner neither panics nor loses the
// effect of the valid "uuid" entry next to it.
func TestSafelist_UnknownNameIgnored(t *testing.T) {
	const uuid = "550e8400-e29b-41d4-a716-446655440000"

	scanner := NewScanner(loweredThreshold, true)
	// "made_up" is not a known pattern name; it must be dropped silently.
	names := []string{"uuid", "made_up", "sha_hex"}
	scanner.SetSafelist(names)

	for _, match := range scanner.Scan(uuid) {
		if match.Pattern != "high_entropy" {
			continue
		}
		t.Errorf("uuid should still be skipped despite unknown name in list: %+v", match)
	}
}
func TestSafelist_URLPathNotFlagged(t *testing.T) {
s := NewScanner(4.5, true)
s.SetSafelist([]string{"url"})
// A high-entropy URL path — a real-world false positive shape.
url := "https://example.com/" + secretToken
matches := s.Scan(url)
for _, m := range matches {
if m.Pattern == "high_entropy" {
hit := url[m.Start:m.End]
t.Errorf("URL substring %q should be covered by url safelist", hit)
}
}
}
// TestSafelist_ISO8601Span is mostly a sanity check that enabling the
// "iso8601" safelist entry does not break scanning of a timestamp.
func TestSafelist_ISO8601Span(t *testing.T) {
	// An ISO-8601 timestamp does not survive entropy tokenization as one
	// 20+-char token (':' splits it), so no entropy hit is expected here
	// in the first place.
	const stamp = "2026-05-22T10:30:00.123Z"

	scanner := NewScanner(4.5, true)
	scanner.SetSafelist([]string{"iso8601"})

	for _, match := range scanner.Scan(stamp) {
		if match.Pattern != "high_entropy" {
			continue
		}
		t.Errorf("ISO-8601 timestamp should not trip entropy: %+v", match)
	}
}
func TestSafelist_SecretAdjacentToUUIDStillRedacted(t *testing.T) {
// Regression guard: a real secret that happens to abut a UUID must
// not be swallowed by the UUID's safelist span.
s := NewScanner(loweredThreshold, true)
s.SetSafelist([]string{"uuid"})
uuid := "550e8400-e29b-41d4-a716-446655440000"
content := uuid + " " + secretToken
matches := s.Scan(content)
var foundSecret bool
for _, m := range matches {
if m.Pattern == "high_entropy" && content[m.Start:m.End] == secretToken {
foundSecret = true
}
}
if !foundSecret {
t.Errorf("secret adjacent to UUID was not detected; matches=%+v", matches)
}
}
// TestSafelist_KnownPatternNamesMatchPlan pins the set of names returned by
// defaultSafelistPatterns to the plan-locked list that the user-facing TOML
// knob accepts. Renaming any of them breaks user configs — bump with care.
func TestSafelist_KnownPatternNamesMatchPlan(t *testing.T) {
	planned := []string{"uuid", "sha_hex", "iso8601", "url"}
	patterns := defaultSafelistPatterns()

	if have, expect := len(patterns), len(planned); have != expect {
		t.Fatalf("default safelist size = %d, want %d", have, expect)
	}
	for _, name := range planned {
		_, present := patterns[name]
		if present {
			continue
		}
		t.Errorf("missing safelist pattern %q (have %v)", name, safelistKeys(patterns))
	}
}
// safelistKeys returns the keys of m in ascending order.
//
// Go randomises map iteration order, so the keys are sorted before being
// returned; this keeps the diagnostic output built from them identical from
// one test run to the next. An empty or nil map yields an empty, non-nil
// slice.
func safelistKeys[V any](m map[string]V) []string {
	out := make([]string, 0, len(m))
	for k := range m {
		out = append(out, k)
	}
	slices.Sort(out)
	return out
}
// TestFirewall_EntropySafelistEndToEnd checks that
// FirewallConfig.EntropySafelist reaches the scanner's runtime behaviour: a
// SHA-256 in tool output must survive an entropy-redacting firewall when
// "sha_hex" is safelisted, and must be redacted by an otherwise identical
// firewall that has no safelist.
func TestFirewall_EntropySafelistEndToEnd(t *testing.T) {
	const digest = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
	toolOutput := "commit " + digest + " landed"

	// Safelist enabled: the digest has to pass through untouched.
	safelisted := NewFirewall(FirewallConfig{
		ScanToolResults:   true,
		RedactHighEntropy: true,
		EntropyThreshold:  loweredThreshold,
		EntropySafelist:   []string{"sha_hex"},
	})
	passed := safelisted.ScanToolResult(toolOutput)
	if !strings.Contains(passed, digest) {
		t.Errorf("safelisted SHA-256 should pass through, got %q", passed)
	}

	// Same settings minus the safelist: the digest has to be gone.
	plain := NewFirewall(FirewallConfig{
		ScanToolResults:   true,
		RedactHighEntropy: true,
		EntropyThreshold:  loweredThreshold,
	})
	redacted := plain.ScanToolResult(toolOutput)
	if strings.Contains(redacted, digest) {
		t.Errorf("without safelist the SHA-256 should be redacted at threshold %.1f, got %q", loweredThreshold, redacted)
	}
}
// TestFirewall_UnknownSafelistNameWarns checks that NewFirewall emits a
// warning naming an unrecognised safelist entry. A typo like "uid" instead of
// "uuid" must surface as a Warn so the operator notices, rather than silently
// disabling FP reduction.
func TestFirewall_UnknownSafelistNameWarns(t *testing.T) {
	var buf bytes.Buffer
	logger := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn}))
	_ = NewFirewall(FirewallConfig{
		EntropySafelist: []string{"uuid", "uid"}, // "uid" is the typo
		Logger:          logger,
	})
	logs := buf.String()
	if !strings.Contains(logs, "unknown entropy safelist name") {
		t.Errorf("expected warning about unknown name, got logs: %q", logs)
	}
	// "uuid" contains "uid" as a substring, so a plain Contains(logs, "uid")
	// would also be satisfied by a log line that mentions only the known
	// name. Strip every "uuid" first so only a genuine "uid" can match.
	if !strings.Contains(strings.ReplaceAll(logs, "uuid", ""), "uid") {
		t.Errorf("warning should name the unknown entry, got logs: %q", logs)
	}
	// NOTE(review): this guard assumes the warning carries the offending
	// entry under a "name" attribute — confirm against NewFirewall.
	if strings.Contains(logs, "name=uuid ") || strings.Contains(logs, "name=uuid\n") {
		t.Errorf("known name 'uuid' should not be warned about, got logs: %q", logs)
	}
}
// TestFirewall_AllKnownSafelistNamesQuiet constructs a firewall with every
// canonical safelist name and requires that nothing is logged at Warn level
// or above. It guards against a future change that accidentally renames a
// default pattern.
func TestFirewall_AllKnownSafelistNamesQuiet(t *testing.T) {
	var sink bytes.Buffer
	warnOnly := &slog.HandlerOptions{Level: slog.LevelWarn}
	canonical := []string{"uuid", "sha_hex", "iso8601", "url"}

	_ = NewFirewall(FirewallConfig{
		EntropySafelist: canonical,
		Logger:          slog.New(slog.NewTextHandler(&sink, warnOnly)),
	})

	emitted := sink.String()
	if emitted != "" {
		t.Errorf("known safelist names should not warn, got: %q", emitted)
	}
}
// TestSafelist_SkipIsLogged verifies the per-pattern skip telemetry that
// F-2's go/no-go gate depends on: scanning a safelisted UUID must emit a
// Debug log line that carries the pattern name.
func TestSafelist_SkipIsLogged(t *testing.T) {
	const uuid = "550e8400-e29b-41d4-a716-446655440000"

	var sink bytes.Buffer
	debugOpts := &slog.HandlerOptions{Level: slog.LevelDebug}

	scanner := NewScanner(loweredThreshold, true)
	scanner.SetLogger(slog.New(slog.NewTextHandler(&sink, debugOpts)))
	scanner.SetSafelist([]string{"uuid"})

	// Only the log output matters here; the matches are discarded.
	_ = scanner.Scan(uuid)

	output := sink.String()
	if !strings.Contains(output, "entropy candidate skipped by safelist") {
		t.Errorf("expected debug log on skip, got: %q", output)
	}
	if !strings.Contains(output, "pattern=uuid") {
		t.Errorf("debug log should carry pattern name, got: %q", output)
	}
}
// TestSafelist_SecretTokenIsHighEntropy guards the fixture the other tests
// rely on: secretToken must be at least 20 characters long, score at least
// 4.5 under shannonEntropy, and contain none of the characters ' ', '.' or
// ':' that would split it during tokenization.
func TestSafelist_SecretTokenIsHighEntropy(t *testing.T) {
	if n := len(secretToken); n < 20 {
		t.Fatalf("secretToken too short: %d", n)
	}

	entropy := shannonEntropy(secretToken)
	if entropy < 4.5 {
		t.Fatalf("secretToken entropy = %.2f, want >= 4.5 (test corpus drift)", entropy)
	}

	// The token must be free of tokenizer split characters.
	const splitChars = " .:"
	if strings.ContainsAny(secretToken, splitChars) {
		t.Fatalf("secretToken contains a tokenizer split char")
	}
}