fix(slm,router): honest classifier diagnostics + 15s default timeout

Five fixes folded into one commit because they all answer the same
question: 'why does my router stats output lie to me?'

Issue 1 (timeout). Default classify timeout was 5s — too short for
cold-start ollama loads on small models. Bumped to 15s and surfaced
as [slm].classify_timeout (0 = built-in default). Empirically caught
when a user's reecdev/tiny3.5:1.5b hit 'stream error: context
deadline exceeded' on every single classify call.

Issue 2 (Warn-level error). The SLM-fallback path logged the
underlying error at Debug, invisible without --verbose. Promoted to
Warn so a first-time misconfiguration surfaces immediately. The
fallback itself is benign; the signal is that the SLM isn't doing
the work it was supposed to.

Issue 3 (stats hint). The hint hard-coded 'check that llamafile
boots' even when the user is on ollama. Replaced with
backend-templated advice read from cfg.SLM.Backend. Also
distinguishes three diagnostic cases that were collapsed before:
- SLM never called (zero attempts)
- SLM called N times but every call fell back (timeout/parse)
- SLM working but with a minority share (< 50%)

Issue 4 (effective heuristic share). The classifier breakdown
shows 'heuristic' and 'slm_fallback' as separate sources, but both
are routed through HeuristicClassifier — only the source tag differs.
A new line under 'total observations' surfaces the combined share
honestly: 'effective heuristic share: 100.0% (44 fallbacks + 10
pure heuristic)'.

Issue 5 (config schema). [slm].classify_timeout joins the existing
[slm] knobs alongside startup_timeout. Documented inline with the
cold-start-load rationale.
This commit is contained in:
2026-05-25 01:05:57 +02:00
parent fa65a68728
commit f3c70bd802
5 changed files with 69 additions and 28 deletions
+2 -2
View File
@@ -180,7 +180,7 @@ func main() {
case "slm":
os.Exit(runSLMCommand(cliArgs[1:], cfg, logger))
case "router":
os.Exit(runRouterCommand(cliArgs[1:], profile))
os.Exit(runRouterCommand(cliArgs[1:], cfg, profile))
case "profile":
os.Exit(runProfileCommand(cliArgs[1:], cfg, profile))
}
@@ -881,7 +881,7 @@ func main() {
// transport and as a router arm. Both paths route through the
// firewall after fwRef.Set fires above.
slmProvider := security.WrapProvider(boot.Provider, fwRef)
lazy.set(slm.NewClassifier(slmProvider, boot.Model, logger))
lazy.set(slm.NewClassifier(slmProvider, boot.Model, time.Duration(cfg.SLM.ClassifyTimeout), logger))
// ToolUse comes from the live probe of the actual model. For
// completion-only models (e.g. TinyLlama), the SLM arm only
// handles knowledge-only prompts where the trivial-prompt
+31 -8
View File
@@ -12,7 +12,7 @@ import (
)
// runRouterCommand handles `gnoma router <subcommand>`. Returns an exit code.
func runRouterCommand(args []string, profile gnomacfg.Profile) int {
func runRouterCommand(args []string, cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
if len(args) == 0 {
fmt.Fprintln(os.Stderr, "usage: gnoma router <command>")
fmt.Fprintln(os.Stderr, "commands:")
@@ -21,14 +21,14 @@ func runRouterCommand(args []string, profile gnomacfg.Profile) int {
}
switch args[0] {
case "stats":
return runRouterStats(profile)
return runRouterStats(cfg, profile)
default:
fmt.Fprintf(os.Stderr, "unknown router command: %s\n", args[0])
return 1
}
}
func runRouterStats(profile gnomacfg.Profile) int {
func runRouterStats(cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
path := profile.QualityFile(gnomacfg.GlobalConfigDir())
data, err := os.ReadFile(path)
if err != nil {
@@ -52,7 +52,7 @@ func runRouterStats(profile gnomacfg.Profile) int {
}
printArmTable(snap)
fmt.Println()
printClassifierTable(snap)
printClassifierTable(snap, cfg)
return 0
}
@@ -86,7 +86,7 @@ func printArmTable(snap router.QualitySnapshot) {
_ = tw.Flush()
}
func printClassifierTable(snap router.QualitySnapshot) {
func printClassifierTable(snap router.QualitySnapshot, cfg *gnomacfg.Config) {
fmt.Println("Classifier source breakdown:")
counts := snap.ClassifierCounts
if len(counts) == 0 {
@@ -125,16 +125,39 @@ func printClassifierTable(snap router.QualitySnapshot) {
_ = tw.Flush()
fmt.Printf(" total observations: %d\n", total)
// Phase-4 trust hint.
// Effective heuristic share: both pure heuristic and slm_fallback
// observations were routed via the HeuristicClassifier — the only
// difference is whether the SLM was attempted first. Surfacing the
// combined share answers "how often did the SLM actually drive
// routing?" honestly.
effectiveHeuristic := counts["heuristic"] + counts["slm_fallback"]
if total > 0 {
fmt.Printf(" effective heuristic share: %.1f%% (%d fallbacks + %d pure heuristic)\n",
float64(effectiveHeuristic)/float64(total)*100,
counts["slm_fallback"], counts["heuristic"])
}
// Phase-4 trust hint. Distinguishes the three diagnostic cases —
// SLM never called, SLM called but every call failed, SLM working
// but minority share — and templates the actionable advice off
// the configured backend so the hint doesn't mention llamafile
// when the user is on ollama (or vice versa).
slmShare := 0.0
if total > 0 {
slmShare = float64(counts["slm"]) / float64(total) * 100
}
backend := "the SLM"
if cfg != nil && cfg.SLM.Backend != "" {
backend = cfg.SLM.Backend
}
switch {
case total < 50:
fmt.Println(" hint: < 50 observations — too sparse for Phase 4 trust signal yet.")
case counts["slm"] == 0:
fmt.Println(" hint: SLM has never classified — check that llamafile boots before short-lived runs end.")
case counts["slm"] == 0 && counts["slm_fallback"] == 0:
fmt.Printf(" hint: SLM never called — check [slm].enabled and that %s is reachable.\n", backend)
case counts["slm"] == 0 && counts["slm_fallback"] > 0:
fmt.Printf(" hint: SLM was called %d times but every call fell back — run with `--verbose` to see the underlying error (likely a timeout or parse failure for %s).\n",
counts["slm_fallback"], backend)
case slmShare < 50:
fmt.Printf(" hint: SLM share is %.0f%% — fallback is doing most of the work.\n", slmShare)
}
+7
View File
@@ -48,6 +48,13 @@ type SLMSection struct {
DataDir string `toml:"data_dir"` // llamafile-only: where to put it (empty = XDG default)
ExpectedSHA256 string `toml:"expected_sha256"` // llamafile-only: verify hash if non-empty
StartupTimeout Duration `toml:"startup_timeout"` // llamafile-only: first-launch wait budget; 0 = default 5s
// ClassifyTimeout caps each task-classification call to the SLM.
// 0 here means "use the built-in default" (15s). Cold-start model
// loads + thinking-mode first-token latency can easily exceed 5s
// on smaller hardware, so the default is generous. Tune down to
// 2-3s on fast setups, or up to 30s for very slow ones.
ClassifyTimeout Duration `toml:"classify_timeout"`
}
// ArmConfig tunes routing for a single registered arm. Multiple [[arms]]
+18 -7
View File
@@ -14,10 +14,13 @@ import (
"somegit.dev/Owlibou/gnoma/internal/stream"
)
// defaultClassifyTimeout — 5 s accommodates thinking-mode models like
// Qwen3 distillations (Tiny3.5) that emit reasoning tokens before output.
// Non-thinking models complete in well under 1 s.
const defaultClassifyTimeout = 5 * time.Second
// defaultClassifyTimeout — 15 s accommodates cold-start model loads
// (ollama lazily loads on first call, ~2-8s for a 1.5B model on SSD)
// combined with thinking-mode first-token latency (Qwen3 distillations
// like Tiny3.5 sometimes emit <think> tokens before the JSON output
// even with /no_think). Non-thinking warm models complete in well
// under 1 s. Tune via [slm].classify_timeout in config.
const defaultClassifyTimeout = 15 * time.Second
const classifySystemPrompt = `Classify the following coding request. /no_think
Respond with JSON only, no other text, no reasoning, no thinking tags.
@@ -47,14 +50,18 @@ type Classifier struct {
// NewClassifier creates a Classifier. model is the model name passed to the provider
// (llamafile ignores it but openaicompat requires a non-empty value).
func NewClassifier(p provider.Provider, model string, logger *slog.Logger) *Classifier {
// Pass timeout <= 0 to use the built-in default (defaultClassifyTimeout).
func NewClassifier(p provider.Provider, model string, timeout time.Duration, logger *slog.Logger) *Classifier {
if logger == nil {
logger = slog.Default()
}
if timeout <= 0 {
timeout = defaultClassifyTimeout
}
return &Classifier{
provider: p,
model: model,
timeout: defaultClassifyTimeout,
timeout: timeout,
logger: logger,
}
}
@@ -68,7 +75,11 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess
resp, err := c.callSLM(tctx, prompt)
if err != nil {
c.logger.Debug("slm classify fallback", "error", err)
// Warn-level so a first-time misconfiguration (timeout too tight,
// wrong endpoint, malformed JSON from the model) surfaces without
// requiring --verbose. The fallback path itself is benign; the
// signal is that the SLM isn't doing the work it was supposed to.
c.logger.Warn("slm classify fallback", "error", err, "timeout", c.timeout)
t, ferr := router.HeuristicClassifier{}.Classify(ctx, prompt, history)
t.ClassifierSource = router.ClassifierSLMFallback
return t, ferr
+11 -11
View File
@@ -54,7 +54,7 @@ func TestClassifier_HappyPath(t *testing.T) {
// SLM complexity 0.55 stays above the Debug floor (0.4), so the SLM
// value is preserved verbatim.
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.55,"requires_tools":false}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
if err != nil {
@@ -76,7 +76,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
// bump ComplexityScore up to the floor so the SLM arm can't be picked
// for its own kind of misclassification.
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.25,"requires_tools":false}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
if err != nil {
@@ -91,7 +91,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
func TestClassifier_BlendHeuristic(t *testing.T) {
// SLM returns one type; other Task fields should come from heuristic.
p := &mockProvider{text: `{"task_type":"Boilerplate","complexity":0.1,"requires_tools":false}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "scaffold a new HTTP handler", nil)
if err != nil {
@@ -108,7 +108,7 @@ func TestClassifier_BlendHeuristic(t *testing.T) {
func TestClassifier_FallbackOnBadJSON(t *testing.T) {
p := &mockProvider{text: "I cannot classify that."}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
// Should not error — falls back to heuristic.
task, err := cls.Classify(context.Background(), "write unit tests for the parser", nil)
@@ -123,7 +123,7 @@ func TestClassifier_FallbackOnBadJSON(t *testing.T) {
func TestClassifier_FallbackOnProviderError(t *testing.T) {
p := &mockProvider{err: errors.New("connection refused")}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "explain how generics work", nil)
if err != nil {
@@ -137,7 +137,7 @@ func TestClassifier_FallbackOnProviderError(t *testing.T) {
func TestClassifier_FallbackOnTimeout(t *testing.T) {
p := &mockProvider{delay: 500 * time.Millisecond}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
cls.timeout = 50 * time.Millisecond // force timeout
task, err := cls.Classify(context.Background(), "debug the failing test", nil)
@@ -153,7 +153,7 @@ func TestClassifier_FallbackOnTimeout(t *testing.T) {
func TestClassifier_FenceStripping(t *testing.T) {
fenced := "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```"
p := &mockProvider{text: fenced}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "refactor the auth middleware", nil)
if err != nil {
@@ -166,7 +166,7 @@ func TestClassifier_FenceStripping(t *testing.T) {
func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
p := &mockProvider{text: `{"task_type":"FooBar","complexity":0.3,"requires_tools":false}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "implement a binary search function", nil)
if err != nil {
@@ -178,7 +178,7 @@ func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.3,"requires_tools":true}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
if err != nil {
t.Fatal(err)
@@ -190,7 +190,7 @@ func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
p := &mockProvider{err: errors.New("backend unreachable")}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
if err != nil {
t.Fatal(err)
@@ -202,7 +202,7 @@ func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
func TestClassifier_ContextPassedToHistory(t *testing.T) {
p := &mockProvider{text: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`}
cls := NewClassifier(p, "default", nil)
cls := NewClassifier(p, "default", 0, nil)
history := []message.Message{
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "prior"}}},