fix(slm,router): honest classifier diagnostics + 15s default timeout

Five fixes folded into one commit because they all answer the same question: 'why does my router stats output lie to me?' Issue 1 (timeout). Default classify timeout was 5s — too short for cold-start ollama loads on small models. Bumped to 15s and surfaced as [slm].classify_timeout (0 = built-in default). Empirically caught when a user's reecdev/tiny3.5:1.5b hit 'stream error: context deadline exceeded' on every single classify call. Issue 2 (Warn-level error). The SLM-fallback path logged the underlying error at Debug, invisible without --verbose. Promoted to Warn so a first-time misconfiguration surfaces immediately. The fallback itself is benign; the signal is that the SLM isn't doing the work it was supposed to. Issue 3 (stats hint). The hint was hard-coded to 'check that llamafile boots' even when the user is on ollama. Replaced with backend-templated advice read from cfg.SLM.Backend. Also distinguishes three diagnostic cases that were collapsed before: (a) SLM never called (zero attempts); (b) SLM called N times but every call fell back (timeout/parse); (c) SLM working but minority share. Issue 4 (effective heuristic share). The classifier breakdown shows 'heuristic' and 'slm_fallback' as separate sources, but both routed through HeuristicClassifier — only the source tag differs. New line under 'total observations' surfaces the combined share honestly: 'effective heuristic share: 100.0% (44 fallbacks + 10 pure heuristic)'. Issue 5 (config schema). [slm].classify_timeout joins the existing [slm] knobs alongside startup_timeout. Documented inline with the cold-start-load rationale.
2026-05-25 01:05:57 +02:00
parent fa65a68728
commit f3c70bd802
5 changed files with 69 additions and 28 deletions
@@ -180,7 +180,7 @@ func main() {
 		case "slm":
 			os.Exit(runSLMCommand(cliArgs[1:], cfg, logger))
 		case "router":
-			os.Exit(runRouterCommand(cliArgs[1:], profile))
+			os.Exit(runRouterCommand(cliArgs[1:], cfg, profile))
 		case "profile":
 			os.Exit(runProfileCommand(cliArgs[1:], cfg, profile))
 		}
@@ -881,7 +881,7 @@ func main() {
 			// transport and as a router arm. Both paths route through the
 			// firewall after fwRef.Set fires above.
 			slmProvider := security.WrapProvider(boot.Provider, fwRef)
-			lazy.set(slm.NewClassifier(slmProvider, boot.Model, logger))
+			lazy.set(slm.NewClassifier(slmProvider, boot.Model, time.Duration(cfg.SLM.ClassifyTimeout), logger))
 			// ToolUse comes from the live probe of the actual model. For
 			// completion-only models (e.g. TinyLlama), the SLM arm only
 			// handles knowledge-only prompts where the trivial-prompt
@@ -12,7 +12,7 @@ import (
 )

 // runRouterCommand handles `gnoma router <subcommand>`. Returns an exit code.
-func runRouterCommand(args []string, profile gnomacfg.Profile) int {
+func runRouterCommand(args []string, cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
 	if len(args) == 0 {
 		fmt.Fprintln(os.Stderr, "usage: gnoma router <command>")
 		fmt.Fprintln(os.Stderr, "commands:")
@@ -21,14 +21,14 @@ func runRouterCommand(args []string, profile gnomacfg.Profile) int {
 	}
 	switch args[0] {
 	case "stats":
-		return runRouterStats(profile)
+		return runRouterStats(cfg, profile)
 	default:
 		fmt.Fprintf(os.Stderr, "unknown router command: %s\n", args[0])
 		return 1
 	}
 }

-func runRouterStats(profile gnomacfg.Profile) int {
+func runRouterStats(cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
 	path := profile.QualityFile(gnomacfg.GlobalConfigDir())
 	data, err := os.ReadFile(path)
 	if err != nil {
@@ -52,7 +52,7 @@ func runRouterStats(profile gnomacfg.Profile) int {
 	}
 	printArmTable(snap)
 	fmt.Println()
-	printClassifierTable(snap)
+	printClassifierTable(snap, cfg)
 	return 0
 }

@@ -86,7 +86,7 @@ func printArmTable(snap router.QualitySnapshot) {
 	_ = tw.Flush()
 }

-func printClassifierTable(snap router.QualitySnapshot) {
+func printClassifierTable(snap router.QualitySnapshot, cfg *gnomacfg.Config) {
 	fmt.Println("Classifier source breakdown:")
 	counts := snap.ClassifierCounts
 	if len(counts) == 0 {
@@ -125,16 +125,39 @@ func printClassifierTable(snap router.QualitySnapshot) {
 	_ = tw.Flush()
 	fmt.Printf("  total observations: %d\n", total)

-	// Phase-4 trust hint.
+	// Effective heuristic share: both pure heuristic and slm_fallback
+	// observations were routed via the HeuristicClassifier — the only
+	// difference is whether the SLM was attempted first. Surfacing the
+	// combined share answers "how often did the SLM actually drive
+	// routing?" honestly.
+	effectiveHeuristic := counts["heuristic"] + counts["slm_fallback"]
+	if total > 0 {
+		fmt.Printf("  effective heuristic share: %.1f%% (%d fallbacks + %d pure heuristic)\n",
+			float64(effectiveHeuristic)/float64(total)*100,
+			counts["slm_fallback"], counts["heuristic"])
+	}
+
+	// Phase-4 trust hint. Distinguishes the three diagnostic cases —
+	// SLM never called, SLM called but every call failed, SLM working
+	// but minority share — and templates the actionable advice off
+	// the configured backend so the hint doesn't mention llamafile
+	// when the user is on ollama (or vice versa).
 	slmShare := 0.0
 	if total > 0 {
 		slmShare = float64(counts["slm"]) / float64(total) * 100
 	}
+	backend := "the SLM"
+	if cfg != nil && cfg.SLM.Backend != "" {
+		backend = cfg.SLM.Backend
+	}
 	switch {
 	case total < 50:
 		fmt.Println("  hint: < 50 observations — too sparse for Phase 4 trust signal yet.")
-	case counts["slm"] == 0:
-		fmt.Println("  hint: SLM has never classified — check that llamafile boots before short-lived runs end.")
+	case counts["slm"] == 0 && counts["slm_fallback"] == 0:
+		fmt.Printf("  hint: SLM never called — check [slm].enabled and that %s is reachable.\n", backend)
+	case counts["slm"] == 0 && counts["slm_fallback"] > 0:
+		fmt.Printf("  hint: SLM was called %d times but every call fell back — run with `--verbose` to see the underlying error (likely a timeout or parse failure for %s).\n",
+			counts["slm_fallback"], backend)
 	case slmShare < 50:
 		fmt.Printf("  hint: SLM share is %.0f%% — fallback is doing most of the work.\n", slmShare)
 	}
@@ -48,6 +48,13 @@ type SLMSection struct {
 	DataDir        string   `toml:"data_dir"`        // llamafile-only: where to put it (empty = XDG default)
 	ExpectedSHA256 string   `toml:"expected_sha256"` // llamafile-only: verify hash if non-empty
 	StartupTimeout Duration `toml:"startup_timeout"` // llamafile-only: first-launch wait budget; 0 = default 5s
+
+	// ClassifyTimeout caps each task-classification call to the SLM.
+	// 0 here means "use the built-in default" (15s). Cold-start model
+	// loads + thinking-mode first-token latency can easily exceed 5s
+	// on smaller hardware, so the default is generous. Tune down to
+	// 2-3s on fast setups, or up to 30s for very slow ones.
+	ClassifyTimeout Duration `toml:"classify_timeout"`
 }

 // ArmConfig tunes routing for a single registered arm. Multiple [[arms]]
@@ -14,10 +14,13 @@ import (
 	"somegit.dev/Owlibou/gnoma/internal/stream"
 )

-// defaultClassifyTimeout — 5 s accommodates thinking-mode models like
-// Qwen3 distillations (Tiny3.5) that emit reasoning tokens before output.
-// Non-thinking models complete in well under 1 s.
-const defaultClassifyTimeout = 5 * time.Second
+// defaultClassifyTimeout — 15 s accommodates cold-start model loads
+// (ollama lazily loads on first call, ~2-8s for a 1.5B model on SSD)
+// combined with thinking-mode first-token latency (Qwen3 distillations
+// like Tiny3.5 sometimes emit <think> tokens before the JSON output
+// even with /no_think). Non-thinking warm models complete in well
+// under 1 s. Tune via [slm].classify_timeout in config.
+const defaultClassifyTimeout = 15 * time.Second

 const classifySystemPrompt = `Classify the following coding request. /no_think
 Respond with JSON only, no other text, no reasoning, no thinking tags.
@@ -47,14 +50,18 @@ type Classifier struct {

 // NewClassifier creates a Classifier. model is the model name passed to the provider
 // (llamafile ignores it but openaicompat requires a non-empty value).
-func NewClassifier(p provider.Provider, model string, logger *slog.Logger) *Classifier {
+// Pass timeout <= 0 to use the built-in default (defaultClassifyTimeout).
+func NewClassifier(p provider.Provider, model string, timeout time.Duration, logger *slog.Logger) *Classifier {
 	if logger == nil {
 		logger = slog.Default()
 	}
+	if timeout <= 0 {
+		timeout = defaultClassifyTimeout
+	}
 	return &Classifier{
 		provider: p,
 		model:    model,
-		timeout:  defaultClassifyTimeout,
+		timeout:  timeout,
 		logger:   logger,
 	}
 }
@@ -68,7 +75,11 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess

 	resp, err := c.callSLM(tctx, prompt)
 	if err != nil {
-		c.logger.Debug("slm classify fallback", "error", err)
+		// Warn-level so a first-time misconfiguration (timeout too tight,
+		// wrong endpoint, malformed JSON from the model) surfaces without
+		// requiring --verbose. The fallback path itself is benign; the
+		// signal is that the SLM isn't doing the work it was supposed to.
+		c.logger.Warn("slm classify fallback", "error", err, "timeout", c.timeout)
 		t, ferr := router.HeuristicClassifier{}.Classify(ctx, prompt, history)
 		t.ClassifierSource = router.ClassifierSLMFallback
 		return t, ferr
@@ -54,7 +54,7 @@ func TestClassifier_HappyPath(t *testing.T) {
 	// SLM complexity 0.55 stays above the Debug floor (0.4), so the SLM
 	// value is preserved verbatim.
 	p := &mockProvider{text: `{"task_type":"Debug","complexity":0.55,"requires_tools":false}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "fix the failing test", nil)
 	if err != nil {
@@ -76,7 +76,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
 	// bump ComplexityScore up to the floor so the SLM arm can't be picked
 	// for its own kind of misclassification.
 	p := &mockProvider{text: `{"task_type":"Debug","complexity":0.25,"requires_tools":false}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "fix the failing test", nil)
 	if err != nil {
@@ -91,7 +91,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
 func TestClassifier_BlendHeuristic(t *testing.T) {
 	// SLM returns one type; other Task fields should come from heuristic.
 	p := &mockProvider{text: `{"task_type":"Boilerplate","complexity":0.1,"requires_tools":false}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "scaffold a new HTTP handler", nil)
 	if err != nil {
@@ -108,7 +108,7 @@ func TestClassifier_BlendHeuristic(t *testing.T) {

 func TestClassifier_FallbackOnBadJSON(t *testing.T) {
 	p := &mockProvider{text: "I cannot classify that."}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	// Should not error — falls back to heuristic.
 	task, err := cls.Classify(context.Background(), "write unit tests for the parser", nil)
@@ -123,7 +123,7 @@ func TestClassifier_FallbackOnBadJSON(t *testing.T) {

 func TestClassifier_FallbackOnProviderError(t *testing.T) {
 	p := &mockProvider{err: errors.New("connection refused")}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "explain how generics work", nil)
 	if err != nil {
@@ -137,7 +137,7 @@ func TestClassifier_FallbackOnProviderError(t *testing.T) {

 func TestClassifier_FallbackOnTimeout(t *testing.T) {
 	p := &mockProvider{delay: 500 * time.Millisecond}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)
 	cls.timeout = 50 * time.Millisecond // force timeout

 	task, err := cls.Classify(context.Background(), "debug the failing test", nil)
@@ -153,7 +153,7 @@ func TestClassifier_FallbackOnTimeout(t *testing.T) {
 func TestClassifier_FenceStripping(t *testing.T) {
 	fenced := "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```"
 	p := &mockProvider{text: fenced}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "refactor the auth middleware", nil)
 	if err != nil {
@@ -166,7 +166,7 @@ func TestClassifier_FenceStripping(t *testing.T) {

 func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
 	p := &mockProvider{text: `{"task_type":"FooBar","complexity":0.3,"requires_tools":false}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	task, err := cls.Classify(context.Background(), "implement a binary search function", nil)
 	if err != nil {
@@ -178,7 +178,7 @@ func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {

 func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
 	p := &mockProvider{text: `{"task_type":"Debug","complexity":0.3,"requires_tools":true}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)
 	task, err := cls.Classify(context.Background(), "fix the failing test", nil)
 	if err != nil {
 		t.Fatal(err)
@@ -190,7 +190,7 @@ func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {

 func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
 	p := &mockProvider{err: errors.New("backend unreachable")}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)
 	task, err := cls.Classify(context.Background(), "fix the failing test", nil)
 	if err != nil {
 		t.Fatal(err)
@@ -202,7 +202,7 @@ func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {

 func TestClassifier_ContextPassedToHistory(t *testing.T) {
 	p := &mockProvider{text: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`}
-	cls := NewClassifier(p, "default", nil)
+	cls := NewClassifier(p, "default", 0, nil)

 	history := []message.Message{
 		{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "prior"}}},