diff --git a/cmd/gnoma/main.go b/cmd/gnoma/main.go index 6747bca..9b522a3 100644 --- a/cmd/gnoma/main.go +++ b/cmd/gnoma/main.go @@ -180,7 +180,7 @@ func main() { case "slm": os.Exit(runSLMCommand(cliArgs[1:], cfg, logger)) case "router": - os.Exit(runRouterCommand(cliArgs[1:], profile)) + os.Exit(runRouterCommand(cliArgs[1:], cfg, profile)) case "profile": os.Exit(runProfileCommand(cliArgs[1:], cfg, profile)) } @@ -881,7 +881,7 @@ func main() { // transport and as a router arm. Both paths route through the // firewall after fwRef.Set fires above. slmProvider := security.WrapProvider(boot.Provider, fwRef) - lazy.set(slm.NewClassifier(slmProvider, boot.Model, logger)) + lazy.set(slm.NewClassifier(slmProvider, boot.Model, time.Duration(cfg.SLM.ClassifyTimeout), logger)) // ToolUse comes from the live probe of the actual model. For // completion-only models (e.g. TinyLlama), the SLM arm only // handles knowledge-only prompts where the trivial-prompt diff --git a/cmd/gnoma/router_cmd.go b/cmd/gnoma/router_cmd.go index f54dbec..4641275 100644 --- a/cmd/gnoma/router_cmd.go +++ b/cmd/gnoma/router_cmd.go @@ -12,7 +12,7 @@ import ( ) // runRouterCommand handles `gnoma router `. Returns an exit code. 
-func runRouterCommand(args []string, profile gnomacfg.Profile) int { +func runRouterCommand(args []string, cfg *gnomacfg.Config, profile gnomacfg.Profile) int { if len(args) == 0 { fmt.Fprintln(os.Stderr, "usage: gnoma router ") fmt.Fprintln(os.Stderr, "commands:") @@ -21,14 +21,14 @@ func runRouterCommand(args []string, profile gnomacfg.Profile) int { } switch args[0] { case "stats": - return runRouterStats(profile) + return runRouterStats(cfg, profile) default: fmt.Fprintf(os.Stderr, "unknown router command: %s\n", args[0]) return 1 } } -func runRouterStats(profile gnomacfg.Profile) int { +func runRouterStats(cfg *gnomacfg.Config, profile gnomacfg.Profile) int { path := profile.QualityFile(gnomacfg.GlobalConfigDir()) data, err := os.ReadFile(path) if err != nil { @@ -52,7 +52,7 @@ func runRouterStats(profile gnomacfg.Profile) int { } printArmTable(snap) fmt.Println() - printClassifierTable(snap) + printClassifierTable(snap, cfg) return 0 } @@ -86,7 +86,7 @@ func printArmTable(snap router.QualitySnapshot) { _ = tw.Flush() } -func printClassifierTable(snap router.QualitySnapshot) { +func printClassifierTable(snap router.QualitySnapshot, cfg *gnomacfg.Config) { fmt.Println("Classifier source breakdown:") counts := snap.ClassifierCounts if len(counts) == 0 { @@ -125,16 +125,39 @@ func printClassifierTable(snap router.QualitySnapshot) { _ = tw.Flush() fmt.Printf(" total observations: %d\n", total) - // Phase-4 trust hint. + // Effective heuristic share: both pure heuristic and slm_fallback + // observations were routed via the HeuristicClassifier — the only + // difference is whether the SLM was attempted first. Surfacing the + // combined share answers "how often did the SLM actually drive + // routing?" honestly. 
+ effectiveHeuristic := counts["heuristic"] + counts["slm_fallback"] + if total > 0 { + fmt.Printf(" effective heuristic share: %.1f%% (%d fallbacks + %d pure heuristic)\n", + float64(effectiveHeuristic)/float64(total)*100, + counts["slm_fallback"], counts["heuristic"]) + } + + // Phase-4 trust hint. Distinguishes the three diagnostic cases — + // SLM never called, SLM called but every call failed, SLM working + // but minority share — and templates the actionable advice off + // the configured backend so the hint doesn't mention llamafile + // when the user is on ollama (or vice versa). slmShare := 0.0 if total > 0 { slmShare = float64(counts["slm"]) / float64(total) * 100 } + backend := "the SLM" + if cfg != nil && cfg.SLM.Backend != "" { + backend = cfg.SLM.Backend + } switch { case total < 50: fmt.Println(" hint: < 50 observations — too sparse for Phase 4 trust signal yet.") - case counts["slm"] == 0: - fmt.Println(" hint: SLM has never classified — check that llamafile boots before short-lived runs end.") + case counts["slm"] == 0 && counts["slm_fallback"] == 0: + fmt.Printf(" hint: SLM never called — check [slm].enabled and that %s is reachable.\n", backend) + case counts["slm"] == 0 && counts["slm_fallback"] > 0: + fmt.Printf(" hint: SLM was called %d times but every call fell back — run with `--verbose` to see the underlying error (likely a timeout or parse failure for %s).\n", + counts["slm_fallback"], backend) case slmShare < 50: fmt.Printf(" hint: SLM share is %.0f%% — fallback is doing most of the work.\n", slmShare) } diff --git a/internal/config/config.go b/internal/config/config.go index 2c7ed48..91e065e 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -48,6 +48,13 @@ type SLMSection struct { DataDir string `toml:"data_dir"` // llamafile-only: where to put it (empty = XDG default) ExpectedSHA256 string `toml:"expected_sha256"` // llamafile-only: verify hash if non-empty StartupTimeout Duration `toml:"startup_timeout"` // 
llamafile-only: first-launch wait budget; 0 = default 5s + + // ClassifyTimeout caps each task-classification call to the SLM. + // 0 here means "use the built-in default" (15s). Cold-start model + // loads + thinking-mode first-token latency can easily exceed 5s + // on smaller hardware, so the default is generous. Tune down to + // 2-3s on fast setups, or up to 30s for very slow ones. + ClassifyTimeout Duration `toml:"classify_timeout"` } // ArmConfig tunes routing for a single registered arm. Multiple [[arms]] diff --git a/internal/slm/classifier.go b/internal/slm/classifier.go index 987c0ec..086a803 100644 --- a/internal/slm/classifier.go +++ b/internal/slm/classifier.go @@ -14,10 +14,13 @@ import ( "somegit.dev/Owlibou/gnoma/internal/stream" ) -// defaultClassifyTimeout — 5 s accommodates thinking-mode models like -// Qwen3 distillations (Tiny3.5) that emit reasoning tokens before output. -// Non-thinking models complete in well under 1 s. -const defaultClassifyTimeout = 5 * time.Second +// defaultClassifyTimeout — 15 s accommodates cold-start model loads +// (ollama lazily loads on first call, ~2-8s for a 1.5B model on SSD) +// combined with thinking-mode first-token latency (Qwen3 distillations +// like Tiny3.5 sometimes emit reasoning tokens before the JSON output +// even with /no_think). Non-thinking warm models complete in well +// under 1 s. Tune via [slm].classify_timeout in config. +const defaultClassifyTimeout = 15 * time.Second const classifySystemPrompt = `Classify the following coding request. /no_think Respond with JSON only, no other text, no reasoning, no thinking tags. @@ -47,14 +50,18 @@ type Classifier struct { // NewClassifier creates a Classifier. model is the model name passed to the provider // (llamafile ignores it but openaicompat requires a non-empty value). -func NewClassifier(p provider.Provider, model string, logger *slog.Logger) *Classifier { +// Pass timeout<=0 to use the built-in default (defaultClassifyTimeout). 
+func NewClassifier(p provider.Provider, model string, timeout time.Duration, logger *slog.Logger) *Classifier { if logger == nil { logger = slog.Default() } + if timeout <= 0 { + timeout = defaultClassifyTimeout + } return &Classifier{ provider: p, model: model, - timeout: defaultClassifyTimeout, + timeout: timeout, logger: logger, } } @@ -68,7 +75,11 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess resp, err := c.callSLM(tctx, prompt) if err != nil { - c.logger.Debug("slm classify fallback", "error", err) + // Warn-level so a first-time misconfiguration (timeout too tight, + // wrong endpoint, malformed JSON from the model) surfaces without + // requiring --verbose. The fallback path itself is benign; the + // signal is that the SLM isn't doing the work it was supposed to. + c.logger.Warn("slm classify fallback", "error", err, "timeout", c.timeout) t, ferr := router.HeuristicClassifier{}.Classify(ctx, prompt, history) t.ClassifierSource = router.ClassifierSLMFallback return t, ferr diff --git a/internal/slm/classifier_test.go b/internal/slm/classifier_test.go index 1f7cc20..4311dde 100644 --- a/internal/slm/classifier_test.go +++ b/internal/slm/classifier_test.go @@ -54,7 +54,7 @@ func TestClassifier_HappyPath(t *testing.T) { // SLM complexity 0.55 stays above the Debug floor (0.4), so the SLM // value is preserved verbatim. p := &mockProvider{text: `{"task_type":"Debug","complexity":0.55,"requires_tools":false}`} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "fix the failing test", nil) if err != nil { @@ -76,7 +76,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) { // bump ComplexityScore up to the floor so the SLM arm can't be picked // for its own kind of misclassification. 
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.25,"requires_tools":false}`} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "fix the failing test", nil) if err != nil { @@ -91,7 +91,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) { func TestClassifier_BlendHeuristic(t *testing.T) { // SLM returns one type; other Task fields should come from heuristic. p := &mockProvider{text: `{"task_type":"Boilerplate","complexity":0.1,"requires_tools":false}`} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "scaffold a new HTTP handler", nil) if err != nil { @@ -108,7 +108,7 @@ func TestClassifier_BlendHeuristic(t *testing.T) { func TestClassifier_FallbackOnBadJSON(t *testing.T) { p := &mockProvider{text: "I cannot classify that."} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) // Should not error — falls back to heuristic. 
task, err := cls.Classify(context.Background(), "write unit tests for the parser", nil) @@ -123,7 +123,7 @@ func TestClassifier_FallbackOnBadJSON(t *testing.T) { func TestClassifier_FallbackOnProviderError(t *testing.T) { p := &mockProvider{err: errors.New("connection refused")} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "explain how generics work", nil) if err != nil { @@ -137,7 +137,7 @@ func TestClassifier_FallbackOnProviderError(t *testing.T) { func TestClassifier_FallbackOnTimeout(t *testing.T) { p := &mockProvider{delay: 500 * time.Millisecond} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) cls.timeout = 50 * time.Millisecond // force timeout task, err := cls.Classify(context.Background(), "debug the failing test", nil) @@ -153,7 +153,7 @@ func TestClassifier_FallbackOnTimeout(t *testing.T) { func TestClassifier_FenceStripping(t *testing.T) { fenced := "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```" p := &mockProvider{text: fenced} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "refactor the auth middleware", nil) if err != nil { @@ -166,7 +166,7 @@ func TestClassifier_FenceStripping(t *testing.T) { func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) { p := &mockProvider{text: `{"task_type":"FooBar","complexity":0.3,"requires_tools":false}`} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "implement a binary search function", nil) if err != nil { @@ -178,7 +178,7 @@ func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) { func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) { p := &mockProvider{text: `{"task_type":"Debug","complexity":0.3,"requires_tools":true}`} - cls := 
NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "fix the failing test", nil) if err != nil { t.Fatal(err) @@ -190,7 +190,7 @@ func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) { func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) { p := &mockProvider{err: errors.New("backend unreachable")} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) task, err := cls.Classify(context.Background(), "fix the failing test", nil) if err != nil { t.Fatal(err) @@ -202,7 +202,7 @@ func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) { func TestClassifier_ContextPassedToHistory(t *testing.T) { p := &mockProvider{text: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`} - cls := NewClassifier(p, "default", nil) + cls := NewClassifier(p, "default", 0, nil) history := []message.Message{ {Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "prior"}}},