fix(slm,router): honest classifier diagnostics + 15s default timeout
Five fixes folded into one commit because they all answer the same question: 'why does my router stats output lie to me?'

Issue 1 (timeout). The default classify timeout was 5s — too short for cold-start ollama loads on small models. Bumped to 15s and surfaced as [slm].classify_timeout (0 = built-in default). Caught empirically when a user's reecdev/tiny3.5:1.5b model hit 'stream error: context deadline exceeded' on every single classify call.

Issue 2 (Warn-level error). The SLM-fallback path logged the underlying error at Debug, which is invisible without --verbose. Promoted to Warn so a first-time misconfiguration surfaces immediately. The fallback itself is benign; the signal is that the SLM isn't doing the work it was supposed to.

Issue 3 (stats hint). The hint was hard-coded to 'check that llamafile boots' even when the user is on ollama. Replaced with backend-templated advice read from cfg.SLM.Backend. Also distinguishes three diagnostic cases that were collapsed before:
- SLM never called (zero attempts)
- SLM called N times but every call fell back (timeout/parse)
- SLM working but minority share

Issue 4 (effective heuristic share). The classifier breakdown shows 'heuristic' and 'slm_fallback' as separate sources, but both are routed through HeuristicClassifier — only the source tag differs. A new line under 'total observations' surfaces the combined share honestly: 'effective heuristic share: 100.0% (44 fallbacks + 10 pure heuristic)'.

Issue 5 (config schema). [slm].classify_timeout joins the existing [slm] knobs alongside startup_timeout. Documented inline with the cold-start-load rationale.
This commit is contained in:
+2
-2
@@ -180,7 +180,7 @@ func main() {
|
||||
case "slm":
|
||||
os.Exit(runSLMCommand(cliArgs[1:], cfg, logger))
|
||||
case "router":
|
||||
os.Exit(runRouterCommand(cliArgs[1:], profile))
|
||||
os.Exit(runRouterCommand(cliArgs[1:], cfg, profile))
|
||||
case "profile":
|
||||
os.Exit(runProfileCommand(cliArgs[1:], cfg, profile))
|
||||
}
|
||||
@@ -881,7 +881,7 @@ func main() {
|
||||
// transport and as a router arm. Both paths route through the
|
||||
// firewall after fwRef.Set fires above.
|
||||
slmProvider := security.WrapProvider(boot.Provider, fwRef)
|
||||
lazy.set(slm.NewClassifier(slmProvider, boot.Model, logger))
|
||||
lazy.set(slm.NewClassifier(slmProvider, boot.Model, time.Duration(cfg.SLM.ClassifyTimeout), logger))
|
||||
// ToolUse comes from the live probe of the actual model. For
|
||||
// completion-only models (e.g. TinyLlama), the SLM arm only
|
||||
// handles knowledge-only prompts where the trivial-prompt
|
||||
|
||||
+31
-8
@@ -12,7 +12,7 @@ import (
|
||||
)
|
||||
|
||||
// runRouterCommand handles `gnoma router <subcommand>`. Returns an exit code.
|
||||
func runRouterCommand(args []string, profile gnomacfg.Profile) int {
|
||||
func runRouterCommand(args []string, cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(os.Stderr, "usage: gnoma router <command>")
|
||||
fmt.Fprintln(os.Stderr, "commands:")
|
||||
@@ -21,14 +21,14 @@ func runRouterCommand(args []string, profile gnomacfg.Profile) int {
|
||||
}
|
||||
switch args[0] {
|
||||
case "stats":
|
||||
return runRouterStats(profile)
|
||||
return runRouterStats(cfg, profile)
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "unknown router command: %s\n", args[0])
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
func runRouterStats(profile gnomacfg.Profile) int {
|
||||
func runRouterStats(cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
|
||||
path := profile.QualityFile(gnomacfg.GlobalConfigDir())
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -52,7 +52,7 @@ func runRouterStats(profile gnomacfg.Profile) int {
|
||||
}
|
||||
printArmTable(snap)
|
||||
fmt.Println()
|
||||
printClassifierTable(snap)
|
||||
printClassifierTable(snap, cfg)
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ func printArmTable(snap router.QualitySnapshot) {
|
||||
_ = tw.Flush()
|
||||
}
|
||||
|
||||
func printClassifierTable(snap router.QualitySnapshot) {
|
||||
func printClassifierTable(snap router.QualitySnapshot, cfg *gnomacfg.Config) {
|
||||
fmt.Println("Classifier source breakdown:")
|
||||
counts := snap.ClassifierCounts
|
||||
if len(counts) == 0 {
|
||||
@@ -125,16 +125,39 @@ func printClassifierTable(snap router.QualitySnapshot) {
|
||||
_ = tw.Flush()
|
||||
fmt.Printf(" total observations: %d\n", total)
|
||||
|
||||
// Phase-4 trust hint.
|
||||
// Effective heuristic share: both pure heuristic and slm_fallback
|
||||
// observations were routed via the HeuristicClassifier — the only
|
||||
// difference is whether the SLM was attempted first. Surfacing the
|
||||
// combined share answers "how often did the SLM actually drive
|
||||
// routing?" honestly.
|
||||
effectiveHeuristic := counts["heuristic"] + counts["slm_fallback"]
|
||||
if total > 0 {
|
||||
fmt.Printf(" effective heuristic share: %.1f%% (%d fallbacks + %d pure heuristic)\n",
|
||||
float64(effectiveHeuristic)/float64(total)*100,
|
||||
counts["slm_fallback"], counts["heuristic"])
|
||||
}
|
||||
|
||||
// Phase-4 trust hint. Distinguishes the three diagnostic cases —
|
||||
// SLM never called, SLM called but every call failed, SLM working
|
||||
// but minority share — and templates the actionable advice off
|
||||
// the configured backend so the hint doesn't mention llamafile
|
||||
// when the user is on ollama (or vice versa).
|
||||
slmShare := 0.0
|
||||
if total > 0 {
|
||||
slmShare = float64(counts["slm"]) / float64(total) * 100
|
||||
}
|
||||
backend := "the SLM"
|
||||
if cfg != nil && cfg.SLM.Backend != "" {
|
||||
backend = cfg.SLM.Backend
|
||||
}
|
||||
switch {
|
||||
case total < 50:
|
||||
fmt.Println(" hint: < 50 observations — too sparse for Phase 4 trust signal yet.")
|
||||
case counts["slm"] == 0:
|
||||
fmt.Println(" hint: SLM has never classified — check that llamafile boots before short-lived runs end.")
|
||||
case counts["slm"] == 0 && counts["slm_fallback"] == 0:
|
||||
fmt.Printf(" hint: SLM never called — check [slm].enabled and that %s is reachable.\n", backend)
|
||||
case counts["slm"] == 0 && counts["slm_fallback"] > 0:
|
||||
fmt.Printf(" hint: SLM was called %d times but every call fell back — run with `--verbose` to see the underlying error (likely a timeout or parse failure for %s).\n",
|
||||
counts["slm_fallback"], backend)
|
||||
case slmShare < 50:
|
||||
fmt.Printf(" hint: SLM share is %.0f%% — fallback is doing most of the work.\n", slmShare)
|
||||
}
|
||||
|
||||
@@ -48,6 +48,13 @@ type SLMSection struct {
|
||||
DataDir string `toml:"data_dir"` // llamafile-only: where to put it (empty = XDG default)
|
||||
ExpectedSHA256 string `toml:"expected_sha256"` // llamafile-only: verify hash if non-empty
|
||||
StartupTimeout Duration `toml:"startup_timeout"` // llamafile-only: first-launch wait budget; 0 = default 5s
|
||||
|
||||
// ClassifyTimeout caps each task-classification call to the SLM.
|
||||
// 0 here means "use the built-in default" (15s). Cold-start model
|
||||
// loads + thinking-mode first-token latency can easily exceed 5s
|
||||
// on smaller hardware, so the default is generous. Tune down to
|
||||
// 2-3s on fast setups, or up to 30s for very slow ones.
|
||||
ClassifyTimeout Duration `toml:"classify_timeout"`
|
||||
}
|
||||
|
||||
// ArmConfig tunes routing for a single registered arm. Multiple [[arms]]
|
||||
|
||||
@@ -14,10 +14,13 @@ import (
|
||||
"somegit.dev/Owlibou/gnoma/internal/stream"
|
||||
)
|
||||
|
||||
// defaultClassifyTimeout — 5 s accommodates thinking-mode models like
|
||||
// Qwen3 distillations (Tiny3.5) that emit reasoning tokens before output.
|
||||
// Non-thinking models complete in well under 1 s.
|
||||
const defaultClassifyTimeout = 5 * time.Second
|
||||
// defaultClassifyTimeout — 15 s accommodates cold-start model loads
|
||||
// (ollama lazily loads on first call, ~2-8s for a 1.5B model on SSD)
|
||||
// combined with thinking-mode first-token latency (Qwen3 distillations
|
||||
// like Tiny3.5 sometimes emit <think> tokens before the JSON output
|
||||
// even with /no_think). Non-thinking warm models complete in well
|
||||
// under 1 s. Tune via [slm].classify_timeout in config.
|
||||
const defaultClassifyTimeout = 15 * time.Second
|
||||
|
||||
const classifySystemPrompt = `Classify the following coding request. /no_think
|
||||
Respond with JSON only, no other text, no reasoning, no thinking tags.
|
||||
@@ -47,14 +50,18 @@ type Classifier struct {
|
||||
|
||||
// NewClassifier creates a Classifier. model is the model name passed to the provider
|
||||
// (llamafile ignores it but openaicompat requires a non-empty value).
|
||||
func NewClassifier(p provider.Provider, model string, logger *slog.Logger) *Classifier {
|
||||
// Pass timeout <= 0 to use the built-in default (defaultClassifyTimeout).
|
||||
func NewClassifier(p provider.Provider, model string, timeout time.Duration, logger *slog.Logger) *Classifier {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = defaultClassifyTimeout
|
||||
}
|
||||
return &Classifier{
|
||||
provider: p,
|
||||
model: model,
|
||||
timeout: defaultClassifyTimeout,
|
||||
timeout: timeout,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
@@ -68,7 +75,11 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess
|
||||
|
||||
resp, err := c.callSLM(tctx, prompt)
|
||||
if err != nil {
|
||||
c.logger.Debug("slm classify fallback", "error", err)
|
||||
// Warn-level so a first-time misconfiguration (timeout too tight,
|
||||
// wrong endpoint, malformed JSON from the model) surfaces without
|
||||
// requiring --verbose. The fallback path itself is benign; the
|
||||
// signal is that the SLM isn't doing the work it was supposed to.
|
||||
c.logger.Warn("slm classify fallback", "error", err, "timeout", c.timeout)
|
||||
t, ferr := router.HeuristicClassifier{}.Classify(ctx, prompt, history)
|
||||
t.ClassifierSource = router.ClassifierSLMFallback
|
||||
return t, ferr
|
||||
|
||||
@@ -54,7 +54,7 @@ func TestClassifier_HappyPath(t *testing.T) {
|
||||
// SLM complexity 0.55 stays above the Debug floor (0.4), so the SLM
|
||||
// value is preserved verbatim.
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.55,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
@@ -76,7 +76,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
|
||||
// bump ComplexityScore up to the floor so the SLM arm can't be picked
|
||||
// for its own kind of misclassification.
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.25,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
@@ -91,7 +91,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
|
||||
func TestClassifier_BlendHeuristic(t *testing.T) {
|
||||
// SLM returns one type; other Task fields should come from heuristic.
|
||||
p := &mockProvider{text: `{"task_type":"Boilerplate","complexity":0.1,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "scaffold a new HTTP handler", nil)
|
||||
if err != nil {
|
||||
@@ -108,7 +108,7 @@ func TestClassifier_BlendHeuristic(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnBadJSON(t *testing.T) {
|
||||
p := &mockProvider{text: "I cannot classify that."}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
// Should not error — falls back to heuristic.
|
||||
task, err := cls.Classify(context.Background(), "write unit tests for the parser", nil)
|
||||
@@ -123,7 +123,7 @@ func TestClassifier_FallbackOnBadJSON(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnProviderError(t *testing.T) {
|
||||
p := &mockProvider{err: errors.New("connection refused")}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "explain how generics work", nil)
|
||||
if err != nil {
|
||||
@@ -137,7 +137,7 @@ func TestClassifier_FallbackOnProviderError(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnTimeout(t *testing.T) {
|
||||
p := &mockProvider{delay: 500 * time.Millisecond}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
cls.timeout = 50 * time.Millisecond // force timeout
|
||||
|
||||
task, err := cls.Classify(context.Background(), "debug the failing test", nil)
|
||||
@@ -153,7 +153,7 @@ func TestClassifier_FallbackOnTimeout(t *testing.T) {
|
||||
func TestClassifier_FenceStripping(t *testing.T) {
|
||||
fenced := "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```"
|
||||
p := &mockProvider{text: fenced}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "refactor the auth middleware", nil)
|
||||
if err != nil {
|
||||
@@ -166,7 +166,7 @@ func TestClassifier_FenceStripping(t *testing.T) {
|
||||
|
||||
func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"FooBar","complexity":0.3,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "implement a binary search function", nil)
|
||||
if err != nil {
|
||||
@@ -178,7 +178,7 @@ func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
|
||||
|
||||
func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.3,"requires_tools":true}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -190,7 +190,7 @@ func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
|
||||
|
||||
func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
|
||||
p := &mockProvider{err: errors.New("backend unreachable")}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -202,7 +202,7 @@ func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
|
||||
|
||||
func TestClassifier_ContextPassedToHistory(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
history := []message.Message{
|
||||
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "prior"}}},
|
||||
|
||||
Reference in New Issue
Block a user