Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7213a1e2fd | |||
| fd327107df | |||
| 0d3d190a8b | |||
| c065a2dea7 | |||
| 24945b1eb2 | |||
| c0c2e4bff5 | |||
| f3c70bd802 |
@@ -364,9 +364,12 @@ gnoma can run a tiny local model alongside the main provider to:
|
||||
|
||||
```toml
|
||||
[slm]
|
||||
enabled = true
|
||||
backend = "auto" # ollama | llamacpp | llamafile | openaicompat | auto | disabled
|
||||
model = "reecdev/tiny3.5:500m"
|
||||
enabled = true
|
||||
backend = "auto" # ollama | llamacpp | llamafile | openaicompat | auto | disabled
|
||||
model = "qwen3:0.6b"
|
||||
register_as_arm = true # default; set to false to make the SLM classifier-only
|
||||
# (e.g. for FunctionGemma, code-completion-tuned models)
|
||||
classify_timeout = "15s" # default; bump higher for slow cold-loads
|
||||
```
|
||||
|
||||
Setup, presets, and verification: [docs/slm-backends.md](docs/slm-backends.md).
|
||||
|
||||
@@ -146,7 +146,10 @@ Active work, newest first.
|
||||
decision in #1.
|
||||
|
||||
Surfaced from the r/coolgithubprojects v0.3.1 launch thread
|
||||
(2026-05-24, `u/Ha_Deal_5079`).
|
||||
(2026-05-24, `u/Ha_Deal_5079`). The encoder + contextual bandit
|
||||
alternative is now sketched in
|
||||
[`docs/superpowers/plans/2026-05-25-encoder-bandit-router.md`](docs/superpowers/plans/2026-05-25-encoder-bandit-router.md) —
|
||||
that plan supersedes #1 above when it ships.
|
||||
|
||||
- **Security boundary — egress controls + session audit log.** The
|
||||
current `Firewall` is a content boundary only (scans messages and
|
||||
|
||||
+27
-10
@@ -180,7 +180,7 @@ func main() {
|
||||
case "slm":
|
||||
os.Exit(runSLMCommand(cliArgs[1:], cfg, logger))
|
||||
case "router":
|
||||
os.Exit(runRouterCommand(cliArgs[1:], profile))
|
||||
os.Exit(runRouterCommand(cliArgs[1:], cfg, profile))
|
||||
case "profile":
|
||||
os.Exit(runProfileCommand(cliArgs[1:], cfg, profile))
|
||||
}
|
||||
@@ -881,21 +881,38 @@ func main() {
|
||||
// transport and as a router arm. Both paths route through the
|
||||
// firewall after fwRef.Set fires above.
|
||||
slmProvider := security.WrapProvider(boot.Provider, fwRef)
|
||||
lazy.set(slm.NewClassifier(slmProvider, boot.Model, logger))
|
||||
lazy.set(slm.NewClassifier(slmProvider, boot.Model, time.Duration(cfg.SLM.ClassifyTimeout), logger))
|
||||
// ToolUse comes from the live probe of the actual model. For
|
||||
// completion-only models (e.g. TinyLlama), the SLM arm only
|
||||
// handles knowledge-only prompts where the trivial-prompt
|
||||
// heuristic flipped RequiresTools=false. For tool-capable
|
||||
// models, the SLM also covers simple file reads etc., gated
|
||||
// by MaxComplexity=0.3.
|
||||
rtr.RegisterArm(&router.Arm{
|
||||
ID: router.ArmID("slm/" + string(boot.Backend)),
|
||||
Provider: slmProvider,
|
||||
ModelName: boot.Model,
|
||||
IsLocal: true,
|
||||
MaxComplexity: 0.3,
|
||||
Capabilities: provider.Capabilities{ToolUse: boot.ToolSupport},
|
||||
})
|
||||
//
|
||||
// [slm].register_as_arm gates the dual-role registration.
|
||||
// Default (nil) is true to preserve pre-config behaviour.
|
||||
// Explicit false makes the SLM classifier-only, which is
|
||||
// the correct setting for task-specialised models
|
||||
// (FunctionGemma, code-completion-tuned models, etc.) that
|
||||
// would mishandle a general prompt routed to them as the
|
||||
// answer-producing arm.
|
||||
registerAsArm := true
|
||||
if cfg.SLM.RegisterAsArm != nil {
|
||||
registerAsArm = *cfg.SLM.RegisterAsArm
|
||||
}
|
||||
if registerAsArm {
|
||||
rtr.RegisterArm(&router.Arm{
|
||||
ID: router.ArmID("slm/" + string(boot.Backend)),
|
||||
Provider: slmProvider,
|
||||
ModelName: boot.Model,
|
||||
IsLocal: true,
|
||||
MaxComplexity: 0.3,
|
||||
Capabilities: provider.Capabilities{ToolUse: boot.ToolSupport},
|
||||
})
|
||||
} else {
|
||||
logger.Info("SLM registered as classifier only ([slm].register_as_arm=false)",
|
||||
"model", boot.Model)
|
||||
}
|
||||
slmCleanup = boot.Close
|
||||
slmInfo.Active = true
|
||||
slmInfo.Backend = string(boot.Backend)
|
||||
|
||||
+31
-8
@@ -12,7 +12,7 @@ import (
|
||||
)
|
||||
|
||||
// runRouterCommand handles `gnoma router <subcommand>`. Returns an exit code.
|
||||
func runRouterCommand(args []string, profile gnomacfg.Profile) int {
|
||||
func runRouterCommand(args []string, cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(os.Stderr, "usage: gnoma router <command>")
|
||||
fmt.Fprintln(os.Stderr, "commands:")
|
||||
@@ -21,14 +21,14 @@ func runRouterCommand(args []string, profile gnomacfg.Profile) int {
|
||||
}
|
||||
switch args[0] {
|
||||
case "stats":
|
||||
return runRouterStats(profile)
|
||||
return runRouterStats(cfg, profile)
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "unknown router command: %s\n", args[0])
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
func runRouterStats(profile gnomacfg.Profile) int {
|
||||
func runRouterStats(cfg *gnomacfg.Config, profile gnomacfg.Profile) int {
|
||||
path := profile.QualityFile(gnomacfg.GlobalConfigDir())
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -52,7 +52,7 @@ func runRouterStats(profile gnomacfg.Profile) int {
|
||||
}
|
||||
printArmTable(snap)
|
||||
fmt.Println()
|
||||
printClassifierTable(snap)
|
||||
printClassifierTable(snap, cfg)
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ func printArmTable(snap router.QualitySnapshot) {
|
||||
_ = tw.Flush()
|
||||
}
|
||||
|
||||
func printClassifierTable(snap router.QualitySnapshot) {
|
||||
func printClassifierTable(snap router.QualitySnapshot, cfg *gnomacfg.Config) {
|
||||
fmt.Println("Classifier source breakdown:")
|
||||
counts := snap.ClassifierCounts
|
||||
if len(counts) == 0 {
|
||||
@@ -125,16 +125,39 @@ func printClassifierTable(snap router.QualitySnapshot) {
|
||||
_ = tw.Flush()
|
||||
fmt.Printf(" total observations: %d\n", total)
|
||||
|
||||
// Phase-4 trust hint.
|
||||
// Effective heuristic share: both pure heuristic and slm_fallback
|
||||
// observations were routed via the HeuristicClassifier — the only
|
||||
// difference is whether the SLM was attempted first. Surfacing the
|
||||
// combined share answers "how often did the SLM actually drive
|
||||
// routing?" honestly.
|
||||
effectiveHeuristic := counts["heuristic"] + counts["slm_fallback"]
|
||||
if total > 0 {
|
||||
fmt.Printf(" effective heuristic share: %.1f%% (%d fallbacks + %d pure heuristic)\n",
|
||||
float64(effectiveHeuristic)/float64(total)*100,
|
||||
counts["slm_fallback"], counts["heuristic"])
|
||||
}
|
||||
|
||||
// Phase-4 trust hint. Distinguishes the three diagnostic cases —
|
||||
// SLM never called, SLM called but every call failed, SLM working
|
||||
// but minority share — and templates the actionable advice off
|
||||
// the configured backend so the hint doesn't mention llamafile
|
||||
// when the user is on ollama (or vice versa).
|
||||
slmShare := 0.0
|
||||
if total > 0 {
|
||||
slmShare = float64(counts["slm"]) / float64(total) * 100
|
||||
}
|
||||
backend := "the SLM"
|
||||
if cfg != nil && cfg.SLM.Backend != "" {
|
||||
backend = cfg.SLM.Backend
|
||||
}
|
||||
switch {
|
||||
case total < 50:
|
||||
fmt.Println(" hint: < 50 observations — too sparse for Phase 4 trust signal yet.")
|
||||
case counts["slm"] == 0:
|
||||
fmt.Println(" hint: SLM has never classified — check that llamafile boots before short-lived runs end.")
|
||||
case counts["slm"] == 0 && counts["slm_fallback"] == 0:
|
||||
fmt.Printf(" hint: SLM never called — check [slm].enabled and that %s is reachable.\n", backend)
|
||||
case counts["slm"] == 0 && counts["slm_fallback"] > 0:
|
||||
fmt.Printf(" hint: SLM was called %d times but every call fell back — run with `--verbose` to see the underlying error (likely a timeout or parse failure for %s).\n",
|
||||
counts["slm_fallback"], backend)
|
||||
case slmShare < 50:
|
||||
fmt.Printf(" hint: SLM share is %.0f%% — fallback is doing most of the work.\n", slmShare)
|
||||
}
|
||||
|
||||
+24
-10
@@ -24,27 +24,41 @@ The "ollama" path is the easiest if you're already running a local model — it
|
||||
|
||||
## Presets
|
||||
|
||||
Presets use `reecdev/tiny3.5:500m` as the default model — a 500 M-parameter Qwen3.5 distillation with tool support, available on Ollama. Pull it once with:
|
||||
Presets use `qwen3:0.6b` as the default model — a 600 M-parameter Qwen3 instruction-tuned model with native `/no_think` support, available on Ollama. Pull it once with:
|
||||
|
||||
```bash
|
||||
ollama pull reecdev/tiny3.5:500m # ~1 GB
|
||||
# or the 1.5 B variant for slightly better quality:
|
||||
ollama pull reecdev/tiny3.5:1.5b # ~3 GB
|
||||
ollama pull qwen3:0.6b # ~520 MB
|
||||
```
|
||||
|
||||
### Model choice notes
|
||||
|
||||
Empirical testing (2026-05-25) across five candidate SLMs on identical prompts:
|
||||
|
||||
| Model | Classifier success | Notes |
|
||||
|---|---|---|
|
||||
| `qwen3:0.6b` | consistent across trivial + knowledge prompts | recommended default; honours `/no_think` cleanly |
|
||||
| `functiongemma:270m` | works on trivial prompts, derails on knowledge ones | needs function-signature prompt rewrite or LoRA fine-tune to be reliable |
|
||||
| `gemma3:1b` | unusable | emits malformed JSON (just `{` or invented keys) |
|
||||
| `reecdev/tiny3.5:1.5b` | unusable | thinking-mode distillation; ignores `/no_think` and emits `<Thought Process>` blocks |
|
||||
| `qwen2.5-coder:1.5b` | unusable | code-completion-tuned; ignores the classifier prompt entirely and answers in prose |
|
||||
|
||||
Substitute any small Ollama model you prefer. The probe at startup reads each model's actual capability — `tools` enables the SLM arm to handle simple file reads; without it, the SLM only handles knowledge-only prompts.
|
||||
|
||||
If your SLM is task-specialised (function-call models like FunctionGemma; embedding-only models; code-completion-tuned models) and produces wrong-shape output when asked to answer a general prompt, set `register_as_arm = false` so the SLM stays classifier-only and execution routes to other local arms.
|
||||
|
||||
### Preset 1 — Ollama (recommended for most users)
|
||||
|
||||
```toml
|
||||
[slm]
|
||||
enabled = true
|
||||
backend = "ollama"
|
||||
model = "reecdev/tiny3.5:500m"
|
||||
enabled = true
|
||||
backend = "ollama"
|
||||
model = "qwen3:0.6b"
|
||||
register_as_arm = true # default; set false for classifier-only models
|
||||
classify_timeout = "15s" # default; bump for slow cold-load
|
||||
# base_url defaults to http://localhost:11434
|
||||
```
|
||||
|
||||
Prereq: `ollama pull reecdev/tiny3.5:500m` (or any model you'd rather use).
|
||||
Prereq: `ollama pull qwen3:0.6b` (or any model you'd rather use).
|
||||
|
||||
### Preset 2 — llama.cpp server
|
||||
|
||||
@@ -150,10 +164,10 @@ Output looks like:
|
||||
```
|
||||
slm enabled: true
|
||||
slm backend: ollama
|
||||
model: reecdev/tiny3.5:500m
|
||||
model: qwen3:0.6b
|
||||
|
||||
live probe:
|
||||
✓ ollama ready (model=reecdev/tiny3.5:500m, boot=0s)
|
||||
✓ ollama ready (model=qwen3:0.6b, boot=0s)
|
||||
```
|
||||
|
||||
Run a few prompts, then check:
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
# Tool-Router Specialization (functiongemma) — 2026-05-23
|
||||
|
||||
> **Companion plan from 2026-05-25:**
|
||||
> [`2026-05-25-encoder-bandit-router.md`](2026-05-25-encoder-bandit-router.md)
|
||||
> sketches an alternative architecture (encoder + contextual bandit
|
||||
> instead of decoder-SLM-as-classifier). The two are complementary,
|
||||
> not competing — FunctionGemma fits as the optional Phase 5 "JSON
|
||||
> sanity layer" in that plan. Decide which track to invest in based
|
||||
> on the did-switch-rate telemetry (this plan) vs the bandit-data
|
||||
> accumulation (companion plan).
|
||||
|
||||
Follow-up to
|
||||
[`2026-05-19-post-slm-unlock.md`](2026-05-19-post-slm-unlock.md)
|
||||
Phase A, which shipped two-stage tool routing: round 1 sends a single
|
||||
|
||||
@@ -0,0 +1,344 @@
|
||||
# Encoder + Contextual-Bandit Router — 2026-05-25
|
||||
|
||||
Proposes a long-arc architectural rethink of gnoma's routing layer:
|
||||
**replace the decoder-SLM-as-classifier design with an encoder-only
|
||||
embedding model feeding a contextual bandit policy**, and treat a
|
||||
strict tiny SLM (FunctionGemma-270M-it) as the optional "emit a
|
||||
structured route decision" layer rather than the primary classifier.
|
||||
|
||||
Surfaced from external research (RouteLLM, ModernBERT, Gemma 3
|
||||
270M, Qwen3-Embedding, BGE-M3) brought into the 2026-05-25
|
||||
diagnostic session where gnoma's current decoder-SLM classifier
|
||||
exhibited a 100% failure rate across two model swaps
|
||||
(`reecdev/tiny3.5:1.5b`, `qwen2.5-coder:1.5b`).
|
||||
|
||||
This plan is **strategic / multi-month**. Phase 1 below is the only
|
||||
piece scoped for near-term implementation; everything else hinges on
|
||||
the bandit-vs-SLM strategic decision tracked in the existing
|
||||
`Bandit selector — design decisions deferred` TODO entry.
|
||||
|
||||
Sibling plan:
|
||||
[`2026-05-23-tool-router-specialization.md`](2026-05-23-tool-router-specialization.md)
|
||||
already covers the **FunctionGemma fine-tune** track as the
|
||||
strict-SLM option; this plan adds the **encoder + bandit** track
|
||||
as the alternative (and arguably better-suited) architecture.
|
||||
|
||||
---
|
||||
|
||||
## Problem
|
||||
|
||||
The current router has three coupled problems:
|
||||
|
||||
1. **The classifier is a decoder LLM in a job an encoder would do
|
||||
better.** Routing is a classification task with cost/quality
|
||||
trade-offs, not a reasoning task. Asking a decoder model to emit
|
||||
structured JSON for every classify call is high-latency, fragile
|
||||
to chain-of-thought leakage, and non-deterministic.
|
||||
|
||||
2. **The bandit can't actually learn quality** because the only
|
||||
success signal is `err == nil` (per `internal/engine/loop.go:118`).
|
||||
EMA scores converge to 1.00 for every arm — see the 2026-05-24
|
||||
`router stats` snapshot where 22 of 25 arm/task pairs sit at
|
||||
exactly 1.00.
|
||||
|
||||
3. **The classifier and bandit live in adjacent code but were
|
||||
designed in separate phases**, so the integration point (`Task`
|
||||
built by SLM classifier → fed to `selectBest`) is just data
|
||||
flow, not a learning loop. The SLM's wins/losses don't update
|
||||
the SLM; the bandit's wins/losses don't change which arms the
|
||||
classifier considers.
|
||||
|
||||
The 100% SLM-failure incident on 2026-05-25 made (1) urgent. The
|
||||
zero-discrimination EMA on 2026-05-24 made (2) urgent. (3) is the
|
||||
underlying integration debt.
|
||||
|
||||
---
|
||||
|
||||
## Non-goals
|
||||
|
||||
- **Killing the existing SLM classifier today.** Phase 1 of this
|
||||
plan is purely additive (encoder feature extraction); the existing
|
||||
classifier stays as a baseline until the new path is measurably
|
||||
better.
|
||||
- **Reimplementing bandit math.** LinUCB and Thompson Sampling are
|
||||
well-understood. The work is the feature pipeline and reward
|
||||
function, not the policy core.
|
||||
- **Choosing a single embedding model permanently.** Phase 1 ships
|
||||
with a default but exposes a `[slm.embedding].model` knob so
|
||||
swapping is config-only.
|
||||
- **The strict-SLM track.** FunctionGemma fine-tuning is the sibling
|
||||
`2026-05-23-tool-router-specialization.md` plan; this plan
|
||||
references it but does not duplicate it.
|
||||
|
||||
---
|
||||
|
||||
## Background — research summary
|
||||
|
||||
Citations follow the user-provided research thread (RouteLLM 2024,
|
||||
ModernBERT 2024, Google FunctionGemma 2025).
|
||||
|
||||
- **RouteLLM** tested router types as a classification problem:
|
||||
similarity routing, matrix factorization, BERT classifier, causal
|
||||
LLM classifier. The BERT classifier was competitive with the
|
||||
causal-LLM classifier at lower cost and latency. Routing is a
|
||||
classification task; treating it like a generation task is paying
|
||||
generation cost for classification value.
|
||||
- **ModernBERT** (Dec 2024) is an encoder-only model with 8k context,
|
||||
trained partly on code, designed for fast classification and
|
||||
retrieval. The 'base' size is ~150M parameters, the 'large' size
|
||||
~400M. Both are tiny compared to even small decoder LLMs.
|
||||
- **FunctionGemma-270M-it** (Aug 2025) is Google's small model
|
||||
fine-tuned for natural-language → function-call output. Google's
|
||||
own positioning materials list **query routing** as a use case.
|
||||
- **Qwen3-Embedding-0.6B** and **BGE-M3** are strong multilingual
|
||||
embedding models with long-context support; either can serve as
|
||||
feature extractors for downstream classification or bandit
|
||||
policies.
|
||||
|
||||
The throughline: **encoder models are the right tool for the
|
||||
classification side of routing**; generative SLMs (FunctionGemma)
|
||||
are the right tool only when the *output* must be a structured
|
||||
decision blob with confidence + tags + fallback. For pure routing,
|
||||
encoder features + bandit policy is cheaper, faster, more
|
||||
deterministic.
|
||||
|
||||
---
|
||||
|
||||
## Approach overview
|
||||
|
||||
Five phases. Phase 1 is near-term; Phases 2–4 are the actual
|
||||
architectural shift; Phase 5 is the optional long-arc FunctionGemma layer.
|
||||
|
||||
### Phase 1 — Embedding feature scaffold (near-term, additive)
|
||||
|
||||
Add an embedding pipeline that runs alongside the existing
|
||||
classifier. Extract features for every prompt; log them to disk
|
||||
next to the existing quality-EMA. No routing decision changes yet.
|
||||
|
||||
**Why first:** lets us build up a labelled dataset of (prompt,
|
||||
features, arm, outcome) tuples without disturbing today's routing
|
||||
behaviour. Phase 2 trains against this dataset.
|
||||
|
||||
### Phase 2 — Contextual bandit over the feature set
|
||||
|
||||
Once Phase 1 has ~500–1000 labelled observations, swap `selectBest`
|
||||
from heuristic quality + EMA score to a LinUCB-style contextual
|
||||
bandit that takes the embedding features + the existing arm metadata
|
||||
(MaxComplexity, CostWeight, Strengths). The existing EMA quality
|
||||
score becomes one feature among many.
|
||||
|
||||
### Phase 3 — Retire the decoder-SLM classifier
|
||||
|
||||
When Phase 2 routing is measurably better than today's heuristic +
|
||||
EMA blend, the decoder-SLM classifier (currently producing 0
|
||||
useful classifications on the user's setup) is no longer
|
||||
load-bearing. Deprecate it; keep the same `[slm]` config knobs for
|
||||
backwards compatibility but route them to a different runtime path.
|
||||
|
||||
### Phase 4 — ModernBERT fine-tune
|
||||
|
||||
The off-the-shelf embedding model from Phase 1 (BGE-M3 or
|
||||
Qwen3-Embedding-0.6B by default) gives general-purpose embeddings.
|
||||
Phase 4 fine-tunes a router-specific classification head on top of
|
||||
ModernBERT-base using the labelled dataset accumulated since
|
||||
Phase 1. Pure performance win; falls back gracefully to off-the-shelf
|
||||
embeddings if the fine-tune isn't loaded.
|
||||
|
||||
### Phase 5 — FunctionGemma JSON sanity layer (optional)
|
||||
|
||||
For users who want a structured route decision (arm + confidence +
|
||||
fallback) alongside or instead of the bandit output, plug
|
||||
FunctionGemma-270M-it (fine-tuned per the
|
||||
`tool-router-specialization` plan) as a final-stage decision blob
|
||||
emitter. Sits *after* the encoder + bandit, not in front of them.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — Embedding feature scaffold (detailed)
|
||||
|
||||
This is the only phase scoped for near-term implementation. The
|
||||
others depend on Phase 1's data accumulation.
|
||||
|
||||
### What lands
|
||||
|
||||
- New package `internal/router/features` with:
|
||||
- `Embedder` interface: `Embed(ctx, prompt string) ([]float32, error)`.
|
||||
- Implementations: `OllamaEmbedder`, `BGE3Embedder`, `NoopEmbedder`
|
||||
(default; returns nil features when no embedding model is
|
||||
configured).
|
||||
- New config `[slm.embedding]` section:
|
||||
```toml
|
||||
[slm.embedding]
|
||||
enabled = false # default off; opt-in
|
||||
backend = "ollama" # ollama | bge-m3 | noop
|
||||
model = "qwen3-embedding:0.6b" # ollama model tag
|
||||
base_url = "" # backend endpoint override
|
||||
```
|
||||
- Feature extraction hook in `internal/engine/loop.go`: after the
|
||||
classifier runs but before `selectBest`, compute the embedding
|
||||
for the prompt and attach to the routing `Task` as an opaque
|
||||
`Features []float32` field.
|
||||
- New on-disk store at `~/.config/gnoma/router-features.jsonl`,
|
||||
one record per observation: `{ts, prompt_hash, features,
|
||||
task_type, arm_id, success, tokens, duration}`.
|
||||
- `prompt_hash` is a SHA-256 of the prompt — never the prompt
|
||||
itself — so the file stays local-only and never contains prompt text.
|
||||
- Append-only, atomic-write, incognito-gated, same discipline as
|
||||
the firewall audit log.
|
||||
- No selector change. `selectBest` continues to use today's
|
||||
heuristic + EMA blend. Phase 1 just observes.
|
||||
|
||||
### Why off by default
|
||||
|
||||
Embedding inference adds 50–200ms per prompt depending on backend
|
||||
and model size. That latency is fine for ollama users running on
|
||||
a workstation, painful for users on slower setups. Opt-in keeps
|
||||
the regression risk at zero.
|
||||
|
||||
### Phase 1 task list
|
||||
|
||||
- **F1-1:** Define the `Embedder` interface and `NoopEmbedder` in
|
||||
`internal/router/features/`.
|
||||
- **F1-2:** `OllamaEmbedder` wraps `provider/openaicompat` with the
|
||||
ollama embedding endpoint (`/api/embeddings`, or `/v1/embeddings` on the OpenAI-compatible surface).
|
||||
- **F1-3:** Add the `[slm.embedding]` config section to
|
||||
`internal/config/config.go` with the same defaults-via-zero
|
||||
discipline as the rest of the config.
|
||||
- **F1-4:** Wire the embedder into `loop.go` between classifier and
|
||||
selector. Failures log at Debug and don't block routing.
|
||||
- **F1-5:** Append-only feature store in
|
||||
`~/.config/gnoma/router-features.jsonl` with atomic writes,
|
||||
incognito gate, and written only when `[slm.embedding].enabled = true` (default `false`).
|
||||
- **F1-6:** Tests covering: embedder mock + observation record;
|
||||
noop embedder produces empty features; incognito skips the
|
||||
store entirely.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2+ — Bandit policy (sketch only; needs data first)
|
||||
|
||||
Spelled out for context. Not for near-term implementation.
|
||||
|
||||
### Feature set per the research
|
||||
|
||||
```
|
||||
prompt_embedding — 384-1024 dim depending on model
|
||||
token_count — len of tokenized prompt
|
||||
language — ISO code from a small lang-detect
|
||||
has_code — fenced-block heuristic
|
||||
has_error_log — pattern match for stack traces
|
||||
needs_tools — from current heuristic
|
||||
needs_vision — from [Image:...] markers
|
||||
estimated_complexity — current heuristic score
|
||||
requested_latency — turn-budget hint (future)
|
||||
arm_context_window — from arm metadata
|
||||
arm_vram_cost — from arm metadata
|
||||
arm_avg_latency — from quality EMA
|
||||
arm_success_rate — from quality EMA
|
||||
```
|
||||
|
||||
### Reward function per the research
|
||||
|
||||
```
|
||||
reward = quality_score
|
||||
- latency_penalty
|
||||
- vram_penalty
|
||||
- failure_penalty
|
||||
- escalation_penalty
|
||||
```
|
||||
|
||||
- `quality_score`: 1.0 on success, 0.0 on hard error today; richer
|
||||
signal (elf-mediated, user thumbs, tool-call success) once the
|
||||
TODO `Bandit selector — design decisions deferred` resolves.
|
||||
- `latency_penalty`: monotone in observed seconds.
|
||||
- `vram_penalty`: monotone in declared VRAM cost.
|
||||
- `failure_penalty`: hard cost on explicit errors (sandbox
|
||||
denied, parse failed).
|
||||
- `escalation_penalty`: cost when a downstream elf had to escalate
|
||||
to a heavier arm because this arm failed.
|
||||
|
||||
### Policy
|
||||
|
||||
LinUCB (linear contextual bandit, deterministic exploration
|
||||
bounded by UCB) or Thompson Sampling (Bayesian, smoother
|
||||
exploration). LinUCB is the safer starting point — fewer
|
||||
hyperparameters, well-known behaviour, easier to debug.
|
||||
|
||||
---
|
||||
|
||||
## Risks
|
||||
|
||||
- **Latency.** Embedding inference adds 50–200ms per prompt. Phase
|
||||
1's opt-in default means users see no regression; Phase 2's
|
||||
"make it default" decision requires latency benchmarks first.
|
||||
- **Data sparsity for fine-tuning (Phase 4).** ModernBERT
|
||||
fine-tuning needs ~10k labelled observations to start being
|
||||
useful. Phase 1 might run for months before Phase 4 is viable.
|
||||
Plan B: synthesise labels from existing prompt logs + rule-based
|
||||
pre-labels.
|
||||
- **Off-the-shelf embedding quality.** BGE-M3 / Qwen3-Embedding
|
||||
weren't trained specifically for routing decisions. Phase 4
|
||||
exists precisely to close this gap; Phase 1's data accumulation
|
||||
is what makes Phase 4 possible.
|
||||
- **Architectural complexity.** This plan introduces an entire new
|
||||
ML pipeline (embedder → feature store → bandit → reward loop).
|
||||
Phase 1 keeps it side-by-side with the existing path; Phase 2's
|
||||
"swap" decision is reversible because the existing path stays
|
||||
in code.
|
||||
- **Privacy.** Prompt hashes (not raw prompts) in the feature
|
||||
store. Still a local-only file; same opt-out plumbing as the
|
||||
project registry from the config-migration plan.
|
||||
|
||||
---
|
||||
|
||||
## Open questions
|
||||
|
||||
- **Should the feature store be per-project or global?** Per-project
|
||||
is more privacy-respecting (one project's prompts don't influence
|
||||
another's routing). Global is more data-efficient (more samples
|
||||
→ better bandit). Phase 1 chooses global by default; revisit
|
||||
during Phase 2.
|
||||
- **How does this interact with `[router].prefer = local|cloud`?**
|
||||
Easy answer: prefer policy stays as a hard tier-shift, applied
|
||||
after bandit selection. Bandit picks the best feasible arm; the
|
||||
prefer policy is consulted as a final filter / weight.
|
||||
- **What about CLI-agent subprocess arms?** They proxy to cloud but
|
||||
run locally; today's `prefer` treats them as non-local. Bandit
|
||||
features should include `is_subprocess` as a distinct feature
|
||||
so the policy can learn the user's preferences for those arms
|
||||
independent of local/cloud.
|
||||
- **Cold start.** With no observations, the bandit defaults to
|
||||
pure exploration. Should we seed with the existing heuristic
|
||||
defaults from `internal/router/defaults.go`? Probably yes —
|
||||
warm-start with the curated Strengths as priors.
|
||||
|
||||
---
|
||||
|
||||
## Rollout
|
||||
|
||||
- **Phase 1** ships as v0.5.0 (additive, opt-in, no behaviour
|
||||
change by default). Schema-touching so warrants a minor bump.
|
||||
- **Phase 2** ships when Phase 1 has accumulated enough data
|
||||
(~500–1000 observations per user) — opt-in via
|
||||
`[router].bandit_policy = "linucb"` initially, becoming default
|
||||
in a later release once measured better.
|
||||
- **Phase 3 (deprecation of decoder-SLM classifier)** is a v0.6.x
|
||||
conversation, gated on Phase 2 measurably outperforming.
|
||||
- **Phase 4 (ModernBERT fine-tune)** is v0.7+ — requires the
|
||||
fine-tuned model artifact distributed via Ollama or HF, plus
|
||||
the auto-download story.
|
||||
- **Phase 5 (FunctionGemma sanity layer)** is independent of all
|
||||
of the above; lands when the sibling `tool-router-specialization`
|
||||
plan justifies it on did-switch-rate telemetry.
|
||||
|
||||
---
|
||||
|
||||
## Cross-references
|
||||
|
||||
- TODO.md entry "Bandit selector — design decisions deferred" —
|
||||
the strategic question this plan answers in the long run.
|
||||
- TODO.md entry "Tool-router specialization (functiongemma)" — the
|
||||
sibling track; complementary, not competing.
|
||||
- [`2026-05-23-tool-router-specialization.md`](2026-05-23-tool-router-specialization.md) — FunctionGemma fine-tune plan.
|
||||
- [`2026-05-07-gnoma-roadmap.md`](2026-05-07-gnoma-roadmap.md) §Phase 4 — the original "re-evaluate bandit learning" entry.
|
||||
- 2026-05-25 diagnostic session (this conversation) — the trigger.
|
||||
@@ -48,6 +48,27 @@ type SLMSection struct {
|
||||
DataDir string `toml:"data_dir"` // llamafile-only: where to put it (empty = XDG default)
|
||||
ExpectedSHA256 string `toml:"expected_sha256"` // llamafile-only: verify hash if non-empty
|
||||
StartupTimeout Duration `toml:"startup_timeout"` // llamafile-only: first-launch wait budget; 0 = default 5s
|
||||
|
||||
// ClassifyTimeout caps each task-classification call to the SLM.
|
||||
// 0 here means "use the built-in default" (15s). Cold-start model
|
||||
// loads + thinking-mode first-token latency can easily exceed 5s
|
||||
// on smaller hardware, so the default is generous. Tune down to
|
||||
// 2-3s on fast setups, or up to 30s for very slow ones.
|
||||
ClassifyTimeout Duration `toml:"classify_timeout"`
|
||||
|
||||
// RegisterAsArm controls whether the SLM model is registered as
|
||||
// a tier-0 execution arm in addition to its classifier role.
|
||||
// nil (absent) → true (preserve historical behaviour: SLM is
|
||||
// both classifier and an execution arm for trivial-complexity
|
||||
// prompts). Explicitly false → SLM is classifier-only; trivial
|
||||
// prompts route to other local arms instead.
|
||||
//
|
||||
// Set this to false when the SLM model is task-specialised
|
||||
// (FunctionGemma, embedding-only models, code-completion-tuned
|
||||
// models) and would produce wrong-shape output if asked to
|
||||
// answer a general prompt. Pointer type so the absent-value
|
||||
// case can be distinguished from explicit false.
|
||||
RegisterAsArm *bool `toml:"register_as_arm"`
|
||||
}
|
||||
|
||||
// ArmConfig tunes routing for a single registered arm. Multiple [[arms]]
|
||||
|
||||
@@ -5,6 +5,8 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/BurntSushi/toml"
|
||||
)
|
||||
|
||||
func TestDefaults(t *testing.T) {
|
||||
@@ -448,3 +450,50 @@ model = "claude-haiku"
|
||||
t.Errorf("MaxTokens = %d, want 4096 (from global)", cfg.Provider.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSLMSection_RegisterAsArm_AbsentDefaultsToTrue(t *testing.T) {
|
||||
// Absent field → nil pointer → caller treats as default true,
|
||||
// preserving pre-config behaviour where the SLM is always
|
||||
// registered as an execution arm.
|
||||
var cfg Config
|
||||
if _, err := toml.Decode(`[slm]
|
||||
enabled = true
|
||||
`, &cfg); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if cfg.SLM.RegisterAsArm != nil {
|
||||
t.Errorf("expected nil pointer for absent register_as_arm, got %v", *cfg.SLM.RegisterAsArm)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSLMSection_RegisterAsArm_ExplicitFalse(t *testing.T) {
|
||||
var cfg Config
|
||||
if _, err := toml.Decode(`[slm]
|
||||
enabled = true
|
||||
register_as_arm = false
|
||||
`, &cfg); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if cfg.SLM.RegisterAsArm == nil {
|
||||
t.Fatal("expected non-nil pointer when register_as_arm is set")
|
||||
}
|
||||
if *cfg.SLM.RegisterAsArm {
|
||||
t.Errorf("expected register_as_arm=false to decode as *false, got *true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSLMSection_RegisterAsArm_ExplicitTrue(t *testing.T) {
|
||||
var cfg Config
|
||||
if _, err := toml.Decode(`[slm]
|
||||
enabled = true
|
||||
register_as_arm = true
|
||||
`, &cfg); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if cfg.SLM.RegisterAsArm == nil {
|
||||
t.Fatal("expected non-nil pointer when register_as_arm is set")
|
||||
}
|
||||
if !*cfg.SLM.RegisterAsArm {
|
||||
t.Errorf("expected register_as_arm=true to decode as *true, got *false")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,6 +186,26 @@ func translateRequest(req provider.Request) oai.ChatCompletionNewParams {
|
||||
params.ReasoningEffort = effortToReasoningEffort(req.Thinking.Level)
|
||||
}
|
||||
|
||||
// Honour ResponseFormat. ollama (via OpenAI-compatible endpoint) and
|
||||
// llama.cpp both translate response_format=json_object to a decoding-
|
||||
// time JSON constraint, which is the only reliable way to keep small
|
||||
// models from emitting prose where structured output is required.
|
||||
// Previously this field was silently dropped on the OpenAI path,
|
||||
// which is why the SLM classifier saw a 100% prose-failure rate even
|
||||
// after Move 1 wired ResponseFormat at the gnoma layer.
|
||||
if req.ResponseFormat != nil {
|
||||
switch req.ResponseFormat.Type {
|
||||
case provider.ResponseJSON:
|
||||
params.ResponseFormat = oai.ChatCompletionNewParamsResponseFormatUnion{
|
||||
OfJSONObject: &shared.ResponseFormatJSONObjectParam{},
|
||||
}
|
||||
case provider.ResponseText:
|
||||
params.ResponseFormat = oai.ChatCompletionNewParamsResponseFormatUnion{
|
||||
OfText: &shared.ResponseFormatTextParam{},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(params.Tools) > 0 {
|
||||
choice := "auto"
|
||||
if req.ToolChoice != "" {
|
||||
|
||||
@@ -189,3 +189,47 @@ func TestTranslateRequest_ToolChoiceDefault(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranslateRequest_ResponseFormatJSON(t *testing.T) {
|
||||
req := provider.Request{
|
||||
Model: "qwen2.5-coder:1.5b",
|
||||
Messages: []message.Message{
|
||||
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "hi"}}},
|
||||
},
|
||||
ResponseFormat: &provider.ResponseFormat{Type: provider.ResponseJSON},
|
||||
}
|
||||
params := translateRequest(req)
|
||||
if params.ResponseFormat.OfJSONObject == nil {
|
||||
t.Errorf("expected OfJSONObject set when ResponseFormat=ResponseJSON, got %+v", params.ResponseFormat)
|
||||
}
|
||||
if params.ResponseFormat.OfText != nil {
|
||||
t.Errorf("expected OfText nil when ResponseFormat=ResponseJSON")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranslateRequest_ResponseFormatText(t *testing.T) {
|
||||
req := provider.Request{
|
||||
Model: "qwen2.5-coder:1.5b",
|
||||
Messages: []message.Message{
|
||||
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "hi"}}},
|
||||
},
|
||||
ResponseFormat: &provider.ResponseFormat{Type: provider.ResponseText},
|
||||
}
|
||||
params := translateRequest(req)
|
||||
if params.ResponseFormat.OfText == nil {
|
||||
t.Errorf("expected OfText set when ResponseFormat=ResponseText, got %+v", params.ResponseFormat)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranslateRequest_ResponseFormatUnset(t *testing.T) {
|
||||
req := provider.Request{
|
||||
Model: "qwen2.5-coder:1.5b",
|
||||
Messages: []message.Message{
|
||||
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "hi"}}},
|
||||
},
|
||||
}
|
||||
params := translateRequest(req)
|
||||
if params.ResponseFormat.OfJSONObject != nil || params.ResponseFormat.OfText != nil {
|
||||
t.Errorf("expected zero-valued ResponseFormat when not set, got %+v", params.ResponseFormat)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,16 +93,27 @@ func DiscoverOllama(ctx context.Context, baseURL string, probeCache map[string]O
|
||||
Size: m.Size,
|
||||
}
|
||||
|
||||
// Always probe; the cache is optional. Previously nil-cache was
|
||||
// treated as "skip probing entirely", which left SupportsTools
|
||||
// at its zero value (false) for every model — every ollama-
|
||||
// discovered arm then got marked as tool-unsupported and
|
||||
// rejected by filterFeasible for any tool-requiring task. main.go
|
||||
// passes nil from the synchronous discovery path; we still want
|
||||
// real probe data there.
|
||||
var result OllamaProbeResult
|
||||
if probeCache != nil {
|
||||
result, ok := probeCache[m.Name]
|
||||
if !ok {
|
||||
if cached, ok := probeCache[m.Name]; ok {
|
||||
result = cached
|
||||
} else {
|
||||
result = probeOllamaModel(ctx, baseURL, m.Name)
|
||||
probeCache[m.Name] = result
|
||||
}
|
||||
dm.SupportsTools = result.SupportsTools
|
||||
dm.SupportsVision = result.SupportsVision
|
||||
dm.ContextSize = result.ContextSize
|
||||
} else {
|
||||
result = probeOllamaModel(ctx, baseURL, m.Name)
|
||||
}
|
||||
dm.SupportsTools = result.SupportsTools
|
||||
dm.SupportsVision = result.SupportsVision
|
||||
dm.ContextSize = result.ContextSize
|
||||
|
||||
if dm.ContextSize == 0 {
|
||||
dm.ContextSize = defaultOllamaContextSize
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package router
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"math"
|
||||
)
|
||||
|
||||
@@ -281,20 +282,39 @@ func effectiveCost(arm *Arm, task Task) float64 {
|
||||
// filterFeasible returns arms that can handle the task (tools, pool capacity, quality).
|
||||
// Arms that pass tool and pool checks but fall below the task's minimum quality threshold
|
||||
// are collected separately and used as a last resort if no arm meets the threshold.
|
||||
//
|
||||
// When the result is empty the caller surfaces a generic "no feasible arm"
|
||||
// error; rejection reasons are logged here at slog.Debug per-arm so users
|
||||
// debugging "why did the router reject everything?" with --verbose can see
|
||||
// the actual constraint each arm tripped instead of guessing.
|
||||
func filterFeasible(arms []*Arm, task Task) []*Arm {
|
||||
threshold := DefaultThresholds[task.Type]
|
||||
|
||||
var feasible []*Arm
|
||||
var belowQuality []*Arm // passed tool+pool but scored below minimum quality
|
||||
|
||||
reject := func(arm *Arm, reason string, fields ...any) {
|
||||
base := []any{
|
||||
"arm", arm.ID,
|
||||
"task", task.Type,
|
||||
"complexity", task.ComplexityScore,
|
||||
"reason", reason,
|
||||
}
|
||||
slog.Debug("filterFeasible: rejected", append(base, fields...)...)
|
||||
}
|
||||
|
||||
for _, arm := range arms {
|
||||
// Complexity ceiling: zero means no ceiling (preserves behavior for all existing arms).
|
||||
if arm.MaxComplexity > 0 && task.ComplexityScore > arm.MaxComplexity {
|
||||
reject(arm, "complexity_exceeds_max",
|
||||
"max_complexity", arm.MaxComplexity)
|
||||
continue
|
||||
}
|
||||
|
||||
// Must support tools if task requires them
|
||||
if task.RequiresTools && !arm.SupportsTools() {
|
||||
reject(arm, "tools_required_but_unsupported",
|
||||
"tool_use_capability", arm.Capabilities.ToolUse)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -303,11 +323,15 @@ func filterFeasible(arms []*Arm, task Task) []*Arm {
|
||||
// cannot consume the image bytes, so degrading to it would silently
|
||||
// drop the image and confuse the model.
|
||||
if task.RequiresVision && !arm.Capabilities.Vision {
|
||||
reject(arm, "vision_required_but_unsupported",
|
||||
"vision_capability", arm.Capabilities.Vision)
|
||||
continue
|
||||
}
|
||||
|
||||
// Must support the required effort level (EffortAuto always passes)
|
||||
if !arm.Capabilities.SupportsEffort(task.RequiredEffort) {
|
||||
reject(arm, "effort_level_unsupported",
|
||||
"required_effort", task.RequiredEffort)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -316,6 +340,8 @@ func filterFeasible(arms []*Arm, task Task) []*Arm {
|
||||
for _, pool := range arm.Pools {
|
||||
pool.CheckReset()
|
||||
if !pool.CanAfford(arm.ID, task.EstimatedTokens) {
|
||||
reject(arm, "pool_capacity_exceeded",
|
||||
"estimated_tokens", task.EstimatedTokens)
|
||||
poolsOK = false
|
||||
break
|
||||
}
|
||||
@@ -333,6 +359,16 @@ func filterFeasible(arms []*Arm, task Task) []*Arm {
|
||||
feasible = append(feasible, arm)
|
||||
}
|
||||
|
||||
if len(feasible) == 0 && len(belowQuality) == 0 {
|
||||
slog.Debug("filterFeasible: no arms feasible at any quality level",
|
||||
"task", task.Type,
|
||||
"complexity", task.ComplexityScore,
|
||||
"requires_tools", task.RequiresTools,
|
||||
"requires_vision", task.RequiresVision,
|
||||
"arms_considered", len(arms),
|
||||
)
|
||||
}
|
||||
|
||||
// Degrade gracefully: if no arm meets quality threshold, use below-quality ones
|
||||
if len(feasible) == 0 && len(belowQuality) > 0 {
|
||||
return belowQuality
|
||||
|
||||
@@ -14,10 +14,13 @@ import (
|
||||
"somegit.dev/Owlibou/gnoma/internal/stream"
|
||||
)
|
||||
|
||||
// defaultClassifyTimeout — 5 s accommodates thinking-mode models like
|
||||
// Qwen3 distillations (Tiny3.5) that emit reasoning tokens before output.
|
||||
// Non-thinking models complete in well under 1 s.
|
||||
const defaultClassifyTimeout = 5 * time.Second
|
||||
// defaultClassifyTimeout — 15 s accommodates cold-start model loads
|
||||
// (ollama lazily loads on first call, ~2-8s for a 1.5B model on SSD)
|
||||
// combined with thinking-mode first-token latency (Qwen3 distillations
|
||||
// like Tiny3.5 sometimes emit <think> tokens before the JSON output
|
||||
// even with /no_think). Non-thinking warm models complete in well
|
||||
// under 1 s. Tune via [slm].classify_timeout in config.
|
||||
const defaultClassifyTimeout = 15 * time.Second
|
||||
|
||||
const classifySystemPrompt = `Classify the following coding request. /no_think
|
||||
Respond with JSON only, no other text, no reasoning, no thinking tags.
|
||||
@@ -47,14 +50,18 @@ type Classifier struct {
|
||||
|
||||
// NewClassifier creates a Classifier. model is the model name passed to the provider
|
||||
// (llamafile ignores it but openaicompat requires a non-empty value).
|
||||
func NewClassifier(p provider.Provider, model string, logger *slog.Logger) *Classifier {
|
||||
// Pass timeout=0 to use the built-in default (defaultClassifyTimeout).
|
||||
func NewClassifier(p provider.Provider, model string, timeout time.Duration, logger *slog.Logger) *Classifier {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = defaultClassifyTimeout
|
||||
}
|
||||
return &Classifier{
|
||||
provider: p,
|
||||
model: model,
|
||||
timeout: defaultClassifyTimeout,
|
||||
timeout: timeout,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
@@ -68,7 +75,11 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess
|
||||
|
||||
resp, err := c.callSLM(tctx, prompt)
|
||||
if err != nil {
|
||||
c.logger.Debug("slm classify fallback", "error", err)
|
||||
// Warn-level so a first-time misconfiguration (timeout too tight,
|
||||
// wrong endpoint, malformed JSON from the model) surfaces without
|
||||
// requiring --verbose. The fallback path itself is benign; the
|
||||
// signal is that the SLM isn't doing the work it was supposed to.
|
||||
c.logger.Warn("slm classify fallback", "error", err, "timeout", c.timeout)
|
||||
t, ferr := router.HeuristicClassifier{}.Classify(ctx, prompt, history)
|
||||
t.ClassifierSource = router.ClassifierSLMFallback
|
||||
return t, ferr
|
||||
@@ -91,9 +102,25 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess
|
||||
}
|
||||
|
||||
func (c *Classifier) callSLM(ctx context.Context, prompt string) (*classifyResponse, error) {
|
||||
// Constrain the model toward valid, deterministic JSON output. Without
|
||||
// these settings small models routinely ignore the JSON-only system
|
||||
// prompt, emit reasoning blocks (<think>, <Thought Process>) or just
|
||||
// answer the user's prompt in prose. ResponseFormat=json_object asks
|
||||
// the provider to enforce JSON at decoding time where supported
|
||||
// (ollama 'format=json', llama.cpp grammar, OpenAI json_object). Even
|
||||
// when the provider can't enforce, the explicit signal nudges the
|
||||
// adapter to set the right backend flag.
|
||||
temp := 0.0
|
||||
topP := 1.0
|
||||
req := provider.Request{
|
||||
Model: c.model,
|
||||
SystemPrompt: classifySystemPrompt,
|
||||
Temperature: &temp,
|
||||
TopP: &topP,
|
||||
MaxTokens: 128, // classification output is ~50 tokens; cap to prevent runaway reasoning
|
||||
ResponseFormat: &provider.ResponseFormat{
|
||||
Type: provider.ResponseJSON,
|
||||
},
|
||||
Messages: []message.Message{
|
||||
{
|
||||
Role: message.RoleUser,
|
||||
@@ -127,10 +154,22 @@ func (c *Classifier) callSLM(ctx context.Context, prompt string) (*classifyRespo
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
// extractJSON pulls the first {...} substring from s, stripping markdown fences if present.
|
||||
// extractJSON pulls the first {...} substring from s, stripping markdown
|
||||
// fences and known thinking-block tags. Small models routinely violate
|
||||
// the JSON-only system prompt by emitting reasoning tokens first, so
|
||||
// the extractor must tolerate prefixes the model wasn't asked to emit.
|
||||
func extractJSON(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
|
||||
// Strip known thinking-block tags. Order matters: longer/more-
|
||||
// specific names first so a partial match doesn't shadow a real
|
||||
// one. Seen in the wild on Qwen3 (<think>) and tiny3.5
|
||||
// (<Thought Process>); the others are defensive against similar
|
||||
// fine-tunes.
|
||||
for _, tag := range []string{"Thought Process", "thinking", "reasoning", "thoughts", "think"} {
|
||||
s = stripTagBlock(s, tag)
|
||||
}
|
||||
|
||||
// Strip ```json ... ``` fences.
|
||||
if strings.HasPrefix(s, "```") {
|
||||
end := strings.LastIndex(s, "```")
|
||||
@@ -160,3 +199,28 @@ func extractJSON(s string) string {
|
||||
}
|
||||
return s[start:]
|
||||
}
|
||||
|
||||
// stripTagBlock removes a leading <tag>...</tag> block from s, matching the
// tag name case-insensitively, and returns the whitespace-trimmed remainder.
//
// Outcomes:
//   - s (ignoring surrounding whitespace) does not start with "<"+tag:
//     s is returned unchanged.
//   - a closing "</"+tag+">" is found: everything up to and including it is
//     dropped.
//   - no closing tag (unterminated block): everything before the first '{'
//     is dropped so JSON that follows can still be extracted; if there is
//     no '{' either, s is returned unchanged.
//
// The opening test is a prefix match on "<"+tag without the closing '>', so
// a shorter tag name also matches a longer one that begins with it; callers
// that care should try longer names first.
//
// Matching is performed on the original bytes, never on a lower-cased copy.
// strings.ToLower can change the UTF-8 byte length of non-ASCII text, so an
// offset found in a lowered copy does not reliably index the original string
// and can yield a wrong slice or a slice-bounds panic.
//
// Safe to call repeatedly; input without the leading tag passes through
// untouched.
func stripTagBlock(s, tag string) string {
	trimmed := strings.TrimSpace(s)

	// foldAt reports whether want occurs at byte offset i of trimmed,
	// ignoring case. Comparing a window of exactly len(want) bytes keeps
	// every offset valid for trimmed itself.
	foldAt := func(i int, want string) bool {
		end := i + len(want)
		return end <= len(trimmed) && strings.EqualFold(trimmed[i:end], want)
	}

	if !foldAt(0, "<"+tag) {
		return s
	}

	// Find the first closing tag, case-insensitive.
	closeTag := "</" + tag + ">"
	closeIdx := -1
	for i := 0; i+len(closeTag) <= len(trimmed); i++ {
		if foldAt(i, closeTag) {
			closeIdx = i
			break
		}
	}

	if closeIdx < 0 {
		// Unterminated thinking block — strip up to the first '{'
		// so we still have a shot at extracting JSON that follows.
		if braceIdx := strings.IndexByte(trimmed, '{'); braceIdx > 0 {
			return strings.TrimSpace(trimmed[braceIdx:])
		}
		return s
	}
	return strings.TrimSpace(trimmed[closeIdx+len(closeTag):])
}
|
||||
|
||||
@@ -54,7 +54,7 @@ func TestClassifier_HappyPath(t *testing.T) {
|
||||
// SLM complexity 0.55 stays above the Debug floor (0.4), so the SLM
|
||||
// value is preserved verbatim.
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.55,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
@@ -76,7 +76,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
|
||||
// bump ComplexityScore up to the floor so the SLM arm can't be picked
|
||||
// for its own kind of misclassification.
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.25,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
@@ -91,7 +91,7 @@ func TestClassifier_AppliesTaskTypeFloor(t *testing.T) {
|
||||
func TestClassifier_BlendHeuristic(t *testing.T) {
|
||||
// SLM returns one type; other Task fields should come from heuristic.
|
||||
p := &mockProvider{text: `{"task_type":"Boilerplate","complexity":0.1,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "scaffold a new HTTP handler", nil)
|
||||
if err != nil {
|
||||
@@ -108,7 +108,7 @@ func TestClassifier_BlendHeuristic(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnBadJSON(t *testing.T) {
|
||||
p := &mockProvider{text: "I cannot classify that."}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
// Should not error — falls back to heuristic.
|
||||
task, err := cls.Classify(context.Background(), "write unit tests for the parser", nil)
|
||||
@@ -123,7 +123,7 @@ func TestClassifier_FallbackOnBadJSON(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnProviderError(t *testing.T) {
|
||||
p := &mockProvider{err: errors.New("connection refused")}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "explain how generics work", nil)
|
||||
if err != nil {
|
||||
@@ -137,7 +137,7 @@ func TestClassifier_FallbackOnProviderError(t *testing.T) {
|
||||
|
||||
func TestClassifier_FallbackOnTimeout(t *testing.T) {
|
||||
p := &mockProvider{delay: 500 * time.Millisecond}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
cls.timeout = 50 * time.Millisecond // force timeout
|
||||
|
||||
task, err := cls.Classify(context.Background(), "debug the failing test", nil)
|
||||
@@ -153,7 +153,7 @@ func TestClassifier_FallbackOnTimeout(t *testing.T) {
|
||||
func TestClassifier_FenceStripping(t *testing.T) {
|
||||
fenced := "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```"
|
||||
p := &mockProvider{text: fenced}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "refactor the auth middleware", nil)
|
||||
if err != nil {
|
||||
@@ -166,7 +166,7 @@ func TestClassifier_FenceStripping(t *testing.T) {
|
||||
|
||||
func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"FooBar","complexity":0.3,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
task, err := cls.Classify(context.Background(), "implement a binary search function", nil)
|
||||
if err != nil {
|
||||
@@ -178,7 +178,7 @@ func TestClassifier_UnknownTaskType_FallsBackToHeuristic(t *testing.T) {
|
||||
|
||||
func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"Debug","complexity":0.3,"requires_tools":true}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -190,7 +190,7 @@ func TestClassifier_SetsClassifierSource_OnSuccess(t *testing.T) {
|
||||
|
||||
func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
|
||||
p := &mockProvider{err: errors.New("backend unreachable")}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
task, err := cls.Classify(context.Background(), "fix the failing test", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -202,7 +202,7 @@ func TestClassifier_SetsClassifierSource_OnFallback(t *testing.T) {
|
||||
|
||||
func TestClassifier_ContextPassedToHistory(t *testing.T) {
|
||||
p := &mockProvider{text: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`}
|
||||
cls := NewClassifier(p, "default", nil)
|
||||
cls := NewClassifier(p, "default", 0, nil)
|
||||
|
||||
history := []message.Message{
|
||||
{Role: message.RoleUser, Content: []message.Content{{Type: message.ContentText, Text: "prior"}}},
|
||||
@@ -215,3 +215,45 @@ func TestClassifier_ContextPassedToHistory(t *testing.T) {
|
||||
t.Errorf("Type = %s, want Explain", task.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractJSON_StripsThinkingTags(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "qwen-think-block",
|
||||
in: `<think>Let me decide</think>{"task_type":"Debug","complexity":0.5,"requires_tools":true}`,
|
||||
want: `{"task_type":"Debug","complexity":0.5,"requires_tools":true}`,
|
||||
},
|
||||
{
|
||||
name: "tiny3.5-thought-process",
|
||||
in: "<Thought Process>\nUser wants debugging help.\n</Thought Process>\n{\"task_type\":\"Debug\",\"complexity\":0.4,\"requires_tools\":true}",
|
||||
want: `{"task_type":"Debug","complexity":0.4,"requires_tools":true}`,
|
||||
},
|
||||
{
|
||||
name: "unterminated-think-falls-back-to-brace",
|
||||
in: `<think>incomplete reasoning {"task_type":"Explain","complexity":0.2,"requires_tools":false}`,
|
||||
want: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`,
|
||||
},
|
||||
{
|
||||
name: "no-tags-still-works",
|
||||
in: `{"task_type":"Generation","complexity":0.6,"requires_tools":false}`,
|
||||
want: `{"task_type":"Generation","complexity":0.6,"requires_tools":false}`,
|
||||
},
|
||||
{
|
||||
name: "fenced-json-still-works",
|
||||
in: "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```",
|
||||
want: `{"task_type":"Refactor","complexity":0.5,"requires_tools":true}`,
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := extractJSON(tc.in)
|
||||
if got != tc.want {
|
||||
t.Errorf("extractJSON(...)\n got: %q\n want: %q", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1146,6 +1146,15 @@ func (m Model) submitInput(input string) (tea.Model, tea.Cmd) {
|
||||
m.thinkingBuf.Reset()
|
||||
m.streamFilterClose = ""
|
||||
|
||||
// Recover from a prior StateError before submitting a fresh user
|
||||
// prompt. A transient routing or engine failure used to leave the
|
||||
// session in error state, blocking every subsequent prompt with
|
||||
// "session not idle (state: error)" until the user restarted gnoma.
|
||||
// User-initiated sends always carry an intent-to-retry, so resetting
|
||||
// here is the safe default; the /init retry path has its own explicit
|
||||
// ResetError that we leave alone.
|
||||
m.session.ResetError()
|
||||
|
||||
if err := m.session.Send(expandedInput); err != nil {
|
||||
m.messages = append(m.messages, chatMessage{role: "error", content: formatError(err)})
|
||||
m.streaming = false
|
||||
@@ -1494,6 +1503,8 @@ func (m Model) handleCommand(cmd string) (tea.Model, tea.Cmd) {
|
||||
m.initWriteNudged = false
|
||||
|
||||
opts := engine.TurnOptions{}
|
||||
// Recover from prior StateError before /init can submit.
|
||||
m.session.ResetError()
|
||||
if err := m.session.SendWithOptions(prompt, opts); err != nil {
|
||||
m.messages = append(m.messages, chatMessage{role: "error", content: formatError(err)})
|
||||
m.streaming = false
|
||||
@@ -1695,6 +1706,8 @@ func (m Model) handleCommand(cmd string) (tea.Model, tea.Cmd) {
|
||||
AllowedTools: sk.Frontmatter.AllowedTools,
|
||||
AllowedPaths: sk.Frontmatter.Paths,
|
||||
}
|
||||
// Recover from prior StateError before the skill submits.
|
||||
m.session.ResetError()
|
||||
if err := m.session.SendWithOptions(rendered, skillOpts); err != nil {
|
||||
m.messages = append(m.messages, chatMessage{role: "error", content: formatError(err)})
|
||||
m.streaming = false
|
||||
|
||||
Reference in New Issue
Block a user