Files
vikingowl 0d3d190a8b fix(slm,session,router): classifier-only SLMs + session error recovery + feasibility diagnostics
Three coupled fixes that surfaced from a single FunctionGemma test
session where the SLM-as-execution-arm assumption broke down and
every subsequent prompt failed with 'session not idle (state: error)'.

(A) [slm].register_as_arm config. The SLM has always been
unconditionally registered as both classifier AND tier-0 execution
arm. Fine for general-purpose models (ministral, qwen3-chat); breaks
for task-specialised models (FunctionGemma emits function-call
syntax instead of prose; embedding models can't generate). New
pointer-bool config: nil/absent preserves the historical default
(true), explicit false makes the SLM classifier-only and the
execution path skips the slm/* arm. Three table tests cover absent
/ explicit-false / explicit-true decode paths.

(B) Session error recovery. After any routing or engine error, the
session moved to StateError and stayed there until restart — every
new user prompt got rejected with 'session not idle (state: error)'.
ResetError() was already wired for the /init retry path, but the
general user-input and slash-command paths didn't call it. Added
ResetError() before every user-initiated Send in the TUI so a fresh
prompt always represents intent-to-retry. The /init internal retry
already had its own ResetError; left alone.

(C) filterFeasible per-arm rejection logging. Today's 'no feasible
arm for task X' error tells you THAT every arm was rejected but
nothing about WHY. Added slog.Debug per rejection (arm, task,
complexity, reason, the specific violated constraint) plus a
summary line when zero arms are feasible at any quality. Visible
with --verbose; quiet otherwise. Surface area expansion only — no
behaviour change for users not chasing a bug.
2026-05-25 01:57:16 +02:00

404 lines
12 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package router
import (
"log/slog"
"math"
)
// Strategy identifies how a task should be executed. It is an int-backed
// enumeration whose values are declared in the const block that follows.
type Strategy int

const (
	// StrategySingleArm is the zero value of Strategy and the only
	// strategy declared so far.
	StrategySingleArm Strategy = iota
	// Future (M9): StrategyCascade, StrategyParallelEnsemble, StrategyMultiRound
)
// RoutingDecision is the result of arm selection.
//
// Besides the chosen arm it carries the pool reservations taken for the
// request; the Commit and Rollback methods act on those reservations.
type RoutingDecision struct {
	Strategy     Strategy       // how the task should be executed
	Arm          *Arm           // primary arm
	Error        error          // presumably non-nil when selection failed — confirm against the code that builds the decision
	reservations []*Reservation // pool reservations held until commit/rollback
}
// Commit finalizes the routing decision by committing every pool
// reservation it holds. Each reservation receives the same actualTokens
// figure. Call it once the request has completed successfully.
func (d RoutingDecision) Commit(actualTokens int) {
	for i := range d.reservations {
		d.reservations[i].Commit(actualTokens)
	}
}
// Rollback releases every pool reservation held by the routing decision
// without recording any usage. Call it when the request fails before any
// tokens have been consumed.
func (d RoutingDecision) Rollback() {
	for i := range d.reservations {
		d.reservations[i].Rollback()
	}
}
// armTier reports where an arm sits in the tier walk for a given task and
// prefer-policy. Lower tier = tried first.
//
// The starting point is armBaseTier:
//   - 0: specialized small arm (MaxComplexity > 0) whose ceiling fits this
//     task — so "the SLM does small stuff" actually happens.
//   - 1: CLI agent
//   - 2: local model (general purpose, no complexity ceiling)
//   - 3: API provider
//
// A prefer-policy then pushes the dispreferred arms down by two tiers:
//   - PreferLocal demotes arms that are neither IsLocal nor IsCLIAgent
//     (true cloud API arms). CLI-agent arms proxy to cloud but stay
//     "local" from a tooling perspective, so they keep their tier; users
//     who want them gone should use `--provider X` or the existing
//     exclude mechanisms.
//   - PreferCloud demotes IsLocal arms.
//
// The result therefore ranges from 0 to 5 (cloud 3 → 5 under PreferLocal,
// general local 2 → 4 under PreferCloud; a tier-0 local arm under
// PreferCloud lands on 2). Any other policy value leaves the base tier
// untouched.
//
// selectBest consults tiers only when no arm lists task.Type among its
// Strengths, so a prefer-policy never stops a strongly-tagged arm from
// winning the task it is tagged for.
func armTier(arm *Arm, task Task, prefer PreferPolicy) int {
	// How far a dispreferred arm is pushed down the tier order.
	const demotion = 2

	tier := armBaseTier(arm, task)

	pureCloud := !arm.IsLocal && !arm.IsCLIAgent
	if prefer == PreferLocal && pureCloud {
		return tier + demotion
	}
	if prefer == PreferCloud && arm.IsLocal {
		return tier + demotion
	}
	return tier
}
// armBaseTier returns the policy-independent tier of an arm for a task.
// The first matching rule wins:
//   - 0: the arm has a complexity ceiling (MaxComplexity > 0) and the
//     task's ComplexityScore does not exceed it
//   - 1: the arm is a CLI agent
//   - 2: the arm is local
//   - 3: anything else
func armBaseTier(arm *Arm, task Task) int {
	switch {
	case arm.MaxComplexity > 0 && task.ComplexityScore <= arm.MaxComplexity:
		return 0
	case arm.IsCLIAgent:
		return 1
	case arm.IsLocal:
		return 2
	default:
		return 3
	}
}
// selectBest picks the arm that should run the task, or nil when arms is
// empty (or no candidate can be chosen).
//
// Strength match first: every arm whose Strengths contain task.Type is
// collected, and if there is at least one, the highest-scoring of them
// wins regardless of tier — Opus tagged with SecurityReview beats a
// tier-1 CLI-agent arm for that task. Strengths are a preference, not a
// pin: when no such arm is in the input (the caller's feasibility filter
// may already have removed it), selection falls through to the tiers.
//
// Tier walk second: arms are grouped by armTier and the lowest populated
// tier is handed to bestScored. armTier can return up to 5 when a
// prefer-policy demotes an arm (tier 3 + 2), so tiers 0 through 5 are
// considered.
func selectBest(qt *QualityTracker, params BanditParams, arms []*Arm, task Task, prefer PreferPolicy) *Arm {
	if len(arms) == 0 {
		return nil
	}

	var specialists []*Arm
	for _, candidate := range arms {
		if candidate.HasStrength(task.Type) {
			specialists = append(specialists, candidate)
		}
	}
	if len(specialists) > 0 {
		return bestScored(qt, params, specialists, task, prefer)
	}

	// Highest tier armTier can produce: base tier 3 plus the demotion of 2.
	const maxTier = 5

	// Bucket the arms by tier in one pass, keeping input order inside
	// each bucket, then take the first non-empty bucket.
	var byTier [maxTier + 1][]*Arm
	for _, candidate := range arms {
		if t := armTier(candidate, task, prefer); t >= 0 && t <= maxTier {
			byTier[t] = append(byTier[t], candidate)
		}
	}
	for _, bucket := range byTier {
		if len(bucket) > 0 {
			return bestScored(qt, params, bucket, task, prefer)
		}
	}
	return nil
}
// bestScored returns the arm with the highest policy-weighted score in
// arms, i.e. scoreArm multiplied by policyMultiplier. On a tie the arm
// that appears first is kept, because only a strictly greater score
// replaces the current winner. It returns nil when arms is empty or no
// score compares greater than negative infinity.
func bestScored(qt *QualityTracker, params BanditParams, arms []*Arm, task Task, prefer PreferPolicy) *Arm {
	var (
		winner   *Arm
		topScore = math.Inf(-1)
	)
	for _, candidate := range arms {
		weighted := scoreArm(qt, params, candidate, task) * policyMultiplier(candidate, prefer)
		if weighted > topScore {
			winner, topScore = candidate, weighted
		}
	}
	return winner
}
// policyMultiplier returns the factor a prefer-policy applies to an arm's
// score:
//   - PreferLocal: 0.3 for arms that are not IsLocal, 1.0 otherwise
//   - PreferCloud: 0.5 for IsLocal arms, 1.0 otherwise
//   - any other policy: 1.0
//
// This is a soft bias only — the dispreferred set is never zeroed out, so
// when only cloud arms are feasible under PreferLocal a cloud arm can
// still win. The factors are calibrated against the typical scoreArm
// output range (~0.5-2.0): a 0.3 multiplier means roughly "a non-local
// arm must be ~3x better than a local one to win."
//
// CLI-agent subprocess arms count as non-local here because they proxy to
// cloud — the prefer knob is about the privacy/cost axis, not the
// tooling-locality axis. Users who want to pin subprocess specifically
// should use --provider subprocess, which bypasses the policy.
func policyMultiplier(arm *Arm, p PreferPolicy) float64 {
	if p == PreferLocal && !arm.IsLocal {
		return 0.3
	}
	if p == PreferCloud && arm.IsLocal {
		return 0.5
	}
	return 1.0
}
// scoreArm computes a quality/cost score for an arm on a task:
//
//	score = (quality * task.ValueScore()) / weightedCost
//
// Quality starts from heuristicQuality. When a quality tracker is supplied
// and reports data for this arm and task type, the observed value is
// blended in:
//
//	quality = ObservedWeight*observed + (1-ObservedWeight)*heuristic
//
// An arm whose Strengths include task.Type then gets StrengthBonus added.
// Both knobs come from BanditParams (config: [router.bandit]) after
// resolveBanditParams has filled in defaults for a zero-valued params.
//
// CostWeight dampens the cost penalty linearly:
//
//	weightedCost = 1 + CostWeight * (cost - 1)
//
// With CostWeight=1.0 this is just the cost; with CostWeight=0 cost is
// ignored entirely (weightedCost = 1.0). Both the raw cost and the
// weighted cost are floored at 0.001 when they come out non-positive so
// the division stays well-defined.
func scoreArm(qt *QualityTracker, params BanditParams, arm *Arm, task Task) float64 {
	// Replacement for a non-positive cost or divisor.
	const floor = 0.001

	params = resolveBanditParams(params)

	heuristic := heuristicQuality(arm, task)
	quality := heuristic
	if qt != nil {
		observed, ok := qt.Quality(arm.ID, task.Type)
		if ok {
			quality = params.ObservedWeight*observed + (1-params.ObservedWeight)*heuristic
		}
	}
	if arm.HasStrength(task.Type) {
		quality += params.StrengthBonus
	}

	value := task.ValueScore()

	cost := effectiveCost(arm, task)
	if cost <= 0 {
		cost = floor
	}

	divisor := 1.0 + arm.ResolvedCostWeight()*(cost-1.0)
	if divisor <= 0 {
		divisor = floor
	}

	return (quality * value) / divisor
}
// heuristicQuality estimates arm quality for a task without historical
// data. The result is clamped to [0, 1]. Starting from 0.5, the
// adjustments are applied in this order:
//
//  1. context window: +0.1 at 100000 tokens or more, a further +0.05 at
//     200000 or more
//  2. thinking-capable arm: +0.2 for planning, orchestration and
//     security-review tasks, +0.1 for debug and refactor tasks
//  3. task requires tools the arm lacks: score multiplied by 0.1
//  4. local arm: +0.05 (no network latency, privacy)
//  5. local arm on a task with ComplexityScore above 0.7: multiplied by 0.7
//
// The order matters because additive and multiplicative steps are mixed.
func heuristicQuality(arm *Arm, task Task) float64 {
	quality := 0.5

	// The two context-window bonuses stack.
	if window := arm.Capabilities.ContextWindow; window >= 100000 {
		quality += 0.1
		if window >= 200000 {
			quality += 0.05
		}
	}

	// How much thinking is worth for this kind of task; zero for task
	// types where it makes no difference.
	thinkingBonus := 0.0
	switch task.Type {
	case TaskPlanning, TaskOrchestration, TaskSecurityReview:
		thinkingBonus = 0.2
	case TaskDebug, TaskRefactor:
		thinkingBonus = 0.1
	}
	if arm.Capabilities.SupportsThinking() {
		quality += thinkingBonus
	}

	// Missing tool support on a tool task is a heavy penalty, not a veto.
	if task.RequiresTools && !arm.SupportsTools() {
		quality *= 0.1
	}

	if arm.IsLocal {
		// Small boost first, then the penalty for complex tasks.
		quality += 0.05
		if task.ComplexityScore > 0.7 {
			quality *= 0.7
		}
	}

	switch {
	case quality > 1.0:
		return 1.0
	case quality < 0.0:
		return 0.0
	}
	return quality
}
// effectiveCost returns the arm's estimated cost for the task, inflated
// by pool scarcity. The estimate comes from arm.EstimateCost on the
// task's EstimatedTokens; a non-positive estimate is replaced by 0.001 so
// that ~free local models still have a non-zero cost for scoring. The
// estimate is then multiplied by the largest ScarcityMultiplier across the
// arm's pools, never by less than 1.0.
func effectiveCost(arm *Arm, task Task) float64 {
	cost := arm.EstimateCost(task.EstimatedTokens)
	if cost <= 0 {
		cost = 0.001
	}

	// The scarcest pool decides; multipliers below 1.0 are ignored.
	scarcity := 1.0
	for _, pool := range arm.Pools {
		if m := pool.ScarcityMultiplier(); m > scarcity {
			scarcity = m
		}
	}

	return cost * scarcity
}
// filterFeasible returns the arms that can handle the task.
//
// Each arm is checked, in order, against: its complexity ceiling, tool
// support, vision support, the required effort level, and the capacity of
// every pool it draws from. Arms that pass all of these but whose
// heuristicQuality is below the task type's minimum threshold are set
// aside rather than discarded.
//
// The result is chosen in this order:
//  1. arms that passed every check, if any;
//  2. otherwise the below-threshold arms, if any (graceful degradation);
//  3. otherwise, for tasks that require tools, a last-resort scan (see the
//     comment on that loop);
//  4. otherwise an empty result, which the caller surfaces as a generic
//     "no feasible arm" error.
//
// Every hard rejection in the main pass is logged at slog.Debug with the
// constraint the arm tripped, so users debugging "why did the router
// reject everything?" with --verbose can see the reason instead of
// guessing.
func filterFeasible(arms []*Arm, task Task) []*Arm {
	threshold := DefaultThresholds[task.Type]

	// logRejection emits one debug record for a discarded arm: the common
	// fields first, then whatever is specific to the violated constraint.
	logRejection := func(arm *Arm, reason string, extra ...any) {
		attrs := append([]any{
			"arm", arm.ID,
			"task", task.Type,
			"complexity", task.ComplexityScore,
			"reason", reason,
		}, extra...)
		slog.Debug("filterFeasible: rejected", attrs...)
	}

	var (
		feasible   []*Arm
		subQuality []*Arm // passed every hard check but scored below the quality floor
	)

candidates:
	for _, arm := range arms {
		switch {
		case arm.MaxComplexity > 0 && task.ComplexityScore > arm.MaxComplexity:
			// A zero MaxComplexity means "no ceiling".
			logRejection(arm, "complexity_exceeds_max",
				"max_complexity", arm.MaxComplexity)
			continue candidates
		case task.RequiresTools && !arm.SupportsTools():
			logRejection(arm, "tools_required_but_unsupported",
				"tool_use_capability", arm.Capabilities.ToolUse)
			continue candidates
		case task.RequiresVision && !arm.Capabilities.Vision:
			// Vision is a hard requirement: a non-vision arm physically
			// cannot consume the image bytes, so degrading to it would
			// silently drop the image and confuse the model.
			logRejection(arm, "vision_required_but_unsupported",
				"vision_capability", arm.Capabilities.Vision)
			continue candidates
		case !arm.Capabilities.SupportsEffort(task.RequiredEffort):
			// EffortAuto always passes.
			logRejection(arm, "effort_level_unsupported",
				"required_effort", task.RequiredEffort)
			continue candidates
		}

		// Every pool must be able to afford the task; the first pool
		// that cannot rejects the arm. CheckReset runs on each pool
		// before it is asked.
		for _, pool := range arm.Pools {
			pool.CheckReset()
			if pool.CanAfford(arm.ID, task.EstimatedTokens) {
				continue
			}
			logRejection(arm, "pool_capacity_exceeded",
				"estimated_tokens", task.EstimatedTokens)
			continue candidates
		}

		// Quality floor: park the arm instead of dropping it.
		if heuristicQuality(arm, task) < threshold.Minimum {
			subQuality = append(subQuality, arm)
			continue
		}
		feasible = append(feasible, arm)
	}

	if len(feasible) > 0 {
		return feasible
	}
	// Degrade gracefully: nothing met the threshold, so use what almost did.
	if len(subQuality) > 0 {
		return subQuality
	}

	slog.Debug("filterFeasible: no arms feasible at any quality level",
		"task", task.Type,
		"complexity", task.ComplexityScore,
		"requires_tools", task.RequiresTools,
		"requires_vision", task.RequiresVision,
		"arms_considered", len(arms),
	)

	if !task.RequiresTools {
		return feasible
	}

	// Last resort for tool tasks: rescan all arms requiring only the
	// ToolUse capability, vision when the task needs it, and pool
	// affordability. The complexity ceiling, effort level and quality
	// floor are not re-applied here, and pools are not reset first.
lastResort:
	for _, arm := range arms {
		if !arm.Capabilities.ToolUse || (task.RequiresVision && !arm.Capabilities.Vision) {
			continue
		}
		for _, pool := range arm.Pools {
			if !pool.CanAfford(arm.ID, task.EstimatedTokens) {
				continue lastResort
			}
		}
		feasible = append(feasible, arm)
	}
	return feasible
}