package router

import (
	"regexp"
	"strconv"
	"strings"
)

// FamilyDefaults holds the per-model-family routing defaults applied at
// discovery time when the user has not supplied an [[arms]] override in
// config. Populated from the benchmark snapshot dated 2026-05-23
// (artificialanalysis.ai v4.0, llm-stats.com, kilo.ai); see
// docs/superpowers/plans/2026-05-23-routing-defaults-refresh.md for
// rationale per entry.
//
// Zero-valued fields mean "router default" — only non-zero fields are
// applied. That keeps the table honest: an unset MaxComplexity stays 0
// (no ceiling) rather than getting a fake value.
//
// For families that span a wide parameter range (ministral-3 from
// 3B to 14B, qwen3 from 4B to 14B, tiny3.5 from 0.5B to 1.5B), use
// SizeCaps instead of MaxComplexity. The first SizeCap whose
// MinSizeB threshold the parsed model size meets wins; entries must
// be ordered largest-first.
type FamilyDefaults struct {
	// Strengths lists the task types the family is promoted for. An empty
	// list means nothing promotes the arm.
	Strengths []TaskType
	// MaxComplexity is the complexity ceiling for the family; 0 means no
	// ceiling. Ignored by ResolveMaxComplexity when SizeCaps is non-empty.
	MaxComplexity float64
	// CostWeight scales how strongly cost counts against the arm; 0 means
	// router default. Per the table's commentary, larger values penalize
	// cost more.
	CostWeight float64
	// Disabled keeps arms of this family out of auto-routing while still
	// registering them.
	Disabled bool
	// SizeCaps gives size-dependent ceilings, ordered largest MinSizeB
	// first. When set, it takes precedence over MaxComplexity.
	SizeCaps []SizeCap
}

// SizeCap maps a minimum parameter count (in billions) to a
// MaxComplexity ceiling. Used in FamilyDefaults.SizeCaps when a family
// covers many sizes that warrant different ceilings.
type SizeCap struct {
	// MinSizeB is the smallest model size, in billions of parameters, to
	// which Cap applies.
	MinSizeB float64
	// Cap is the MaxComplexity ceiling for models of at least MinSizeB.
	Cap float64
}

// knownFamilyDefaults is the family-prefix → defaults lookup table.
// Matching is longest-prefix-wins via ResolveFamilyDefaults, so
// "qwen3-coder" beats "qwen3" beats "qwen". Keys are matched against the
// model ID with case-insensitive prefix; namespace prefixes ending in "/"
// are stripped before matching (so reecdev/tiny3.5:1.5b also matches
// "tiny3.5").
//
// See the routing-defaults-refresh plan for the rationale per row.
// functiongemma is the only Disabled entry; everything else is auto-
// routable. Coder-family Strengths lean on the SWE-bench / Aider /
// HumanEval rankings in the 2026-05-23 snapshot; reasoning-family
// Strengths lean on MMLU / MATH / GPQA.
var knownFamilyDefaults = map[string]FamilyDefaults{
	// --- Coder specialists --------------------------------------------------
	"qwen3-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		MaxComplexity: 0.85, // 30B-A3B; 44.3% SWE-Bench Pro
	},
	"qwen2.5-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskUnitTest},
		MaxComplexity: 0.70, // 14B; Aider 73.7
	},
	"devstral": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		MaxComplexity: 0.85, // 24B; 68% SWE-bench Verified, vision-capable
	},
	"yi-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.55, // 9B; HumanEval 85.4
	},
	"deepseek-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.65, // V2 Lite MoE; 16B-quality at 3B-speed
	},
	"starcoder": {
		Strengths:     []TaskType{TaskGeneration},
		MaxComplexity: 0.45, // fill-in-middle specialist
	},

	// --- Reasoning specialists ----------------------------------------------
	// "phi-4-mini" is longer than "phi-4", so mini variants resolve to the
	// lower ceiling under longest-prefix matching.
	"phi-4-mini": {
		Strengths:     []TaskType{TaskBoilerplate, TaskExplain},
		MaxComplexity: 0.35, // 3.8B compact
	},
	"phi-4": {
		Strengths:     []TaskType{TaskPlanning, TaskDebug, TaskReview},
		MaxComplexity: 0.65, // 14B; MMLU 84.8, HumanEval 82.6
	},

	// --- Gemma family -------------------------------------------------------
	// Both the Ollama-style ("gemma4...") and GGUF-style ("gemma-4...")
	// spellings are listed because matching is a plain string prefix.
	"gemma4-e": { // Ollama-style edge ("gemma4-e4b-uc:latest")
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.45,
	},
	"gemma-4-e": { // GGUF-style edge ("gemma-4-e2b-it", "gemma-4-e4b-it")
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.45,
	},
	"gemma4": { // base ~9B multimodal
		Strengths:     []TaskType{TaskExplain, TaskReview, TaskGeneration},
		MaxComplexity: 0.70,
	},
	"gemma-4": { // GGUF base variant — catch-all under hyphenated naming
		Strengths:     []TaskType{TaskExplain, TaskReview, TaskGeneration},
		MaxComplexity: 0.70,
	},
	"gemma3": {
		Strengths:     []TaskType{TaskExplain, TaskReview},
		MaxComplexity: 0.55,
	},
	"gemma2": {
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.40,
	},

	// --- Qwen family (size-keyed for the variants that span ranges) --------
	"qwen3.5": {
		Strengths: []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
		SizeCaps: []SizeCap{
			{MinSizeB: 9, Cap: 0.65}, // 9B distill (e.g. qwen3.5-9b-glm5.1-distill-v1)
			{MinSizeB: 4, Cap: 0.50},
			{MinSizeB: 0, Cap: 0.40},
		},
	},
	"qwen3": {
		Strengths: []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.75},
			{MinSizeB: 7, Cap: 0.65},
			{MinSizeB: 0, Cap: 0.50},
		},
	},
	"qwen2.5": {
		Strengths: []TaskType{TaskExplain, TaskRefactor},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.65},
			{MinSizeB: 7, Cap: 0.55},
			{MinSizeB: 0, Cap: 0.40},
		},
	},
	"qwen": { // catch-all for unmatched Qwen variants
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.40,
	},

	// --- Mistral / Ministral families --------------------------------------
	"ministral-3": {
		Strengths: []TaskType{TaskOrchestration, TaskPlanning},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.70},
			{MinSizeB: 8, Cap: 0.55},
			{MinSizeB: 0, Cap: 0.35},
		},
	},
	"mistral-small-3": {
		Strengths:     []TaskType{TaskOrchestration, TaskReview},
		MaxComplexity: 0.65, // 24B; MMLU 81
	},
	"mistral": { // catch-all for Mistral 7B / Nemo / etc.
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.50,
	},

	// --- Llama family -------------------------------------------------------
	"llama4": {
		Strengths:     []TaskType{TaskExplain, TaskReview},
		MaxComplexity: 0.50, // Scout / Maverick variants
	},
	"llama3.2": {
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.35, // tool-call friendly small
	},

	// --- Tiny / draft-class -------------------------------------------------
	"tiny3.5": {
		Strengths: []TaskType{TaskBoilerplate, TaskExplain},
		SizeCaps: []SizeCap{
			{MinSizeB: 1.5, Cap: 0.30},
			{MinSizeB: 0, Cap: 0.20},
		},
	},
	"granite": {
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.30, // IBM 8B and similar
	},

	// --- Vision-capable / specialists --------------------------------------
	"minicpm-v": {
		Strengths:     []TaskType{TaskPlanning, TaskReview},
		MaxComplexity: 0.55, // vision-thinking; vision flag set via prefix list
	},
	"glm-ocr": {
		// No Strengths — narrow OCR-only specialist. Vision flag is set
		// via knownVisionModelPrefixes; arm is registered but the router
		// will rarely pick it because nothing promotes it.
		MaxComplexity: 0.30,
	},
	"glm": { // catch-all GLM family
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.45,
	},

	// --- Closed-source frontier (cloud arms) --------------------------------
	// Cloud entries set Strengths and CostWeight but leave MaxComplexity
	// zero — cloud arms shouldn't have a complexity ceiling. CostWeight
	// rationale per the 2026-05-23 plan:
	//   - 0.3 on frontier arms (Opus 4.7, GPT-5.5): keep them competitive
	//     for high-stakes tasks (SecurityReview, Planning) despite $4+/Mtok.
	//   - 0.5-0.7 on mid-tier coding specialists: standard cost influence.
	//   - 1.2 on cheap fast arms (Gemini 3.5 Flash): penalize cost more
	//     so they win only when cost is genuinely decisive.
	"claude-opus-4-7": {
		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor},
		CostWeight: 0.3,
	},
	"claude-sonnet-4-6": {
		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskReview},
		CostWeight: 0.7,
	},
	"gpt-5.5": {
		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration},
		CostWeight: 0.3,
	},
	"gpt-5.3-codex": {
		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest},
		CostWeight: 0.6,
	},
	// NOTE(review): 0.8 sits between the documented 0.5-0.7 and 1.2 tiers
	// above — confirm against the plan that this value is intended.
	"gpt-5.2": {
		Strengths:  []TaskType{TaskOrchestration, TaskReview},
		CostWeight: 0.8,
	},
	"gemini-3.1-pro": {
		Strengths:  []TaskType{TaskPlanning, TaskReview, TaskOrchestration},
		CostWeight: 0.5,
	},
	"gemini-3.5-flash": {
		Strengths:  []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
		CostWeight: 1.2,
	},

	// --- Tool-router specialist (reserved, not auto-routed) -----------------
	// functiongemma is Google's 270M function-calling specialist. It is
	// not a chat model — it emits structured tool calls, not prose. We
	// register it so it shows up in `gnoma providers` but mark it
	// Disabled to keep it out of auto-routing until the dedicated
	// ArmRoleToolRouter path ships. See
	// docs/superpowers/plans/2026-05-23-tool-router-specialization.md
	// for the phased plan (telemetry → fine-tune → wire in).
	"functiongemma": {
		Strengths:     []TaskType{TaskOrchestration},
		MaxComplexity: 0.40,
		Disabled:      true,
	},
}

// ResolveFamilyDefaults returns the defaults for the given model ID, if
// any family prefix matches. Matching strategy:
//
//  1. Lowercase the ID.
//  2. Strip any namespace prefix ending in "/" (so "reecdev/tiny3.5:1.5b"
//     becomes "tiny3.5:1.5b").
//  3. Among the family keys whose lowercase value is a prefix of the
//     stripped ID, return the entry with the longest matching key.
//
// Returns (FamilyDefaults{}, false) when no family matches.
func ResolveFamilyDefaults(modelID string) (FamilyDefaults, bool) { low := strings.ToLower(modelID) if slash := strings.LastIndex(low, "/"); slash >= 0 { low = low[slash+1:] } var bestKey string var bestDefaults FamilyDefaults found := false for key, defaults := range knownFamilyDefaults { k := strings.ToLower(key) if !strings.HasPrefix(low, k) { continue } if len(k) > len(bestKey) { bestKey = k bestDefaults = defaults found = true } } return bestDefaults, found } // ResolveMaxComplexity returns the MaxComplexity ceiling for the given // model ID using its family defaults. If the family declares SizeCaps, // the parsed parameter count selects the matching cap. If size parsing // fails or the family has neither SizeCaps nor MaxComplexity, returns // (0, false). func ResolveMaxComplexity(modelID string) (float64, bool) { defaults, ok := ResolveFamilyDefaults(modelID) if !ok { return 0, false } if len(defaults.SizeCaps) > 0 { sizeB, sized := parseSizeFromModelID(modelID) if !sized { // Size parse failed — fall back to the smallest cap so we're // conservative rather than optimistic. return defaults.SizeCaps[len(defaults.SizeCaps)-1].Cap, true } for _, sc := range defaults.SizeCaps { if sizeB >= sc.MinSizeB { return sc.Cap, true } } return defaults.SizeCaps[len(defaults.SizeCaps)-1].Cap, true } if defaults.MaxComplexity > 0 { return defaults.MaxComplexity, true } return 0, false } // applyFamilyDefaults populates zero-valued routing fields on an Arm from // the family-defaults table. Only fields that are still at their zero // value get filled — user-supplied Strengths, MaxComplexity, CostWeight, // or Disabled are never overwritten. Returns true when at least one // family entry matched, false when the model is unknown. // // Looks up by arm.ModelName first; falls back to arm.ID.Model() when // ModelName is empty (which test code commonly omits). 
func applyFamilyDefaults(arm *Arm) bool { if arm == nil { return false } modelKey := arm.ModelName if modelKey == "" { modelKey = arm.ID.Model() } defaults, ok := ResolveFamilyDefaults(modelKey) if !ok { return false } if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 { arm.Strengths = defaults.Strengths } if arm.MaxComplexity == 0 { if cap, capOK := ResolveMaxComplexity(modelKey); capOK { arm.MaxComplexity = cap } } if arm.CostWeight == 0 && defaults.CostWeight > 0 { arm.CostWeight = defaults.CostWeight } if defaults.Disabled { arm.Disabled = true } return true } // pureSizeToken matches a token consisting of digits (optionally with a // single decimal point) followed by 'b' or 'm' — and nothing else. Used // after splitting the model ID on `:`, `-`, `_`, `/` to extract a pure // parameter-size token like "14b", "1.5b", "500m" while ignoring tokens // like "a3b" (active params, MoE) or "v0.3" (version). var pureSizeToken = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)([bm])$`) // parseSizeFromModelID extracts the model's parameter count in billions // from its ID. Splits on common separators and looks for tokens of the // form `b` or `m` (millions converted to billions). Returns the // largest match — for IDs like "qwen3-coder:30b-a3b-q4_K_M" we want the // total (30) rather than the active-params token (a3b would be skipped // anyway because it isn't pure-digit prefixed). func parseSizeFromModelID(id string) (float64, bool) { low := strings.ToLower(id) pieces := strings.FieldsFunc(low, func(r rune) bool { switch r { case ':', '-', '_', '/': return true } return false }) var best float64 found := false for _, p := range pieces { m := pureSizeToken.FindStringSubmatch(p) if m == nil { continue } n, err := strconv.ParseFloat(m[1], 64) if err != nil { continue } if m[2] == "m" { n /= 1000.0 } if n > best { best = n found = true } } return best, found }