// gnoma/internal/router/defaults.go

package router

import (
	"regexp"
	"strconv"
	"strings"
)

// FamilyDefaults are the per-model-family routing defaults applied at
// discovery time when the user has not supplied an [[arms]] override in
// config. Populated from the benchmark snapshot dated 2026-05-23
// (artificialanalysis.ai v4.0, llm-stats.com, kilo.ai); see
// docs/superpowers/plans/2026-05-23-routing-defaults-refresh.md for
// rationale per entry.
//
// Zero-valued fields mean "router default" — only non-zero fields are
// applied. That keeps the table honest: an unset MaxComplexity stays 0
// (no ceiling) rather than getting a fake value.
//
// For families that span a wide parameter range (ministral-3 from
// 3B to 14B, qwen3 from 4B to 14B, tiny3.5 from 0.5B to 1.5B), use
// SizeCaps instead of MaxComplexity. The first SizeCap whose
// MinSizeB threshold the parsed model size meets wins; entries must
// be ordered largest-first.
//
// When SizeCaps is non-empty, ResolveMaxComplexity does not consult
// MaxComplexity at all, and a model ID with no parseable size receives
// the Cap of the last SizeCaps entry.
type FamilyDefaults struct {
	Strengths     []TaskType // given to an arm that has no Strengths of its own
	MaxComplexity float64    // flat complexity ceiling; only used when SizeCaps is empty
	CostWeight    float64    // applied only when > 0 and the arm's CostWeight is still 0
	Disabled      bool       // when true, matching arms are marked Disabled
	SizeCaps      []SizeCap  // size-keyed ceilings, ordered from largest MinSizeB to smallest
}

// SizeCap maps a minimum parameter count (in billions) to a
// MaxComplexity ceiling. Used in FamilyDefaults.SizeCaps when a family
// covers many sizes that warrant different ceilings. An entry applies
// to a model whose parsed size is greater than or equal to MinSizeB.
type SizeCap struct {
	MinSizeB float64 // inclusive lower bound on the model's parameter count, in billions
	Cap      float64 // MaxComplexity ceiling returned when this entry is selected
}

// knownFamilyDefaults is the family-prefix → defaults lookup table.
// Matching is longest-prefix-wins via ResolveFamilyDefaults, so
// "qwen3-coder" beats "qwen3" beats "qwen"; map iteration order does
// not affect the result. Keys are matched against the model ID as a
// case-insensitive prefix after everything up to and including the
// last "/" has been dropped (so reecdev/tiny3.5:1.5b also matches
// "tiny3.5").
//
// Entries that set SizeCaps leave MaxComplexity at zero; for those
// families ResolveMaxComplexity selects the ceiling from the parameter
// count parsed out of the model ID.
//
// See the routing-defaults-refresh plan for the rationale per row.
// functiongemma is the only Disabled entry; everything else is auto-
// routable. Coder-family Strengths lean on the SWE-bench / Aider /
// HumanEval rankings in the 2026-05-23 snapshot; reasoning-family
// Strengths lean on MMLU / MATH / GPQA.
var knownFamilyDefaults = map[string]FamilyDefaults{
	// --- Coder specialists --------------------------------------------------
	"qwen3-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		MaxComplexity: 0.85, // 30B-A3B; 44.3% SWE-Bench Pro
	},
	"qwen2.5-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskUnitTest},
		MaxComplexity: 0.70, // 14B; Aider 73.7
	},
	"devstral": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		MaxComplexity: 0.85, // 24B; 68% SWE-bench Verified, vision-capable
	},
	"yi-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.55, // 9B; HumanEval 85.4
	},
	"deepseek-coder": {
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.65, // V2 Lite MoE; 16B-quality at 3B-speed
	},
	"starcoder": {
		Strengths:     []TaskType{TaskGeneration},
		MaxComplexity: 0.45, // fill-in-middle specialist
	},

	// --- Reasoning specialists ----------------------------------------------
	// "phi-4-mini" is listed alongside "phi-4"; the longer key wins for
	// mini model IDs because matching is longest-prefix.
	"phi-4-mini": {
		Strengths:     []TaskType{TaskBoilerplate, TaskExplain},
		MaxComplexity: 0.35, // 3.8B compact
	},
	"phi-4": {
		Strengths:     []TaskType{TaskPlanning, TaskDebug, TaskReview},
		MaxComplexity: 0.65, // 14B; MMLU 84.8, HumanEval 82.6
	},

	// --- Gemma family -------------------------------------------------------
	// Both the unhyphenated ("gemma4") and hyphenated ("gemma-4") spellings
	// are listed so either naming style resolves to the same defaults.
	"gemma4-e": { // Ollama-style edge ("gemma4-e4b-uc:latest")
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.45,
	},
	"gemma-4-e": { // GGUF-style edge ("gemma-4-e2b-it", "gemma-4-e4b-it")
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.45,
	},
	"gemma4": { // base ~9B multimodal
		Strengths:     []TaskType{TaskExplain, TaskReview, TaskGeneration},
		MaxComplexity: 0.70,
	},
	"gemma-4": { // GGUF base variant — catch-all under hyphenated naming
		Strengths:     []TaskType{TaskExplain, TaskReview, TaskGeneration},
		MaxComplexity: 0.70,
	},
	"gemma3": {
		Strengths:     []TaskType{TaskExplain, TaskReview},
		MaxComplexity: 0.55,
	},
	"gemma2": {
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.40,
	},

	// --- Qwen family (size-keyed for the variants that span ranges) --------
	// SizeCaps entries are ordered largest MinSizeB first; the final
	// MinSizeB: 0 entry catches every smaller (or unparseable) size.
	"qwen3.5": {
		Strengths: []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
		SizeCaps: []SizeCap{
			{MinSizeB: 9, Cap: 0.65}, // 9B distill (e.g. qwen3.5-9b-glm5.1-distill-v1)
			{MinSizeB: 4, Cap: 0.50},
			{MinSizeB: 0, Cap: 0.40},
		},
	},
	"qwen3": {
		Strengths: []TaskType{TaskGeneration, TaskRefactor, TaskDebug},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.75},
			{MinSizeB: 7, Cap: 0.65},
			{MinSizeB: 0, Cap: 0.50},
		},
	},
	"qwen2.5": {
		Strengths: []TaskType{TaskExplain, TaskRefactor},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.65},
			{MinSizeB: 7, Cap: 0.55},
			{MinSizeB: 0, Cap: 0.40},
		},
	},
	"qwen": { // catch-all for unmatched Qwen variants
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.40,
	},

	// --- Mistral / Ministral families --------------------------------------
	"ministral-3": {
		Strengths: []TaskType{TaskOrchestration, TaskPlanning},
		SizeCaps: []SizeCap{
			{MinSizeB: 14, Cap: 0.70},
			{MinSizeB: 8, Cap: 0.55},
			{MinSizeB: 0, Cap: 0.35},
		},
	},
	"mistral-small-3": {
		Strengths:     []TaskType{TaskOrchestration, TaskReview},
		MaxComplexity: 0.65, // 24B; MMLU 81
	},
	"mistral": { // catch-all for Mistral 7B / Nemo / etc.
		Strengths:     []TaskType{TaskGeneration, TaskRefactor},
		MaxComplexity: 0.50,
	},

	// --- Llama family -------------------------------------------------------
	"llama4": {
		Strengths:     []TaskType{TaskExplain, TaskReview},
		MaxComplexity: 0.50, // Scout / Maverick variants
	},
	"llama3.2": {
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.35, // tool-call friendly small
	},

	// --- Tiny / draft-class -------------------------------------------------
	"tiny3.5": {
		Strengths: []TaskType{TaskBoilerplate, TaskExplain},
		SizeCaps: []SizeCap{
			{MinSizeB: 1.5, Cap: 0.30},
			{MinSizeB: 0, Cap: 0.20},
		},
	},
	"granite": {
		Strengths:     []TaskType{TaskExplain, TaskBoilerplate},
		MaxComplexity: 0.30, // IBM 8B and similar
	},

	// --- Vision-capable / specialists --------------------------------------
	"minicpm-v": {
		Strengths:     []TaskType{TaskPlanning, TaskReview},
		MaxComplexity: 0.55, // vision-thinking; vision flag set via prefix list
	},
	"glm-ocr": {
		// No Strengths — narrow OCR-only specialist. Vision flag is set
		// via knownVisionModelPrefixes; arm is registered but the router
		// will rarely pick it because nothing promotes it.
		MaxComplexity: 0.30,
	},
	"glm": { // catch-all GLM family
		Strengths:     []TaskType{TaskExplain},
		MaxComplexity: 0.45,
	},

	// --- Closed-source frontier (cloud arms) --------------------------------
	// Cloud entries set Strengths and CostWeight but leave MaxComplexity
	// zero — cloud arms shouldn't have a complexity ceiling. CostWeight
	// rationale per the 2026-05-23 plan:
	//   - 0.3 on frontier arms (Opus 4.7, GPT-5.5): keep them competitive
	//     for high-stakes tasks (SecurityReview, Planning) despite $4+/Mtok.
	//   - 0.5-0.7 on mid-tier coding specialists: standard cost influence.
	//   - 1.2 on cheap fast arms (Gemini 3.5 Flash): penalize cost more
	//     so they win only when cost is genuinely decisive.
	"claude-opus-4-7": {
		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor},
		CostWeight: 0.3,
	},
	"claude-sonnet-4-6": {
		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskReview},
		CostWeight: 0.7,
	},
	"gpt-5.5": {
		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration},
		CostWeight: 0.3,
	},
	"gpt-5.3-codex": {
		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest},
		CostWeight: 0.6,
	},
	"gpt-5.2": {
		Strengths:  []TaskType{TaskOrchestration, TaskReview},
		CostWeight: 0.8,
	},
	"gemini-3.1-pro": {
		Strengths:  []TaskType{TaskPlanning, TaskReview, TaskOrchestration},
		CostWeight: 0.5,
	},
	"gemini-3.5-flash": {
		Strengths:  []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
		CostWeight: 1.2,
	},

	// --- Tool-router specialist (reserved, not auto-routed) -----------------
	// functiongemma is Google's 270M function-calling specialist. It is
	// not a chat model — it emits structured tool calls, not prose. We
	// register it so it shows up in `gnoma providers` but mark it
	// Disabled to keep it out of auto-routing until the dedicated
	// ArmRoleToolRouter path ships. See
	// docs/superpowers/plans/2026-05-23-tool-router-specialization.md
	// for the phased plan (telemetry → fine-tune → wire in).
	"functiongemma": {
		Strengths:     []TaskType{TaskOrchestration},
		MaxComplexity: 0.40,
		Disabled:      true,
	},
}

// ResolveFamilyDefaults looks up the family defaults that apply to
// modelID. The ID is normalised and then compared against every key in
// knownFamilyDefaults:
//
//  1. The ID is lowercased.
//  2. Everything up to and including the last "/" is discarded, so
//     "reecdev/tiny3.5:1.5b" is matched as "tiny3.5:1.5b".
//  3. Of all keys that are a prefix of the remaining text, the longest
//     one is selected.
//
// The second result is false, and the first is the zero FamilyDefaults,
// when no key is a prefix of the normalised ID.
//
// The returned struct is a shallow copy of the table entry: its
// Strengths and SizeCaps slices share storage with the table and
// should be treated as read-only.
func ResolveFamilyDefaults(modelID string) (FamilyDefaults, bool) {
	id := strings.ToLower(modelID)
	// LastIndex yields -1 when there is no "/", which makes this a no-op.
	id = id[strings.LastIndex(id, "/")+1:]

	var (
		winner    FamilyDefaults
		winnerLen int
		matched   bool
	)
	for family, entry := range knownFamilyDefaults {
		prefix := strings.ToLower(family)
		// Only a strictly longer prefix may displace the current winner.
		if len(prefix) > winnerLen && strings.HasPrefix(id, prefix) {
			winner, winnerLen, matched = entry, len(prefix), true
		}
	}
	return winner, matched
}

// ResolveMaxComplexity reports the MaxComplexity ceiling that the
// family defaults assign to modelID.
//
// For a family with SizeCaps, the parameter count parsed from the ID
// picks the first entry whose MinSizeB it reaches. If no size can be
// parsed, or the size is below every threshold, the Cap of the last
// entry is returned; with the table ordered largest-first that is the
// most conservative ceiling. A family with SizeCaps always yields
// (cap, true).
//
// For a family without SizeCaps, a positive MaxComplexity is returned
// as-is. The result is (0, false) when no family matches or the family
// declares neither SizeCaps nor a positive MaxComplexity.
func ResolveMaxComplexity(modelID string) (float64, bool) {
	family, known := ResolveFamilyDefaults(modelID)
	switch {
	case !known:
		return 0, false
	case len(family.SizeCaps) == 0:
		if family.MaxComplexity > 0 {
			return family.MaxComplexity, true
		}
		return 0, false
	}

	tiers := family.SizeCaps
	if params, parsed := parseSizeFromModelID(modelID); parsed {
		for i := range tiers {
			if params >= tiers[i].MinSizeB {
				return tiers[i].Cap, true
			}
		}
	}
	// Either the size is unknown or it is under every threshold: use the
	// final tier rather than guessing high.
	return tiers[len(tiers)-1].Cap, true
}

// applyFamilyDefaults populates zero-valued routing fields on an Arm from
// the family-defaults table. Strengths, MaxComplexity and CostWeight are
// filled only while they are still at their zero value, so values the
// user supplied are never overwritten. Disabled is one-directional: a
// family marked Disabled sets arm.Disabled to true, and an arm that is
// already disabled is never re-enabled here.
//
// Returns true when a family entry matched the arm's model, false when
// arm is nil or the model is unknown.
//
// Looks up by arm.ModelName first; falls back to arm.ID.Model() when
// ModelName is empty (which test code commonly omits).
func applyFamilyDefaults(arm *Arm) bool {
	if arm == nil {
		return false
	}
	modelKey := arm.ModelName
	if modelKey == "" {
		modelKey = arm.ID.Model()
	}
	defaults, ok := ResolveFamilyDefaults(modelKey)
	if !ok {
		return false
	}
	if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 {
		// Copy instead of aliasing: the defaults slice is backed by the
		// package-level table, so an in-place edit on one arm (sorting,
		// overwriting an element) would otherwise leak into every other
		// arm of the same family. The zero-capacity slice forces append
		// to allocate fresh storage.
		arm.Strengths = append(defaults.Strengths[:0:0], defaults.Strengths...)
	}
	if arm.MaxComplexity == 0 {
		// ResolveMaxComplexity handles both flat ceilings and SizeCaps.
		if ceiling, found := ResolveMaxComplexity(modelKey); found {
			arm.MaxComplexity = ceiling
		}
	}
	if arm.CostWeight == 0 && defaults.CostWeight > 0 {
		arm.CostWeight = defaults.CostWeight
	}
	if defaults.Disabled {
		arm.Disabled = true
	}
	return true
}

// pureSizeToken recognises a token that is, in its entirety, a decimal
// number (digits with at most one fractional part) followed by a single
// unit letter: 'b' or 'm'. It is applied to the pieces left after a
// model ID has been split on `:`, `-`, `_` and `/`, so "14b", "1.5b"
// and "500m" match while "a3b" (MoE active params) and "v0.3" (a
// version) do not. Capture group 1 is the number, group 2 the unit.
var pureSizeToken = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?)([bm])$`)

// parseSizeFromModelID reads a parameter count, in billions, out of a
// model ID. The lowercased ID is broken into tokens at `:`, `-`, `_`
// and `/`; every token shaped like `<N>b` or `<N>m` is a candidate,
// with an `m` value divided by 1000 to express it in billions. The
// largest candidate is returned, so "qwen3-coder:30b-a3b-q4_K_M"
// yields the 30 total rather than anything smaller.
//
// The boolean is false when no token produced a value greater than
// zero; the float is 0 in that case.
func parseSizeFromModelID(id string) (float64, bool) {
	isSeparator := func(r rune) bool {
		return r == ':' || r == '-' || r == '_' || r == '/'
	}

	var (
		largest float64
		seen    bool
	)
	for _, token := range strings.FieldsFunc(strings.ToLower(id), isSeparator) {
		groups := pureSizeToken.FindStringSubmatch(token)
		if groups == nil {
			continue
		}
		billions, err := strconv.ParseFloat(groups[1], 64)
		if err != nil {
			// Not representable as a float64; ignore this token.
			continue
		}
		if groups[2] == "m" {
			billions /= 1000.0
		}
		if billions > largest {
			largest, seen = billions, true
		}
	}
	return largest, seen
}