feat(router): cloud-arm defaults, gpt-5.3-codex registration

Closes R-4 and R-5 of the routing-defaults plan.

R-4: Strengths + CostWeight defaults for closed frontier models.
Cloud entries land in the same knownFamilyDefaults table as local
ones, with MaxComplexity intentionally left zero (cloud arms get
no complexity ceiling). CostWeight tuned per the plan's rationale:

  claude-opus-4-7    → Planning/SecurityReview/Debug/Refactor, 0.3
  claude-sonnet-4-6  → Generation/Refactor/Review,             0.7
  gpt-5.5            → Planning/SecurityReview/Generation,     0.3
  gpt-5.3-codex      → Generation/Refactor/Debug/UnitTest,     0.6
  gpt-5.2            → Orchestration/Review,                   0.8
  gemini-3.1-pro     → Planning/Review/Orchestration,          0.5
  gemini-3.5-flash   → Boilerplate/Explain/Orchestration,      1.2

The 0.3 weight on frontier arms keeps them competitive on
SecurityReview / Planning despite $4+/Mtok; 1.2 on Gemini Flash
penalizes cost more so it only wins when cost is genuinely
decisive (boilerplate, explain).

Mechanism: extracted applyFamilyDefaults into defaults.go and call
it from Router.RegisterArm. Single source of truth — both local
discovery and the primary-provider path in cmd/gnoma/main.go now
flow through the same defaults application. Removed the duplicate
apply block from RegisterDiscoveredModels.

Legacy model IDs (claude-opus-4-20250514, gpt-4o, o3, gemini-2.5-pro,
etc.) intentionally do not match any table entry — keeps users on
pinned older models safe from imposed 2026 Strengths.

R-5: gpt-5.3-codex registration.

  - internal/provider/openai/provider.go: added to fallbackModels
    and inferOpenAIModelCapabilities (400K context, 32K output).
  - internal/provider/ratelimits.go: gpt-5.3-codex and its dated
    alias gpt-5.3-codex-2026-02-15 added with the same Tier 1
    quotas as gpt-5.2.

Gemini 3.x (3.1-pro-preview, 3.5-flash, 3.1-flash-lite) was already
registered in both google/provider.go and ratelimits.go — no change
needed for that part of R-5.

Test coverage:
- ResolveFamilyDefaults table-driven across all 7 cloud entries
  including prefix-sharing (gpt-5.5-pro → gpt-5.5 defaults,
  gemini-3.1-pro-preview → gemini-3.1-pro defaults).
- Legacy IDs return !ok.
- RegisterArm applies cloud defaults end-to-end.
- User-supplied Strengths and CostWeight are not overridden.
- ID.Model() fallback works when ModelName is empty (test code
  often constructs arms this way).

Refs: docs/superpowers/plans/2026-05-23-routing-defaults-refresh.md
This commit is contained in:
2026-05-23 21:39:48 +02:00
parent 9bb775a4aa
commit 2f8d4c412f
6 changed files with 222 additions and 25 deletions
+14
View File
@@ -132,6 +132,17 @@ func (p *Provider) fallbackModels() []provider.ModelInfo {
MaxOutput: 32000,
},
},
{
ID: "gpt-5.3-codex", Name: "GPT-5.3 Codex", Provider: p.name,
Capabilities: provider.Capabilities{
ToolUse: true,
JSONOutput: true,
Vision: true,
ThinkingModes: []provider.EffortLevel{provider.EffortLow, provider.EffortMedium, provider.EffortHigh},
ContextWindow: 400000,
MaxOutput: 32000,
},
},
{
ID: "gpt-5.2", Name: "GPT-5.2 Thinking", Provider: p.name,
Capabilities: provider.Capabilities{
@@ -205,6 +216,9 @@ func inferOpenAIModelCapabilities(modelID string) provider.Capabilities {
case "gpt-5.5", "gpt-5.5-pro":
caps.ContextWindow = 1_000_000
caps.MaxOutput = 32000
case "gpt-5.3-codex":
caps.ContextWindow = 400000
caps.MaxOutput = 32000
case "gpt-5.2", "gpt-5.2-chat-latest":
caps.ContextWindow = 400000
caps.MaxOutput = 32000
+3
View File
@@ -140,6 +140,9 @@ func openaiDefaults() ProviderDefaults {
"gpt-5.5": {RPM: 500, TPM: 30_000, RPD: 10_000},
"gpt-5.5-pro": {RPM: 500, TPM: 30_000, RPD: 10_000},
"gpt-5.5-2026-04-23": {RPM: 500, TPM: 30_000, RPD: 10_000},
// GPT-5.3 Codex (coding-specialist branch).
"gpt-5.3-codex": {RPM: 500, TPM: 200_000, RPD: 10_000},
"gpt-5.3-codex-2026-02-15": {RPM: 500, TPM: 200_000, RPD: 10_000},
// GPT-5.2 generation.
"gpt-5.2": {RPM: 500, TPM: 200_000, RPD: 10_000},
"gpt-5.2-chat-latest": {RPM: 500, TPM: 200_000, RPD: 10_000},
+75
View File
@@ -200,6 +200,44 @@ var knownFamilyDefaults = map[string]FamilyDefaults{
MaxComplexity: 0.45,
},
// --- Closed-source frontier (cloud arms) --------------------------------
// Cloud entries set Strengths and CostWeight but leave MaxComplexity
// zero — cloud arms shouldn't have a complexity ceiling. CostWeight
// rationale per the 2026-05-23 plan:
// - 0.3 on frontier arms (Opus 4.7, GPT-5.5): keep them competitive
// for high-stakes tasks (SecurityReview, Planning) despite $4+/Mtok.
// - 0.5-0.7 on mid-tier coding specialists: standard cost influence.
// - 1.2 on cheap fast arms (Gemini 3.5 Flash): penalize cost more
// so they win only when cost is genuinely decisive.
"claude-opus-4-7": {
Strengths: []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor},
CostWeight: 0.3,
},
"claude-sonnet-4-6": {
Strengths: []TaskType{TaskGeneration, TaskRefactor, TaskReview},
CostWeight: 0.7,
},
"gpt-5.5": {
Strengths: []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration},
CostWeight: 0.3,
},
"gpt-5.3-codex": {
Strengths: []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest},
CostWeight: 0.6,
},
"gpt-5.2": {
Strengths: []TaskType{TaskOrchestration, TaskReview},
CostWeight: 0.8,
},
"gemini-3.1-pro": {
Strengths: []TaskType{TaskPlanning, TaskReview, TaskOrchestration},
CostWeight: 0.5,
},
"gemini-3.5-flash": {
Strengths: []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
CostWeight: 1.2,
},
// --- Tool-router specialist (reserved, not auto-routed) -----------------
// functiongemma is Google's 270M function-calling specialist. It is
// not a chat model — it emits structured tool calls, not prose. We
@@ -278,6 +316,43 @@ func ResolveMaxComplexity(modelID string) (float64, bool) {
return 0, false
}
// applyFamilyDefaults fills routing fields on an Arm from the
// family-defaults table and reports whether a family entry matched.
//
// Fill rules:
//   - Strengths, MaxComplexity and CostWeight are written only while they
//     are still at their zero value, so user-supplied values survive.
//   - Disabled is one-way: a family entry marked Disabled forces
//     arm.Disabled to true. It is never reset to false here, and because
//     false is the zero value a caller cannot pre-set the field to keep a
//     family-disabled arm enabled through this function.
//
// The lookup key is arm.ModelName, falling back to arm.ID.Model() when
// ModelName is empty (which test code commonly omits). A nil arm is a
// no-op that returns false, as does a model with no matching family.
func applyFamilyDefaults(arm *Arm) bool {
	if arm == nil {
		return false
	}

	modelKey := arm.ModelName
	if modelKey == "" {
		modelKey = arm.ID.Model()
	}

	defaults, ok := ResolveFamilyDefaults(modelKey)
	if !ok {
		return false
	}

	if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 {
		// Copy instead of aliasing: defaults.Strengths presumably shares its
		// backing array with the package-level defaults table, so handing
		// the same slice to every arm would let an in-place edit on one arm
		// leak into the table and into all other arms of that family.
		arm.Strengths = append([]TaskType(nil), defaults.Strengths...)
	}

	if arm.MaxComplexity == 0 {
		// The ceiling is resolved through ResolveMaxComplexity rather than
		// read off defaults; when it reports no ceiling the field stays 0.
		if ceiling, found := ResolveMaxComplexity(modelKey); found {
			arm.MaxComplexity = ceiling
		}
	}

	if arm.CostWeight == 0 && defaults.CostWeight > 0 {
		arm.CostWeight = defaults.CostWeight
	}

	if defaults.Disabled {
		arm.Disabled = true
	}
	return true
}
// pureSizeToken matches a token consisting of digits (optionally with a
// single decimal point) followed by 'b' or 'm' — and nothing else. Used
// after splitting the model ID on `:`, `-`, `_`, `/` to extract a pure
+117
View File
@@ -186,6 +186,123 @@ func TestKnownFamilyDefaults_NoDualSpec(t *testing.T) {
}
}
// --- Cloud defaults --------------------------------------------------------
// TestResolveFamilyDefaults_CloudArms checks, for every cloud model ID in
// the table below, that ResolveFamilyDefaults finds an entry with the
// expected Strengths and CostWeight and a zero MaxComplexity. IDs that
// extend a table key (gpt-5.5-pro, gpt-5.2-chat-latest,
// gemini-3.1-pro-preview) are expected to resolve to the same values as
// the shorter ID.
func TestResolveFamilyDefaults_CloudArms(t *testing.T) {
	type expectation struct {
		model      string
		strengths  []TaskType
		costWeight float64
	}

	// Strength sets that more than one row expects.
	gpt55 := []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration}
	gpt52 := []TaskType{TaskOrchestration, TaskReview}
	geminiPro := []TaskType{TaskPlanning, TaskReview, TaskOrchestration}

	table := []expectation{
		{model: "claude-opus-4-7", strengths: []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor}, costWeight: 0.3},
		{model: "claude-sonnet-4-6", strengths: []TaskType{TaskGeneration, TaskRefactor, TaskReview}, costWeight: 0.7},
		{model: "gpt-5.5", strengths: gpt55, costWeight: 0.3},
		{model: "gpt-5.5-pro", strengths: gpt55, costWeight: 0.3}, // shares prefix with gpt-5.5
		{model: "gpt-5.3-codex", strengths: []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest}, costWeight: 0.6},
		{model: "gpt-5.2", strengths: gpt52, costWeight: 0.8},
		{model: "gpt-5.2-chat-latest", strengths: gpt52, costWeight: 0.8},
		{model: "gemini-3.1-pro", strengths: geminiPro, costWeight: 0.5},
		{model: "gemini-3.1-pro-preview", strengths: geminiPro, costWeight: 0.5},
		{model: "gemini-3.5-flash", strengths: []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration}, costWeight: 1.2},
	}

	for _, want := range table {
		t.Run(want.model, func(t *testing.T) {
			resolved, found := ResolveFamilyDefaults(want.model)
			if !found {
				t.Fatalf("ResolveFamilyDefaults(%q) returned !ok", want.model)
			}
			if sameStrengths := reflect.DeepEqual(resolved.Strengths, want.strengths); !sameStrengths {
				t.Errorf("%q Strengths = %v, want %v", want.model, resolved.Strengths, want.strengths)
			}
			if resolved.CostWeight != want.costWeight {
				t.Errorf("%q CostWeight = %v, want %v", want.model, resolved.CostWeight, want.costWeight)
			}
			if ceiling := resolved.MaxComplexity; ceiling != 0 {
				t.Errorf("%q MaxComplexity = %v, want 0 (cloud arms have no ceiling)", want.model, ceiling)
			}
		})
	}
}
// TestResolveFamilyDefaults_CloudLegacyUnaffected asserts that older or
// unrelated cloud model IDs do not resolve to any family-defaults entry,
// so users pinned to those models do not have Strengths imposed on them.
func TestResolveFamilyDefaults_CloudLegacyUnaffected(t *testing.T) {
	legacyIDs := [...]string{
		// Anthropic
		"claude-opus-4-20250514",
		"claude-sonnet-4-20250514",
		"claude-haiku-4-5-20251001",
		// OpenAI
		"gpt-4o",
		"gpt-4o-mini",
		"o3",
		"o3-mini",
		// Google
		"gemini-2.5-pro",
		"gemini-2.0-flash",
	}

	for i := range legacyIDs {
		modelID := legacyIDs[i]
		_, matched := ResolveFamilyDefaults(modelID)
		if !matched {
			continue
		}
		t.Errorf("ResolveFamilyDefaults(%q) should not match (legacy model)", modelID)
	}
}
func TestRegisterArm_AppliesCloudDefaults(t *testing.T) {
r := New(Config{})
r.RegisterArm(&Arm{
ID: NewArmID("openai", "gpt-5.3-codex"),
ModelName: "gpt-5.3-codex",
Capabilities: provider.Capabilities{
ToolUse: true, JSONOutput: true,
ContextWindow: 400000,
},
})
arm, ok := r.LookupArm(NewArmID("openai", "gpt-5.3-codex"))
if !ok {
t.Fatal("gpt-5.3-codex arm should be registered")
}
wantStrengths := []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest}
if !reflect.DeepEqual(arm.Strengths, wantStrengths) {
t.Errorf("Strengths = %v, want %v", arm.Strengths, wantStrengths)
}
if arm.CostWeight != 0.6 {
t.Errorf("CostWeight = %v, want 0.6", arm.CostWeight)
}
if arm.MaxComplexity != 0 {
t.Errorf("MaxComplexity = %v, want 0 (cloud arm)", arm.MaxComplexity)
}
}
// TestRegisterArm_DoesNotOverrideUserStrengths registers a
// claude-opus-4-7 arm with caller-provided Strengths and CostWeight and
// verifies that both values are unchanged after registration.
func TestRegisterArm_DoesNotOverrideUserStrengths(t *testing.T) {
	r := New(Config{})
	r.RegisterArm(&Arm{
		ID:         NewArmID("anthropic", "claude-opus-4-7"),
		ModelName:  "claude-opus-4-7",
		Strengths:  []TaskType{TaskUnitTest}, // user-supplied; defaults should not overwrite
		CostWeight: 0.5,                      // user-supplied
	})

	// Check the lookup result before touching the arm: a failed lookup
	// should fail the test with a clear message, not a nil dereference.
	arm, ok := r.LookupArm(NewArmID("anthropic", "claude-opus-4-7"))
	if !ok {
		t.Fatal("claude-opus-4-7 arm should be registered")
	}

	if !reflect.DeepEqual(arm.Strengths, []TaskType{TaskUnitTest}) {
		t.Errorf("user-supplied Strengths overridden by defaults: got %v", arm.Strengths)
	}
	if arm.CostWeight != 0.5 {
		t.Errorf("user-supplied CostWeight overridden: got %v", arm.CostWeight)
	}
}
// TestRegisterArm_FallsBackToIDWhenModelNameMissing registers an arm that
// has an ID but an empty ModelName and verifies that the gpt-5.3-codex
// CostWeight default is still applied, i.e. that the defaults lookup
// falls back to ID.Model().
func TestRegisterArm_FallsBackToIDWhenModelNameMissing(t *testing.T) {
	// Some test code constructs arms with ID but no ModelName.
	// applyFamilyDefaults should fall back to ID.Model() so defaults
	// still flow through.
	r := New(Config{})
	r.RegisterArm(&Arm{
		ID: NewArmID("openai", "gpt-5.3-codex"),
		// ModelName intentionally empty
	})

	// Check the lookup result before touching the arm: a failed lookup
	// should fail the test with a clear message, not a nil dereference.
	arm, ok := r.LookupArm(NewArmID("openai", "gpt-5.3-codex"))
	if !ok {
		t.Fatal("gpt-5.3-codex arm should be registered")
	}

	if arm.CostWeight != 0.6 {
		t.Errorf("CostWeight = %v, want 0.6 (defaults should resolve via ID.Model() fallback)", arm.CostWeight)
	}
}
// --- Integration: routing-payoff scenario --------------------------------
// TestRoutingDefaults_PayoffScenario is the user-facing demonstration that
+7 -24
View File
@@ -497,7 +497,12 @@ func RegisterDiscoveredModels(r *Router, models []DiscoveredModel, providerFacto
continue
}
arm := &Arm{
// Family-keyed defaults (Strengths, MaxComplexity, CostWeight,
// Disabled) are applied inside Router.RegisterArm — single source
// of truth so cloud-arm and local-arm registration paths agree.
// User-supplied [[arms]] config in TOML overrides defaults later
// via ApplyArmOverrides.
r.RegisterArm(&Arm{
ID: armID,
Provider: prov,
ModelName: m.ID,
@@ -513,28 +518,6 @@ func RegisterDiscoveredModels(r *Router, models []DiscoveredModel, providerFacto
Vision: m.SupportsVision,
ContextWindow: m.ContextSize,
},
}
// Apply family-keyed defaults (Strengths, MaxComplexity, CostWeight,
// Disabled) for known model families. User-supplied [[arms]] config
// in TOML overrides these later via ApplyArmOverrides.
if defaults, ok := ResolveFamilyDefaults(m.ID); ok {
if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 {
arm.Strengths = defaults.Strengths
}
if arm.MaxComplexity == 0 {
if cap, ok := ResolveMaxComplexity(m.ID); ok {
arm.MaxComplexity = cap
}
}
if arm.CostWeight == 0 && defaults.CostWeight > 0 {
arm.CostWeight = defaults.CostWeight
}
if defaults.Disabled {
arm.Disabled = true
}
}
r.RegisterArm(arm)
})
}
}
+6 -1
View File
@@ -42,8 +42,13 @@ func New(cfg Config) *Router {
}
}
// RegisterArm adds an arm to the router.
// RegisterArm adds an arm to the router. Family-keyed defaults
// (Strengths, MaxComplexity, CostWeight, Disabled) are applied to any
// fields still at their zero value — user-supplied values are never
// overwritten. See defaults.go for the family table.
func (r *Router) RegisterArm(arm *Arm) {
applyFamilyDefaults(arm)
r.mu.Lock()
defer r.mu.Unlock()
r.arms[arm.ID] = arm