feat(router): cloud-arm defaults, gpt-5.3-codex registration

Closes R-4 and R-5 of the routing-defaults plan.

R-4: Strengths + CostWeight defaults for closed frontier models.

Cloud entries land in the same knownFamilyDefaults table as local ones, with MaxComplexity intentionally left zero (cloud arms get no complexity ceiling). CostWeight tuned per the plan's rationale:

- claude-opus-4-7 → Planning/SecurityReview/Debug/Refactor, 0.3
- claude-sonnet-4-6 → Generation/Refactor/Review, 0.7
- gpt-5.5 → Planning/SecurityReview/Generation, 0.3
- gpt-5.3-codex → Generation/Refactor/Debug/UnitTest, 0.6
- gpt-5.2 → Orchestration/Review, 0.8
- gemini-3.1-pro → Planning/Review/Orchestration, 0.5
- gemini-3.5-flash → Boilerplate/Explain/Orchestration, 1.2

The 0.3 weight on frontier arms keeps them competitive on SecurityReview / Planning despite $4+/Mtok; 1.2 on Gemini Flash penalizes cost more so it only wins when cost is genuinely decisive (boilerplate, explain).

Mechanism: extracted applyFamilyDefaults into defaults.go and call it from Router.RegisterArm. Single source of truth — both local discovery and the primary-provider path in cmd/gnoma/main.go now flow through the same defaults application. Removed the duplicate apply block from RegisterDiscoveredModels.

Legacy model IDs (claude-opus-4-20250514, gpt-4o, o3, gemini-2.5-pro, etc.) intentionally do not match any table entry — keeps users on pinned older models safe from imposed 2026 Strengths.

R-5: gpt-5.3-codex registration.

- internal/provider/openai/provider.go: added to fallbackModels and inferOpenAIModelCapabilities (400K context, 32K output).
- internal/provider/ratelimits.go: gpt-5.3-codex and its dated alias gpt-5.3-codex-2026-02-15 added with the same Tier 1 quotas as gpt-5.2.

Gemini 3.x (3.1-pro-preview, 3.5-flash, 3.1-flash-lite) was already registered in both google/provider.go and ratelimits.go — no change needed for that part of R-5.
Test coverage:

- ResolveFamilyDefaults table-driven across all 7 cloud entries including prefix-sharing (gpt-5.5-pro → gpt-5.5 defaults, gemini-3.1-pro-preview → gemini-3.1-pro defaults).
- Legacy IDs return !ok.
- RegisterArm applies cloud defaults end-to-end.
- User-supplied Strengths and CostWeight are not overridden.
- ID.Model() fallback works when ModelName is empty (test code often constructs arms this way).

Refs: docs/superpowers/plans/2026-05-23-routing-defaults-refresh.md
2026-05-23 21:39:48 +02:00
parent 9bb775a4aa
commit 2f8d4c412f
6 changed files with 222 additions and 25 deletions
@@ -132,6 +132,17 @@ func (p *Provider) fallbackModels() []provider.ModelInfo {
 				MaxOutput:     32000,
 			},
 		},
+		{
+			ID: "gpt-5.3-codex", Name: "GPT-5.3 Codex", Provider: p.name,
+			Capabilities: provider.Capabilities{
+				ToolUse:       true,
+				JSONOutput:    true,
+				Vision:        true,
+				ThinkingModes: []provider.EffortLevel{provider.EffortLow, provider.EffortMedium, provider.EffortHigh},
+				ContextWindow: 400000,
+				MaxOutput:     32000,
+			},
+		},
 		{
 			ID: "gpt-5.2", Name: "GPT-5.2 Thinking", Provider: p.name,
 			Capabilities: provider.Capabilities{
@@ -205,6 +216,9 @@ func inferOpenAIModelCapabilities(modelID string) provider.Capabilities {
 	case "gpt-5.5", "gpt-5.5-pro":
 		caps.ContextWindow = 1_000_000
 		caps.MaxOutput = 32000
+	case "gpt-5.3-codex":
+		caps.ContextWindow = 400000
+		caps.MaxOutput = 32000
 	case "gpt-5.2", "gpt-5.2-chat-latest":
 		caps.ContextWindow = 400000
 		caps.MaxOutput = 32000
@@ -140,6 +140,9 @@ func openaiDefaults() ProviderDefaults {
 			"gpt-5.5":            {RPM: 500, TPM: 30_000, RPD: 10_000},
 			"gpt-5.5-pro":        {RPM: 500, TPM: 30_000, RPD: 10_000},
 			"gpt-5.5-2026-04-23": {RPM: 500, TPM: 30_000, RPD: 10_000},
+			// GPT-5.3 Codex (coding-specialist branch).
+			"gpt-5.3-codex":            {RPM: 500, TPM: 200_000, RPD: 10_000},
+			"gpt-5.3-codex-2026-02-15": {RPM: 500, TPM: 200_000, RPD: 10_000},
 			// GPT-5.2 generation.
 			"gpt-5.2":             {RPM: 500, TPM: 200_000, RPD: 10_000},
 			"gpt-5.2-chat-latest": {RPM: 500, TPM: 200_000, RPD: 10_000},
@@ -200,6 +200,44 @@ var knownFamilyDefaults = map[string]FamilyDefaults{
 		MaxComplexity: 0.45,
 	},

+	// --- Closed-source frontier (cloud arms) --------------------------------
+	// Cloud entries set Strengths and CostWeight but leave MaxComplexity
+	// zero — cloud arms shouldn't have a complexity ceiling. CostWeight
+	// rationale per the 2026-05-23 plan:
+	//   - 0.3 on frontier arms (Opus 4.7, GPT-5.5): keep them competitive
+	//     for high-stakes tasks (SecurityReview, Planning) despite $4+/Mtok.
+	//   - 0.5-0.8 on mid-tier arms (Sonnet 4.6, GPT-5.3 Codex, GPT-5.2,
+	//     Gemini 3.1 Pro): standard cost influence.
+	//   - 1.2 on cheap fast arms (Gemini 3.5 Flash): penalize cost more
+	//     so they win only when cost is genuinely decisive.
+	"claude-opus-4-7": {
+		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor},
+		CostWeight: 0.3,
+	},
+	"claude-sonnet-4-6": {
+		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskReview},
+		CostWeight: 0.7,
+	},
+	"gpt-5.5": {
+		Strengths:  []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration},
+		CostWeight: 0.3,
+	},
+	"gpt-5.3-codex": {
+		Strengths:  []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest},
+		CostWeight: 0.6,
+	},
+	"gpt-5.2": {
+		Strengths:  []TaskType{TaskOrchestration, TaskReview},
+		CostWeight: 0.8,
+	},
+	"gemini-3.1-pro": {
+		Strengths:  []TaskType{TaskPlanning, TaskReview, TaskOrchestration},
+		CostWeight: 0.5,
+	},
+	"gemini-3.5-flash": {
+		Strengths:  []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration},
+		CostWeight: 1.2,
+	},
+
 	// --- Tool-router specialist (reserved, not auto-routed) -----------------
 	// functiongemma is Google's 270M function-calling specialist. It is
 	// not a chat model — it emits structured tool calls, not prose. We
@@ -278,6 +316,43 @@ func ResolveMaxComplexity(modelID string) (float64, bool) {
 	return 0, false
 }

+// applyFamilyDefaults fills unset routing fields on arm from the
+// family-defaults table and reports whether a family entry matched.
+//
+// The lookup key is arm.ModelName, falling back to arm.ID.Model() when
+// ModelName is empty (which test code commonly omits). A nil arm or a
+// model with no table entry returns false and leaves the arm untouched.
+//
+// Strengths, MaxComplexity and CostWeight are only filled while they
+// are still at their zero value, so user-supplied values are never
+// overwritten. Disabled is one-way: a family marked Disabled forces
+// arm.Disabled to true; nothing here ever resets it to false.
+func applyFamilyDefaults(arm *Arm) bool {
+	if arm == nil {
+		return false
+	}
+	modelKey := arm.ModelName
+	if modelKey == "" {
+		modelKey = arm.ID.Model()
+	}
+	defaults, ok := ResolveFamilyDefaults(modelKey)
+	if !ok {
+		return false
+	}
+	if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 {
+		// Copy instead of aliasing: defaults.Strengths may share its
+		// backing array with the package-level table, so an in-place
+		// edit on one arm must not be able to reach other arms.
+		arm.Strengths = append([]TaskType(nil), defaults.Strengths...)
+	}
+	if arm.MaxComplexity == 0 {
+		// Named ceiling rather than cap so the builtin is not shadowed.
+		if ceiling, found := ResolveMaxComplexity(modelKey); found {
+			arm.MaxComplexity = ceiling
+		}
+	}
+	if arm.CostWeight == 0 && defaults.CostWeight > 0 {
+		arm.CostWeight = defaults.CostWeight
+	}
+	if defaults.Disabled {
+		arm.Disabled = true
+	}
+	return true
+}
+
 // pureSizeToken matches a token consisting of digits (optionally with a
 // single decimal point) followed by 'b' or 'm' — and nothing else. Used
 // after splitting the model ID on `:`, `-`, `_`, `/` to extract a pure
@@ -186,6 +186,123 @@ func TestKnownFamilyDefaults_NoDualSpec(t *testing.T) {
 	}
 }

+// --- Cloud defaults --------------------------------------------------------
+
+func TestResolveFamilyDefaults_CloudArms(t *testing.T) {
+	cases := []struct {
+		modelID        string
+		wantStrengths  []TaskType
+		wantCostWeight float64
+	}{
+		{"claude-opus-4-7", []TaskType{TaskPlanning, TaskSecurityReview, TaskDebug, TaskRefactor}, 0.3},
+		{"claude-sonnet-4-6", []TaskType{TaskGeneration, TaskRefactor, TaskReview}, 0.7},
+		{"gpt-5.5", []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration}, 0.3},
+		{"gpt-5.5-pro", []TaskType{TaskPlanning, TaskSecurityReview, TaskGeneration}, 0.3}, // shares prefix with gpt-5.5
+		{"gpt-5.3-codex", []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest}, 0.6},
+		{"gpt-5.2", []TaskType{TaskOrchestration, TaskReview}, 0.8},
+		{"gpt-5.2-chat-latest", []TaskType{TaskOrchestration, TaskReview}, 0.8},
+		{"gemini-3.1-pro", []TaskType{TaskPlanning, TaskReview, TaskOrchestration}, 0.5},
+		{"gemini-3.1-pro-preview", []TaskType{TaskPlanning, TaskReview, TaskOrchestration}, 0.5},
+		{"gemini-3.5-flash", []TaskType{TaskBoilerplate, TaskExplain, TaskOrchestration}, 1.2},
+	}
+	for _, tc := range cases {
+		t.Run(tc.modelID, func(t *testing.T) {
+			got, ok := ResolveFamilyDefaults(tc.modelID)
+			if !ok {
+				t.Fatalf("ResolveFamilyDefaults(%q) returned !ok", tc.modelID)
+			}
+			if !reflect.DeepEqual(got.Strengths, tc.wantStrengths) {
+				t.Errorf("%q Strengths = %v, want %v", tc.modelID, got.Strengths, tc.wantStrengths)
+			}
+			if got.CostWeight != tc.wantCostWeight {
+				t.Errorf("%q CostWeight = %v, want %v", tc.modelID, got.CostWeight, tc.wantCostWeight)
+			}
+			if got.MaxComplexity != 0 {
+				t.Errorf("%q MaxComplexity = %v, want 0 (cloud arms have no ceiling)", tc.modelID, got.MaxComplexity)
+			}
+		})
+	}
+}
+
+// TestResolveFamilyDefaults_CloudLegacyUnaffected asserts that legacy or
+// unrelated cloud model IDs do not resolve to any family defaults, so
+// users pinned to older models are not handed imposed Strengths.
+func TestResolveFamilyDefaults_CloudLegacyUnaffected(t *testing.T) {
+	for _, legacyID := range []string{
+		"claude-opus-4-20250514",
+		"claude-sonnet-4-20250514",
+		"claude-haiku-4-5-20251001",
+		"gpt-4o",
+		"gpt-4o-mini",
+		"o3",
+		"o3-mini",
+		"gemini-2.5-pro",
+		"gemini-2.0-flash",
+	} {
+		_, matched := ResolveFamilyDefaults(legacyID)
+		if !matched {
+			continue
+		}
+		t.Errorf("ResolveFamilyDefaults(%q) should not match (legacy model)", legacyID)
+	}
+}
+
+func TestRegisterArm_AppliesCloudDefaults(t *testing.T) {
+	r := New(Config{})
+	r.RegisterArm(&Arm{
+		ID:        NewArmID("openai", "gpt-5.3-codex"),
+		ModelName: "gpt-5.3-codex",
+		Capabilities: provider.Capabilities{
+			ToolUse: true, JSONOutput: true,
+			ContextWindow: 400000,
+		},
+	})
+	arm, ok := r.LookupArm(NewArmID("openai", "gpt-5.3-codex"))
+	if !ok {
+		t.Fatal("gpt-5.3-codex arm should be registered")
+	}
+	wantStrengths := []TaskType{TaskGeneration, TaskRefactor, TaskDebug, TaskUnitTest}
+	if !reflect.DeepEqual(arm.Strengths, wantStrengths) {
+		t.Errorf("Strengths = %v, want %v", arm.Strengths, wantStrengths)
+	}
+	if arm.CostWeight != 0.6 {
+		t.Errorf("CostWeight = %v, want 0.6", arm.CostWeight)
+	}
+	if arm.MaxComplexity != 0 {
+		t.Errorf("MaxComplexity = %v, want 0 (cloud arm)", arm.MaxComplexity)
+	}
+}
+
+// TestRegisterArm_DoesNotOverrideUserStrengths registers a
+// claude-opus-4-7 arm whose Strengths and CostWeight are already set and
+// checks that RegisterArm leaves both values as supplied.
+func TestRegisterArm_DoesNotOverrideUserStrengths(t *testing.T) {
+	r := New(Config{})
+	r.RegisterArm(&Arm{
+		ID:         NewArmID("anthropic", "claude-opus-4-7"),
+		ModelName:  "claude-opus-4-7",
+		Strengths:  []TaskType{TaskUnitTest}, // user-supplied; defaults should not overwrite
+		CostWeight: 0.5,                      // user-supplied
+	})
+	arm, ok := r.LookupArm(NewArmID("anthropic", "claude-opus-4-7"))
+	if !ok {
+		// Stop here: without a registered arm the assertions below
+		// would fail for the wrong reason.
+		t.Fatal("claude-opus-4-7 arm should be registered")
+	}
+	if !reflect.DeepEqual(arm.Strengths, []TaskType{TaskUnitTest}) {
+		t.Errorf("user-supplied Strengths overridden by defaults: got %v", arm.Strengths)
+	}
+	if arm.CostWeight != 0.5 {
+		t.Errorf("user-supplied CostWeight overridden: got %v", arm.CostWeight)
+	}
+}
+
+// TestRegisterArm_FallsBackToIDWhenModelNameMissing registers an arm
+// that has an ID but an empty ModelName and checks that the
+// gpt-5.3-codex CostWeight default is still applied, i.e. that the
+// defaults lookup resolves via ID.Model().
+func TestRegisterArm_FallsBackToIDWhenModelNameMissing(t *testing.T) {
+	// Some test code constructs arms with ID but no ModelName.
+	// applyFamilyDefaults should fall back to ID.Model() so defaults
+	// still flow through.
+	r := New(Config{})
+	r.RegisterArm(&Arm{
+		ID: NewArmID("openai", "gpt-5.3-codex"),
+		// ModelName intentionally empty
+	})
+	arm, ok := r.LookupArm(NewArmID("openai", "gpt-5.3-codex"))
+	if !ok {
+		// Stop here: a missing arm would otherwise surface as a
+		// misleading CostWeight failure (or a nil dereference).
+		t.Fatal("gpt-5.3-codex arm should be registered")
+	}
+	if arm.CostWeight != 0.6 {
+		t.Errorf("CostWeight = %v, want 0.6 (defaults should resolve via ID.Model() fallback)", arm.CostWeight)
+	}
+}
+
 // --- Integration: routing-payoff scenario --------------------------------

 // TestRoutingDefaults_PayoffScenario is the user-facing demonstration that
@@ -497,7 +497,12 @@ func RegisterDiscoveredModels(r *Router, models []DiscoveredModel, providerFacto
 			continue
 		}

-		arm := &Arm{
+		// Family-keyed defaults (Strengths, MaxComplexity, CostWeight,
+		// Disabled) are applied inside Router.RegisterArm — single source
+		// of truth so cloud-arm and local-arm registration paths agree.
+		// User-supplied [[arms]] config in TOML overrides defaults later
+		// via ApplyArmOverrides.
+		r.RegisterArm(&Arm{
 			ID:        armID,
 			Provider:  prov,
 			ModelName: m.ID,
@@ -513,28 +518,6 @@ func RegisterDiscoveredModels(r *Router, models []DiscoveredModel, providerFacto
 				Vision:        m.SupportsVision,
 				ContextWindow: m.ContextSize,
 			},
-		}
-
-		// Apply family-keyed defaults (Strengths, MaxComplexity, CostWeight,
-		// Disabled) for known model families. User-supplied [[arms]] config
-		// in TOML overrides these later via ApplyArmOverrides.
-		if defaults, ok := ResolveFamilyDefaults(m.ID); ok {
-			if len(arm.Strengths) == 0 && len(defaults.Strengths) > 0 {
-				arm.Strengths = defaults.Strengths
-			}
-			if arm.MaxComplexity == 0 {
-				if cap, ok := ResolveMaxComplexity(m.ID); ok {
-					arm.MaxComplexity = cap
-				}
-			}
-			if arm.CostWeight == 0 && defaults.CostWeight > 0 {
-				arm.CostWeight = defaults.CostWeight
-			}
-			if defaults.Disabled {
-				arm.Disabled = true
-			}
-		}
-
-		r.RegisterArm(arm)
+		})
 	}
 }
@@ -42,8 +42,13 @@ func New(cfg Config) *Router {
 	}
 }

-// RegisterArm adds an arm to the router.
+// RegisterArm adds an arm to the router. Family-keyed defaults
+// (Strengths, MaxComplexity, CostWeight, Disabled) are applied to any
+// fields still at their zero value — user-supplied values are never
+// overwritten. See defaults.go for the family table.
 func (r *Router) RegisterArm(arm *Arm) {
+	applyFamilyDefaults(arm)
+
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	r.arms[arm.ID] = arm