feat: QualityTracker — EMA router feedback from elf outcomes, ResultFilePaths tracking
This commit is contained in:
@@ -49,6 +49,7 @@ type Result struct {
|
|||||||
Output string // final text output
|
Output string // final text output
|
||||||
Error error
|
Error error
|
||||||
Duration time.Duration
|
Duration time.Duration
|
||||||
|
ResultFilePaths []string // paths to /tmp results produced by this elf's tools
|
||||||
}
|
}
|
||||||
|
|
||||||
// Elf is a sub-agent with its own engine and conversation history.
|
// Elf is a sub-agent with its own engine and conversation history.
|
||||||
|
|||||||
@@ -140,6 +140,7 @@ func (m *Manager) ReportResult(result Result) {
|
|||||||
Success: result.Status == StatusCompleted,
|
Success: result.Status == StatusCompleted,
|
||||||
Tokens: int(result.Usage.TotalTokens()),
|
Tokens: int(result.Usage.TotalTokens()),
|
||||||
Duration: result.Duration,
|
Duration: result.Duration,
|
||||||
|
ResultFilePaths: result.ResultFilePaths,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
67
internal/router/feedback.go
Normal file
67
internal/router/feedback.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package router
|
||||||
|
|
||||||
|
import "sync"
|
||||||
|
|
||||||
|
const (
	// qualityAlpha is the EMA smoothing factor; higher values weight recent
	// observations more heavily (~3-sample effective memory at 0.3).
	qualityAlpha = 0.3

	// minObservations is the minimum number of recorded samples before the
	// observed EMA score is trusted over the static heuristic.
	minObservations = 3
)
|
||||||
|
|
||||||
|
// EMAScore tracks an exponential moving average quality score.
type EMAScore struct {
	Value float64 // current EMA; seeded with the first raw observation
	Count int     // total observations recorded so far
}
|
||||||
|
|
||||||
|
// QualityTracker records per-arm, per-task-type EMA quality scores from elf
// outcomes. All methods are safe for concurrent use.
type QualityTracker struct {
	mu     sync.RWMutex
	scores map[ArmID]map[TaskType]*EMAScore // guarded by mu
}
|
||||||
|
|
||||||
|
// NewQualityTracker returns an empty QualityTracker.
|
||||||
|
func NewQualityTracker() *QualityTracker {
|
||||||
|
return &QualityTracker{
|
||||||
|
scores: make(map[ArmID]map[TaskType]*EMAScore),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record updates the EMA score for the given arm and task type.
|
||||||
|
func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) {
|
||||||
|
observation := 0.0
|
||||||
|
if success {
|
||||||
|
observation = 1.0
|
||||||
|
}
|
||||||
|
qt.mu.Lock()
|
||||||
|
defer qt.mu.Unlock()
|
||||||
|
if qt.scores[armID] == nil {
|
||||||
|
qt.scores[armID] = make(map[TaskType]*EMAScore)
|
||||||
|
}
|
||||||
|
s := qt.scores[armID][taskType]
|
||||||
|
if s == nil {
|
||||||
|
s = &EMAScore{}
|
||||||
|
qt.scores[armID][taskType] = s
|
||||||
|
}
|
||||||
|
if s.Count == 0 {
|
||||||
|
s.Value = observation
|
||||||
|
} else {
|
||||||
|
s.Value = qualityAlpha*observation + (1-qualityAlpha)*s.Value
|
||||||
|
}
|
||||||
|
s.Count++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quality returns the observed EMA score for an arm+task combination.
|
||||||
|
// Returns (0, false) when fewer than minObservations have been recorded.
|
||||||
|
func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) {
|
||||||
|
qt.mu.RLock()
|
||||||
|
defer qt.mu.RUnlock()
|
||||||
|
m, ok := qt.scores[armID]
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
s, ok := m[taskType]
|
||||||
|
if !ok || s.Count < minObservations {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return s.Value, true
|
||||||
|
}
|
||||||
58
internal/router/feedback_test.go
Normal file
58
internal/router/feedback_test.go
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
package router_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"somegit.dev/Owlibou/gnoma/internal/router"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
_, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||||
|
if hasData {
|
||||||
|
t.Error("expected no data for unobserved arm")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_RecordUpdatesEMA(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
qt.Record("arm:model", router.TaskGeneration, true)
|
||||||
|
}
|
||||||
|
score, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||||
|
if !hasData {
|
||||||
|
t.Fatal("expected data after 3 observations")
|
||||||
|
}
|
||||||
|
if score <= 0 || score > 1 {
|
||||||
|
t.Errorf("score out of range [0,1]: %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_AllFailuresLowScore(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
qt.Record("arm:model", router.TaskDebug, false)
|
||||||
|
}
|
||||||
|
score, _ := qt.Quality("arm:model", router.TaskDebug)
|
||||||
|
if score > 0.3 {
|
||||||
|
t.Errorf("expected low score after all failures, got %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_ConcurrentSafe(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
done := make(chan struct{})
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
go func(success bool) {
|
||||||
|
qt.Record("arm:model", router.TaskReview, success)
|
||||||
|
done <- struct{}{}
|
||||||
|
}(i%2 == 0)
|
||||||
|
}
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
<-done
|
||||||
|
}
|
||||||
|
score, _ := qt.Quality("arm:model", router.TaskReview)
|
||||||
|
if score < 0 || score > 1 {
|
||||||
|
t.Errorf("invalid score after concurrent writes: %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -22,6 +22,8 @@ type Router struct {
|
|||||||
forcedArm ArmID
|
forcedArm ArmID
|
||||||
// When true, only local arms are considered (incognito mode)
|
// When true, only local arms are considered (incognito mode)
|
||||||
localOnly bool
|
localOnly bool
|
||||||
|
|
||||||
|
quality *QualityTracker
|
||||||
}
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
@@ -36,6 +38,7 @@ func New(cfg Config) *Router {
|
|||||||
return &Router{
|
return &Router{
|
||||||
arms: make(map[ArmID]*Arm),
|
arms: make(map[ArmID]*Arm),
|
||||||
logger: logger,
|
logger: logger,
|
||||||
|
quality: NewQualityTracker(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Select best
|
// Select best
|
||||||
best := selectBest(feasible, task)
|
best := selectBest(r.quality, feasible, task)
|
||||||
if best == nil {
|
if best == nil {
|
||||||
return RoutingDecision{Error: fmt.Errorf("selection failed")}
|
return RoutingDecision{Error: fmt.Errorf("selection failed")}
|
||||||
}
|
}
|
||||||
@@ -145,20 +148,30 @@ type Outcome struct {
|
|||||||
Success bool
|
Success bool
|
||||||
Tokens int
|
Tokens int
|
||||||
Duration time.Duration
|
Duration time.Duration
|
||||||
|
ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReportOutcome records a task execution result for quality tracking.
// The outcome feeds the per-arm EMA consulted by arm selection; the debug
// log line mirrors the recorded fields for observability.
func (r *Router) ReportOutcome(o Outcome) {
	r.quality.Record(o.ArmID, o.TaskType, o.Success)
	r.logger.Debug("outcome recorded",
		"arm", o.ArmID,
		"task", o.TaskType,
		"success", o.Success,
		"tokens", o.Tokens,
		"duration", o.Duration,
		// Log the count only; the paths themselves may be numerous.
		"result_files", len(o.ResultFilePaths),
	)
}
|
||||||
|
|
||||||
|
// LookupArm returns the arm with the given ID, if registered.
|
||||||
|
func (r *Router) LookupArm(id ArmID) (*Arm, bool) {
|
||||||
|
r.mu.RLock()
|
||||||
|
defer r.mu.RUnlock()
|
||||||
|
arm, ok := r.arms[id]
|
||||||
|
return arm, ok
|
||||||
|
}
|
||||||
|
|
||||||
// Arms returns all registered arms.
|
// Arms returns all registered arms.
|
||||||
func (r *Router) Arms() []*Arm {
|
func (r *Router) Arms() []*Arm {
|
||||||
r.mu.RLock()
|
r.mu.RLock()
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
|
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
|
||||||
best := selectBest([]*Arm{withoutTools, withTools}, task)
|
best := selectBest(nil, []*Arm{withoutTools, withTools}, task)
|
||||||
|
|
||||||
if best.ID != "a/with-tools" {
|
if best.ID != "a/with-tools" {
|
||||||
t.Errorf("should prefer arm with tool support, got %s", best.ID)
|
t.Errorf("should prefer arm with tool support, got %s", best.ID)
|
||||||
@@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
|
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
|
||||||
best := selectBest([]*Arm{noThinking, thinking}, task)
|
best := selectBest(nil, []*Arm{noThinking, thinking}, task)
|
||||||
|
|
||||||
if best.ID != "a/thinking" {
|
if best.ID != "a/thinking" {
|
||||||
t.Errorf("should prefer thinking model for planning, got %s", best.ID)
|
t.Errorf("should prefer thinking model for planning, got %s", best.ID)
|
||||||
|
|||||||
@@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// selectBest picks the highest-scoring feasible arm using heuristic scoring.
|
// selectBest picks the highest-scoring feasible arm, blending heuristic and
|
||||||
// No bandit learning — that's M9. Just smart defaults based on model size,
|
// observed EMA quality when enough data is available.
|
||||||
// locality, task type, cost, and pool scarcity.
|
func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm {
|
||||||
func selectBest(arms []*Arm, task Task) *Arm {
|
|
||||||
if len(arms) == 0 {
|
if len(arms) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -48,7 +47,7 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
|||||||
bestScore := math.Inf(-1)
|
bestScore := math.Inf(-1)
|
||||||
|
|
||||||
for _, arm := range arms {
|
for _, arm := range arms {
|
||||||
score := scoreArm(arm, task)
|
score := scoreArm(qt, arm, task)
|
||||||
if score > bestScore {
|
if score > bestScore {
|
||||||
bestScore = score
|
bestScore = score
|
||||||
best = arm
|
best = arm
|
||||||
@@ -58,17 +57,23 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
|||||||
return best
|
return best
|
||||||
}
|
}
|
||||||
|
|
||||||
// scoreArm computes a heuristic quality/cost score for an arm.
|
// scoreArm computes a quality/cost score for an arm.
|
||||||
|
// When the quality tracker has sufficient observations, blends observed EMA
|
||||||
|
// (70%) with heuristic (30%). Falls back to pure heuristic otherwise.
|
||||||
// Score = (quality × value) / effective_cost
|
// Score = (quality × value) / effective_cost
|
||||||
func scoreArm(arm *Arm, task Task) float64 {
|
func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 {
|
||||||
quality := heuristicQuality(arm, task)
|
hq := heuristicQuality(arm, task)
|
||||||
|
quality := hq
|
||||||
|
if qt != nil {
|
||||||
|
if observed, hasData := qt.Quality(arm.ID, task.Type); hasData {
|
||||||
|
quality = 0.7*observed + 0.3*hq
|
||||||
|
}
|
||||||
|
}
|
||||||
value := task.ValueScore()
|
value := task.ValueScore()
|
||||||
cost := effectiveCost(arm, task)
|
cost := effectiveCost(arm, task)
|
||||||
|
|
||||||
if cost <= 0 {
|
if cost <= 0 {
|
||||||
cost = 0.001 // prevent division by zero for free local models
|
cost = 0.001
|
||||||
}
|
}
|
||||||
|
|
||||||
return (quality * value) / cost
|
return (quality * value) / cost
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
|||||||
if t.store != nil {
|
if t.store != nil {
|
||||||
preSave, _ = t.store.List("")
|
preSave, _ = t.store.List("")
|
||||||
}
|
}
|
||||||
_ = preSave // used in Task 4 for ResultFilePaths diff
|
|
||||||
|
|
||||||
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
|
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
|||||||
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
|
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report outcome to router for quality feedback
|
// Attribute /tmp result files produced during this elf's run
|
||||||
|
if t.store != nil {
|
||||||
|
postSave, _ := t.store.List("")
|
||||||
|
preSet := make(map[string]bool, len(preSave))
|
||||||
|
for _, f := range preSave {
|
||||||
|
preSet[f.Path] = true
|
||||||
|
}
|
||||||
|
for _, f := range postSave {
|
||||||
|
if !preSet[f.Path] {
|
||||||
|
result.ResultFilePaths = append(result.ResultFilePaths, f.Path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
t.manager.ReportResult(result)
|
t.manager.ReportResult(result)
|
||||||
|
|
||||||
// Send done signal — stays in tree until turn completes
|
// Send done signal — stays in tree until turn completes
|
||||||
|
|||||||
@@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
|||||||
if t.store != nil {
|
if t.store != nil {
|
||||||
preSave, _ = t.store.List("")
|
preSave, _ = t.store.List("")
|
||||||
}
|
}
|
||||||
_ = preSave // used in Task 4
|
|
||||||
|
|
||||||
// Spawn all elfs with slight stagger to avoid rate limit bursts
|
// Spawn all elfs with slight stagger to avoid rate limit bursts
|
||||||
type elfEntry struct {
|
type elfEntry struct {
|
||||||
@@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report outcome to router
|
// For batch elfs, attribute all new /tmp files produced during the batch
|
||||||
|
if t.store != nil {
|
||||||
|
postSave, _ := t.store.List("")
|
||||||
|
preSet := make(map[string]bool, len(preSave))
|
||||||
|
for _, f := range preSave {
|
||||||
|
preSet[f.Path] = true
|
||||||
|
}
|
||||||
|
var newPaths []string
|
||||||
|
for _, f := range postSave {
|
||||||
|
if !preSet[f.Path] {
|
||||||
|
newPaths = append(newPaths, f.Path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
results[idx].ResultFilePaths = newPaths
|
||||||
|
}
|
||||||
t.manager.ReportResult(results[idx])
|
t.manager.ReportResult(results[idx])
|
||||||
|
|
||||||
// Send done progress
|
// Send done progress
|
||||||
|
|||||||
Reference in New Issue
Block a user