feat: QualityTracker — EMA router feedback from elf outcomes, ResultFilePaths tracking

This commit is contained in:
2026-04-05 22:08:08 +02:00
parent 8a846bd024
commit 6cf5e92957
9 changed files with 208 additions and 39 deletions

View File

@@ -42,13 +42,14 @@ func (s Status) String() string {
// Result is the output of a completed elf.
type Result struct {
ID string
Status Status
Messages []message.Message
Usage message.Usage
Output string // final text output
Error error
Duration time.Duration
ID string
Status Status
Messages []message.Message
Usage message.Usage
Output string // final text output
Error error
Duration time.Duration
ResultFilePaths []string // paths to /tmp results produced by this elf's tools
}
// Elf is a sub-agent with its own engine and conversation history.

View File

@@ -135,11 +135,12 @@ func (m *Manager) ReportResult(result Result) {
meta.decision.Commit(int(result.Usage.TotalTokens()))
m.router.ReportOutcome(router.Outcome{
ArmID: meta.armID,
TaskType: meta.taskType,
Success: result.Status == StatusCompleted,
Tokens: int(result.Usage.TotalTokens()),
Duration: result.Duration,
ArmID: meta.armID,
TaskType: meta.taskType,
Success: result.Status == StatusCompleted,
Tokens: int(result.Usage.TotalTokens()),
Duration: result.Duration,
ResultFilePaths: result.ResultFilePaths,
})
}

View File

@@ -0,0 +1,67 @@
package router
import "sync"
// Tuning knobs for the quality EMA.
const (
	// qualityAlpha is the EMA smoothing factor; at 0.3, roughly the last
	// ~3 observations dominate the score.
	qualityAlpha = 0.3
	// minObservations is the minimum number of recorded outcomes before
	// Quality reports data and the observed score may override the heuristic.
	minObservations = 3
)
// EMAScore tracks an exponential moving average quality score.
type EMAScore struct {
	// Value is the current smoothed score in [0, 1]; seeded by the first
	// observation, then updated as alpha*obs + (1-alpha)*Value.
	Value float64
	// Count is the number of observations folded into Value so far.
	Count int
}
// QualityTracker records per-arm, per-task-type EMA quality scores from elf outcomes.
// It is safe for concurrent use.
type QualityTracker struct {
	mu     sync.RWMutex
	scores map[ArmID]map[TaskType]*EMAScore // guarded by mu
}
// NewQualityTracker returns an empty QualityTracker ready for use.
func NewQualityTracker() *QualityTracker {
	tracker := new(QualityTracker)
	tracker.scores = make(map[ArmID]map[TaskType]*EMAScore)
	return tracker
}
// Record updates the EMA score for the given arm and task type.
// A success counts as 1.0 and a failure as 0.0; the very first observation
// seeds the average directly rather than being smoothed against zero.
func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) {
	var sample float64
	if success {
		sample = 1.0
	}

	qt.mu.Lock()
	defer qt.mu.Unlock()

	byTask, ok := qt.scores[armID]
	if !ok {
		byTask = make(map[TaskType]*EMAScore)
		qt.scores[armID] = byTask
	}
	ema, ok := byTask[taskType]
	if !ok {
		ema = &EMAScore{}
		byTask[taskType] = ema
	}

	switch ema.Count {
	case 0:
		// First sample seeds the average.
		ema.Value = sample
	default:
		ema.Value = qualityAlpha*sample + (1-qualityAlpha)*ema.Value
	}
	ema.Count++
}
// Quality returns the observed EMA score for an arm+task combination.
// It reports (0, false) until at least minObservations outcomes have been
// recorded, letting callers fall back to their heuristic.
func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) {
	qt.mu.RLock()
	defer qt.mu.RUnlock()

	if byTask, ok := qt.scores[armID]; ok {
		if ema, ok := byTask[taskType]; ok && ema.Count >= minObservations {
			return ema.Value, true
		}
	}
	return 0, false
}

View File

@@ -0,0 +1,58 @@
package router_test
import (
"testing"
"somegit.dev/Owlibou/gnoma/internal/router"
)
// An arm that has never been observed must report no data, so the router
// falls back to its heuristic score.
func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) {
	qt := router.NewQualityTracker()
	if _, hasData := qt.Quality("arm:model", router.TaskGeneration); hasData {
		t.Error("expected no data for unobserved arm")
	}
}
// After three straight successes the EMA must be (approximately) 1.0: the
// first observation seeds the value and every later 1.0 sample keeps it there.
// The original assertion (0 < score <= 1) passed for any in-range value and
// would not have caught a miscomputed average.
func TestQualityTracker_RecordUpdatesEMA(t *testing.T) {
	qt := router.NewQualityTracker()
	for i := 0; i < 3; i++ {
		qt.Record("arm:model", router.TaskGeneration, true)
	}
	score, hasData := qt.Quality("arm:model", router.TaskGeneration)
	if !hasData {
		t.Fatal("expected data after 3 observations")
	}
	// Small tolerance below 1.0 absorbs float rounding in alpha*x + (1-alpha)*y.
	if score < 0.999 || score > 1.0 {
		t.Errorf("expected score ~1.0 after all successes, got %f", score)
	}
}
// Five straight failures must yield a score of exactly 0: the seed is 0.0 and
// alpha*0 + (1-alpha)*0 stays 0.0 with no rounding. The original test
// discarded hasData, so it passed vacuously whenever Quality returned
// (0, false) — e.g. if observations were never counted.
func TestQualityTracker_AllFailuresLowScore(t *testing.T) {
	qt := router.NewQualityTracker()
	for i := 0; i < 5; i++ {
		qt.Record("arm:model", router.TaskDebug, false)
	}
	score, hasData := qt.Quality("arm:model", router.TaskDebug)
	if !hasData {
		t.Fatal("expected data after 5 observations")
	}
	if score != 0 {
		t.Errorf("expected score 0 after all failures, got %f", score)
	}
}
// Ten goroutines record concurrently; run with -race to catch data races.
// The original test discarded hasData — after 10 observations (>= the
// minimum) Quality must report data, and ignoring that flag could hide
// lost updates.
func TestQualityTracker_ConcurrentSafe(t *testing.T) {
	qt := router.NewQualityTracker()
	done := make(chan struct{})
	for i := 0; i < 10; i++ {
		go func(success bool) {
			qt.Record("arm:model", router.TaskReview, success)
			done <- struct{}{}
		}(i%2 == 0)
	}
	for i := 0; i < 10; i++ {
		<-done
	}
	score, hasData := qt.Quality("arm:model", router.TaskReview)
	if !hasData {
		t.Fatal("expected data after 10 concurrent observations")
	}
	if score < 0 || score > 1 {
		t.Errorf("invalid score after concurrent writes: %f", score)
	}
}

View File

@@ -22,6 +22,8 @@ type Router struct {
forcedArm ArmID
// When true, only local arms are considered (incognito mode)
localOnly bool
quality *QualityTracker
}
type Config struct {
@@ -34,8 +36,9 @@ func New(cfg Config) *Router {
logger = slog.Default()
}
return &Router{
arms: make(map[ArmID]*Arm),
logger: logger,
arms: make(map[ArmID]*Arm),
logger: logger,
quality: NewQualityTracker(),
}
}
@@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision {
}
// Select best
best := selectBest(feasible, task)
best := selectBest(r.quality, feasible, task)
if best == nil {
return RoutingDecision{Error: fmt.Errorf("selection failed")}
}
@@ -140,25 +143,35 @@ func (r *Router) RemoveArm(id ArmID) {
// Outcome records the result of a task execution for quality feedback.
type Outcome struct {
ArmID ArmID
TaskType TaskType
Success bool
Tokens int
Duration time.Duration
ArmID ArmID
TaskType TaskType
Success bool
Tokens int
Duration time.Duration
ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis)
}
// ReportOutcome records a task execution result for quality tracking.
// Scores feed the quality tracker's EMA, which selection blends with the
// heuristic; M9 will extend this to full bandit learning.
func (r *Router) ReportOutcome(o Outcome) {
r.logger.Debug("outcome reported",
r.quality.Record(o.ArmID, o.TaskType, o.Success)
r.logger.Debug("outcome recorded",
"arm", o.ArmID,
"task", o.TaskType,
"success", o.Success,
"tokens", o.Tokens,
"duration", o.Duration,
"result_files", len(o.ResultFilePaths),
)
}
// LookupArm returns the arm with the given ID and whether it is registered.
func (r *Router) LookupArm(id ArmID) (*Arm, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	a, found := r.arms[id]
	return a, found
}
// Arms returns all registered arms.
func (r *Router) Arms() []*Arm {
r.mu.RLock()

View File

@@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) {
}
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
best := selectBest([]*Arm{withoutTools, withTools}, task)
best := selectBest(nil, []*Arm{withoutTools, withTools}, task)
if best.ID != "a/with-tools" {
t.Errorf("should prefer arm with tool support, got %s", best.ID)
@@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) {
}
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
best := selectBest([]*Arm{noThinking, thinking}, task)
best := selectBest(nil, []*Arm{noThinking, thinking}, task)
if best.ID != "a/thinking" {
t.Errorf("should prefer thinking model for planning, got %s", best.ID)

View File

@@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() {
}
}
// selectBest picks the highest-scoring feasible arm using heuristic scoring.
// No bandit learning — that's M9. Just smart defaults based on model size,
// locality, task type, cost, and pool scarcity.
func selectBest(arms []*Arm, task Task) *Arm {
// selectBest picks the highest-scoring feasible arm, blending heuristic and
// observed EMA quality when enough data is available.
func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm {
if len(arms) == 0 {
return nil
}
@@ -48,7 +47,7 @@ func selectBest(arms []*Arm, task Task) *Arm {
bestScore := math.Inf(-1)
for _, arm := range arms {
score := scoreArm(arm, task)
score := scoreArm(qt, arm, task)
if score > bestScore {
bestScore = score
best = arm
@@ -58,17 +57,23 @@ func selectBest(arms []*Arm, task Task) *Arm {
return best
}
// scoreArm computes a heuristic quality/cost score for an arm.
// scoreArm computes a quality/cost score for an arm.
// When the quality tracker has sufficient observations, blends observed EMA
// (70%) with heuristic (30%). Falls back to pure heuristic otherwise.
// Score = (quality × value) / effective_cost
func scoreArm(arm *Arm, task Task) float64 {
quality := heuristicQuality(arm, task)
func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 {
hq := heuristicQuality(arm, task)
quality := hq
if qt != nil {
if observed, hasData := qt.Quality(arm.ID, task.Type); hasData {
quality = 0.7*observed + 0.3*hq
}
}
value := task.ValueScore()
cost := effectiveCost(arm, task)
if cost <= 0 {
cost = 0.001 // prevent division by zero for free local models
cost = 0.001
}
return (quality * value) / cost
}

View File

@@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
if t.store != nil {
preSave, _ = t.store.List("")
}
_ = preSave // used in Task 4 for ResultFilePaths diff
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
if err != nil {
@@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
}
// Report outcome to router for quality feedback
// Attribute /tmp result files produced during this elf's run
if t.store != nil {
postSave, _ := t.store.List("")
preSet := make(map[string]bool, len(preSave))
for _, f := range preSave {
preSet[f.Path] = true
}
for _, f := range postSave {
if !preSet[f.Path] {
result.ResultFilePaths = append(result.ResultFilePaths, f.Path)
}
}
}
t.manager.ReportResult(result)
// Send done signal — stays in tree until turn completes

View File

@@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
if t.store != nil {
preSave, _ = t.store.List("")
}
_ = preSave // used in Task 4
// Spawn all elfs with slight stagger to avoid rate limit bursts
type elfEntry struct {
@@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
}
}
// Report outcome to router
// For batch elfs, attribute all new /tmp files produced during the batch
if t.store != nil {
postSave, _ := t.store.List("")
preSet := make(map[string]bool, len(preSave))
for _, f := range preSave {
preSet[f.Path] = true
}
var newPaths []string
for _, f := range postSave {
if !preSet[f.Path] {
newPaths = append(newPaths, f.Path)
}
}
results[idx].ResultFilePaths = newPaths
}
t.manager.ReportResult(results[idx])
// Send done progress