feat: QualityTracker — EMA router feedback from elf outcomes, ResultFilePaths tracking

This commit is contained in:
2026-04-05 22:08:08 +02:00
parent 8a846bd024
commit 6cf5e92957
9 changed files with 208 additions and 39 deletions

View File

@@ -42,13 +42,14 @@ func (s Status) String() string {
// Result is the output of a completed elf.
type Result struct {
ID string
Status Status
Messages []message.Message
Usage message.Usage
Output string // final text output
Error error
Duration time.Duration
ID string
Status Status
Messages []message.Message
Usage message.Usage
Output string // final text output
Error error
Duration time.Duration
ResultFilePaths []string // paths to /tmp results produced by this elf's tools
}
// Elf is a sub-agent with its own engine and conversation history.

View File

@@ -135,11 +135,12 @@ func (m *Manager) ReportResult(result Result) {
meta.decision.Commit(int(result.Usage.TotalTokens()))
m.router.ReportOutcome(router.Outcome{
ArmID: meta.armID,
TaskType: meta.taskType,
Success: result.Status == StatusCompleted,
Tokens: int(result.Usage.TotalTokens()),
Duration: result.Duration,
ArmID: meta.armID,
TaskType: meta.taskType,
Success: result.Status == StatusCompleted,
Tokens: int(result.Usage.TotalTokens()),
Duration: result.Duration,
ResultFilePaths: result.ResultFilePaths,
})
}

View File

@@ -0,0 +1,67 @@
package router
import "sync"
// Tuning knobs for the quality EMA.
const (
	// qualityAlpha is the EMA smoothing factor; at 0.3, roughly the last
	// ~3 observations dominate the score.
	qualityAlpha = 0.3
	// minObservations is the minimum number of recorded outcomes before
	// Quality reports data and the observed score may override the heuristic.
	minObservations = 3
)
// EMAScore tracks an exponential moving average quality score.
type EMAScore struct {
	// Value is the current smoothed score in [0, 1]; seeded by the first
	// observation, then updated as alpha*obs + (1-alpha)*Value.
	Value float64
	// Count is the number of observations folded into Value so far.
	Count int
}
// QualityTracker records per-arm, per-task-type EMA quality scores from elf outcomes.
// It is safe for concurrent use.
type QualityTracker struct {
	mu     sync.RWMutex
	scores map[ArmID]map[TaskType]*EMAScore // guarded by mu
}
// NewQualityTracker returns an empty QualityTracker ready for use.
func NewQualityTracker() *QualityTracker {
	tracker := new(QualityTracker)
	tracker.scores = make(map[ArmID]map[TaskType]*EMAScore)
	return tracker
}
// Record updates the EMA score for the given arm and task type.
// A success counts as 1.0 and a failure as 0.0; the very first observation
// seeds the average directly rather than being smoothed against zero.
func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) {
	var sample float64
	if success {
		sample = 1.0
	}

	qt.mu.Lock()
	defer qt.mu.Unlock()

	byTask, ok := qt.scores[armID]
	if !ok {
		byTask = make(map[TaskType]*EMAScore)
		qt.scores[armID] = byTask
	}
	ema, ok := byTask[taskType]
	if !ok {
		ema = &EMAScore{}
		byTask[taskType] = ema
	}

	switch ema.Count {
	case 0:
		// First sample seeds the average.
		ema.Value = sample
	default:
		ema.Value = qualityAlpha*sample + (1-qualityAlpha)*ema.Value
	}
	ema.Count++
}
// Quality returns the observed EMA score for an arm+task combination.
// It reports (0, false) until at least minObservations outcomes have been
// recorded, letting callers fall back to their heuristic.
func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) {
	qt.mu.RLock()
	defer qt.mu.RUnlock()

	if byTask, ok := qt.scores[armID]; ok {
		if ema, ok := byTask[taskType]; ok && ema.Count >= minObservations {
			return ema.Value, true
		}
	}
	return 0, false
}

View File

@@ -0,0 +1,58 @@
package router_test
import (
"testing"
"somegit.dev/Owlibou/gnoma/internal/router"
)
// An arm that has never been observed must report no data, so the router
// falls back to its heuristic score.
func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) {
	qt := router.NewQualityTracker()
	if _, hasData := qt.Quality("arm:model", router.TaskGeneration); hasData {
		t.Error("expected no data for unobserved arm")
	}
}
// After three straight successes the EMA must be (approximately) 1.0: the
// first observation seeds the value and every later 1.0 sample keeps it there.
// The original assertion (0 < score <= 1) passed for any in-range value and
// would not have caught a miscomputed average.
func TestQualityTracker_RecordUpdatesEMA(t *testing.T) {
	qt := router.NewQualityTracker()
	for i := 0; i < 3; i++ {
		qt.Record("arm:model", router.TaskGeneration, true)
	}
	score, hasData := qt.Quality("arm:model", router.TaskGeneration)
	if !hasData {
		t.Fatal("expected data after 3 observations")
	}
	// Small tolerance below 1.0 absorbs float rounding in alpha*x + (1-alpha)*y.
	if score < 0.999 || score > 1.0 {
		t.Errorf("expected score ~1.0 after all successes, got %f", score)
	}
}
// Five straight failures must yield a score of exactly 0: the seed is 0.0 and
// alpha*0 + (1-alpha)*0 stays 0.0 with no rounding. The original test
// discarded hasData, so it passed vacuously whenever Quality returned
// (0, false) — e.g. if observations were never counted.
func TestQualityTracker_AllFailuresLowScore(t *testing.T) {
	qt := router.NewQualityTracker()
	for i := 0; i < 5; i++ {
		qt.Record("arm:model", router.TaskDebug, false)
	}
	score, hasData := qt.Quality("arm:model", router.TaskDebug)
	if !hasData {
		t.Fatal("expected data after 5 observations")
	}
	if score != 0 {
		t.Errorf("expected score 0 after all failures, got %f", score)
	}
}
// Ten goroutines record concurrently; run with -race to catch data races.
// The original test discarded hasData — after 10 observations (>= the
// minimum) Quality must report data, and ignoring that flag could hide
// lost updates.
func TestQualityTracker_ConcurrentSafe(t *testing.T) {
	qt := router.NewQualityTracker()
	done := make(chan struct{})
	for i := 0; i < 10; i++ {
		go func(success bool) {
			qt.Record("arm:model", router.TaskReview, success)
			done <- struct{}{}
		}(i%2 == 0)
	}
	for i := 0; i < 10; i++ {
		<-done
	}
	score, hasData := qt.Quality("arm:model", router.TaskReview)
	if !hasData {
		t.Fatal("expected data after 10 concurrent observations")
	}
	if score < 0 || score > 1 {
		t.Errorf("invalid score after concurrent writes: %f", score)
	}
}

View File

@@ -22,6 +22,8 @@ type Router struct {
forcedArm ArmID
// When true, only local arms are considered (incognito mode)
localOnly bool
quality *QualityTracker
}
type Config struct {
@@ -34,8 +36,9 @@ func New(cfg Config) *Router {
logger = slog.Default()
}
return &Router{
arms: make(map[ArmID]*Arm),
logger: logger,
arms: make(map[ArmID]*Arm),
logger: logger,
quality: NewQualityTracker(),
}
}
@@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision {
}
// Select best
best := selectBest(feasible, task)
best := selectBest(r.quality, feasible, task)
if best == nil {
return RoutingDecision{Error: fmt.Errorf("selection failed")}
}
@@ -140,25 +143,35 @@ func (r *Router) RemoveArm(id ArmID) {
// Outcome records the result of a task execution for quality feedback.
type Outcome struct {
ArmID ArmID
TaskType TaskType
Success bool
Tokens int
Duration time.Duration
ArmID ArmID
TaskType TaskType
Success bool
Tokens int
Duration time.Duration
ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis)
}
// ReportOutcome records a task execution result for quality tracking.
// Scores feed the quality tracker's EMA, which selection blends with the
// heuristic; M9 will extend this to full bandit learning.
func (r *Router) ReportOutcome(o Outcome) {
r.logger.Debug("outcome reported",
r.quality.Record(o.ArmID, o.TaskType, o.Success)
r.logger.Debug("outcome recorded",
"arm", o.ArmID,
"task", o.TaskType,
"success", o.Success,
"tokens", o.Tokens,
"duration", o.Duration,
"result_files", len(o.ResultFilePaths),
)
}
// LookupArm returns the arm with the given ID and whether it is registered.
func (r *Router) LookupArm(id ArmID) (*Arm, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	a, found := r.arms[id]
	return a, found
}
// Arms returns all registered arms.
func (r *Router) Arms() []*Arm {
r.mu.RLock()

View File

@@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) {
}
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
best := selectBest([]*Arm{withoutTools, withTools}, task)
best := selectBest(nil, []*Arm{withoutTools, withTools}, task)
if best.ID != "a/with-tools" {
t.Errorf("should prefer arm with tool support, got %s", best.ID)
@@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) {
}
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
best := selectBest([]*Arm{noThinking, thinking}, task)
best := selectBest(nil, []*Arm{noThinking, thinking}, task)
if best.ID != "a/thinking" {
t.Errorf("should prefer thinking model for planning, got %s", best.ID)

View File

@@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() {
}
}
// selectBest picks the highest-scoring feasible arm using heuristic scoring.
// No bandit learning — that's M9. Just smart defaults based on model size,
// locality, task type, cost, and pool scarcity.
func selectBest(arms []*Arm, task Task) *Arm {
// selectBest picks the highest-scoring feasible arm, blending heuristic and
// observed EMA quality when enough data is available.
func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm {
if len(arms) == 0 {
return nil
}
@@ -48,7 +47,7 @@ func selectBest(arms []*Arm, task Task) *Arm {
bestScore := math.Inf(-1)
for _, arm := range arms {
score := scoreArm(arm, task)
score := scoreArm(qt, arm, task)
if score > bestScore {
bestScore = score
best = arm
@@ -58,17 +57,23 @@ func selectBest(arms []*Arm, task Task) *Arm {
return best
}
// scoreArm computes a heuristic quality/cost score for an arm.
// scoreArm computes a quality/cost score for an arm.
// When the quality tracker has sufficient observations, blends observed EMA
// (70%) with heuristic (30%). Falls back to pure heuristic otherwise.
// Score = (quality × value) / effective_cost
func scoreArm(arm *Arm, task Task) float64 {
quality := heuristicQuality(arm, task)
func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 {
hq := heuristicQuality(arm, task)
quality := hq
if qt != nil {
if observed, hasData := qt.Quality(arm.ID, task.Type); hasData {
quality = 0.7*observed + 0.3*hq
}
}
value := task.ValueScore()
cost := effectiveCost(arm, task)
if cost <= 0 {
cost = 0.001 // prevent division by zero for free local models
cost = 0.001
}
return (quality * value) / cost
}

View File

@@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
if t.store != nil {
preSave, _ = t.store.List("")
}
_ = preSave // used in Task 4 for ResultFilePaths diff
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
if err != nil {
@@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
}
// Report outcome to router for quality feedback
// Attribute /tmp result files produced during this elf's run
if t.store != nil {
postSave, _ := t.store.List("")
preSet := make(map[string]bool, len(preSave))
for _, f := range preSave {
preSet[f.Path] = true
}
for _, f := range postSave {
if !preSet[f.Path] {
result.ResultFilePaths = append(result.ResultFilePaths, f.Path)
}
}
}
t.manager.ReportResult(result)
// Send done signal — stays in tree until turn completes

View File

@@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
if t.store != nil {
preSave, _ = t.store.List("")
}
_ = preSave // used in Task 4
// Spawn all elfs with slight stagger to avoid rate limit bursts
type elfEntry struct {
@@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
}
}
// Report outcome to router
// For batch elfs, attribute all new /tmp files produced during the batch
if t.store != nil {
postSave, _ := t.store.List("")
preSet := make(map[string]bool, len(preSave))
for _, f := range preSave {
preSet[f.Path] = true
}
var newPaths []string
for _, f := range postSave {
if !preSet[f.Path] {
newPaths = append(newPaths, f.Path)
}
}
results[idx].ResultFilePaths = newPaths
}
t.manager.ReportResult(results[idx])
// Send done progress