feat: QualityTracker — EMA router feedback from elf outcomes, ResultFilePaths tracking
This commit is contained in:
@@ -49,6 +49,7 @@ type Result struct {
|
||||
Output string // final text output
|
||||
Error error
|
||||
Duration time.Duration
|
||||
ResultFilePaths []string // paths to /tmp results produced by this elf's tools
|
||||
}
|
||||
|
||||
// Elf is a sub-agent with its own engine and conversation history.
|
||||
|
||||
@@ -140,6 +140,7 @@ func (m *Manager) ReportResult(result Result) {
|
||||
Success: result.Status == StatusCompleted,
|
||||
Tokens: int(result.Usage.TotalTokens()),
|
||||
Duration: result.Duration,
|
||||
ResultFilePaths: result.ResultFilePaths,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
67
internal/router/feedback.go
Normal file
67
internal/router/feedback.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package router
|
||||
|
||||
import "sync"
|
||||
|
||||
const (
	// qualityAlpha is the EMA smoothing factor; at 0.3 the most recent
	// ~3 observations dominate the average.
	qualityAlpha = 0.3
	// minObservations is the minimum sample count before Quality reports
	// hasData=true, i.e. before the observed score overrides the heuristic.
	minObservations = 3
)
|
||||
|
||||
// EMAScore tracks an exponential moving average quality score.
// Observations are 0/1 (failure/success), so Value stays in [0, 1].
type EMAScore struct {
	Value float64 // current EMA; seeded directly by the first observation
	Count int     // total observations recorded so far
}
|
||||
|
||||
// QualityTracker records per-arm, per-task-type EMA quality scores from
// elf outcomes. All methods are safe for concurrent use.
type QualityTracker struct {
	mu     sync.RWMutex
	scores map[ArmID]map[TaskType]*EMAScore // guarded by mu; inner maps created lazily
}
|
||||
|
||||
// NewQualityTracker returns an empty QualityTracker.
|
||||
func NewQualityTracker() *QualityTracker {
|
||||
return &QualityTracker{
|
||||
scores: make(map[ArmID]map[TaskType]*EMAScore),
|
||||
}
|
||||
}
|
||||
|
||||
// Record updates the EMA score for the given arm and task type.
|
||||
func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) {
|
||||
observation := 0.0
|
||||
if success {
|
||||
observation = 1.0
|
||||
}
|
||||
qt.mu.Lock()
|
||||
defer qt.mu.Unlock()
|
||||
if qt.scores[armID] == nil {
|
||||
qt.scores[armID] = make(map[TaskType]*EMAScore)
|
||||
}
|
||||
s := qt.scores[armID][taskType]
|
||||
if s == nil {
|
||||
s = &EMAScore{}
|
||||
qt.scores[armID][taskType] = s
|
||||
}
|
||||
if s.Count == 0 {
|
||||
s.Value = observation
|
||||
} else {
|
||||
s.Value = qualityAlpha*observation + (1-qualityAlpha)*s.Value
|
||||
}
|
||||
s.Count++
|
||||
}
|
||||
|
||||
// Quality returns the observed EMA score for an arm+task combination.
|
||||
// Returns (0, false) when fewer than minObservations have been recorded.
|
||||
func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) {
|
||||
qt.mu.RLock()
|
||||
defer qt.mu.RUnlock()
|
||||
m, ok := qt.scores[armID]
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
s, ok := m[taskType]
|
||||
if !ok || s.Count < minObservations {
|
||||
return 0, false
|
||||
}
|
||||
return s.Value, true
|
||||
}
|
||||
58
internal/router/feedback_test.go
Normal file
58
internal/router/feedback_test.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package router_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"somegit.dev/Owlibou/gnoma/internal/router"
|
||||
)
|
||||
|
||||
func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) {
|
||||
qt := router.NewQualityTracker()
|
||||
_, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||
if hasData {
|
||||
t.Error("expected no data for unobserved arm")
|
||||
}
|
||||
}
|
||||
|
||||
func TestQualityTracker_RecordUpdatesEMA(t *testing.T) {
|
||||
qt := router.NewQualityTracker()
|
||||
for i := 0; i < 3; i++ {
|
||||
qt.Record("arm:model", router.TaskGeneration, true)
|
||||
}
|
||||
score, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||
if !hasData {
|
||||
t.Fatal("expected data after 3 observations")
|
||||
}
|
||||
if score <= 0 || score > 1 {
|
||||
t.Errorf("score out of range [0,1]: %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQualityTracker_AllFailuresLowScore(t *testing.T) {
|
||||
qt := router.NewQualityTracker()
|
||||
for i := 0; i < 5; i++ {
|
||||
qt.Record("arm:model", router.TaskDebug, false)
|
||||
}
|
||||
score, _ := qt.Quality("arm:model", router.TaskDebug)
|
||||
if score > 0.3 {
|
||||
t.Errorf("expected low score after all failures, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQualityTracker_ConcurrentSafe(t *testing.T) {
|
||||
qt := router.NewQualityTracker()
|
||||
done := make(chan struct{})
|
||||
for i := 0; i < 10; i++ {
|
||||
go func(success bool) {
|
||||
qt.Record("arm:model", router.TaskReview, success)
|
||||
done <- struct{}{}
|
||||
}(i%2 == 0)
|
||||
}
|
||||
for i := 0; i < 10; i++ {
|
||||
<-done
|
||||
}
|
||||
score, _ := qt.Quality("arm:model", router.TaskReview)
|
||||
if score < 0 || score > 1 {
|
||||
t.Errorf("invalid score after concurrent writes: %f", score)
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,8 @@ type Router struct {
|
||||
forcedArm ArmID
|
||||
// When true, only local arms are considered (incognito mode)
|
||||
localOnly bool
|
||||
|
||||
quality *QualityTracker
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
@@ -36,6 +38,7 @@ func New(cfg Config) *Router {
|
||||
return &Router{
|
||||
arms: make(map[ArmID]*Arm),
|
||||
logger: logger,
|
||||
quality: NewQualityTracker(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision {
|
||||
}
|
||||
|
||||
// Select best
|
||||
best := selectBest(feasible, task)
|
||||
best := selectBest(r.quality, feasible, task)
|
||||
if best == nil {
|
||||
return RoutingDecision{Error: fmt.Errorf("selection failed")}
|
||||
}
|
||||
@@ -145,20 +148,30 @@ type Outcome struct {
|
||||
Success bool
|
||||
Tokens int
|
||||
Duration time.Duration
|
||||
ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis)
|
||||
}
|
||||
|
||||
// ReportOutcome records a task execution result for quality tracking.
|
||||
// M4: logs only. M9 will use this for bandit learning.
|
||||
func (r *Router) ReportOutcome(o Outcome) {
|
||||
r.logger.Debug("outcome reported",
|
||||
r.quality.Record(o.ArmID, o.TaskType, o.Success)
|
||||
r.logger.Debug("outcome recorded",
|
||||
"arm", o.ArmID,
|
||||
"task", o.TaskType,
|
||||
"success", o.Success,
|
||||
"tokens", o.Tokens,
|
||||
"duration", o.Duration,
|
||||
"result_files", len(o.ResultFilePaths),
|
||||
)
|
||||
}
|
||||
|
||||
// LookupArm returns the arm with the given ID, if registered.
|
||||
func (r *Router) LookupArm(id ArmID) (*Arm, bool) {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
arm, ok := r.arms[id]
|
||||
return arm, ok
|
||||
}
|
||||
|
||||
// Arms returns all registered arms.
|
||||
func (r *Router) Arms() []*Arm {
|
||||
r.mu.RLock()
|
||||
|
||||
@@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) {
|
||||
}
|
||||
|
||||
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
|
||||
best := selectBest([]*Arm{withoutTools, withTools}, task)
|
||||
best := selectBest(nil, []*Arm{withoutTools, withTools}, task)
|
||||
|
||||
if best.ID != "a/with-tools" {
|
||||
t.Errorf("should prefer arm with tool support, got %s", best.ID)
|
||||
@@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) {
|
||||
}
|
||||
|
||||
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
|
||||
best := selectBest([]*Arm{noThinking, thinking}, task)
|
||||
best := selectBest(nil, []*Arm{noThinking, thinking}, task)
|
||||
|
||||
if best.ID != "a/thinking" {
|
||||
t.Errorf("should prefer thinking model for planning, got %s", best.ID)
|
||||
|
||||
@@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() {
|
||||
}
|
||||
}
|
||||
|
||||
// selectBest picks the highest-scoring feasible arm using heuristic scoring.
|
||||
// No bandit learning — that's M9. Just smart defaults based on model size,
|
||||
// locality, task type, cost, and pool scarcity.
|
||||
func selectBest(arms []*Arm, task Task) *Arm {
|
||||
// selectBest picks the highest-scoring feasible arm, blending heuristic and
|
||||
// observed EMA quality when enough data is available.
|
||||
func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm {
|
||||
if len(arms) == 0 {
|
||||
return nil
|
||||
}
|
||||
@@ -48,7 +47,7 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
||||
bestScore := math.Inf(-1)
|
||||
|
||||
for _, arm := range arms {
|
||||
score := scoreArm(arm, task)
|
||||
score := scoreArm(qt, arm, task)
|
||||
if score > bestScore {
|
||||
bestScore = score
|
||||
best = arm
|
||||
@@ -58,17 +57,23 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
||||
return best
|
||||
}
|
||||
|
||||
// scoreArm computes a heuristic quality/cost score for an arm.
|
||||
// scoreArm computes a quality/cost score for an arm.
|
||||
// When the quality tracker has sufficient observations, blends observed EMA
|
||||
// (70%) with heuristic (30%). Falls back to pure heuristic otherwise.
|
||||
// Score = (quality × value) / effective_cost
|
||||
func scoreArm(arm *Arm, task Task) float64 {
|
||||
quality := heuristicQuality(arm, task)
|
||||
func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 {
|
||||
hq := heuristicQuality(arm, task)
|
||||
quality := hq
|
||||
if qt != nil {
|
||||
if observed, hasData := qt.Quality(arm.ID, task.Type); hasData {
|
||||
quality = 0.7*observed + 0.3*hq
|
||||
}
|
||||
}
|
||||
value := task.ValueScore()
|
||||
cost := effectiveCost(arm, task)
|
||||
|
||||
if cost <= 0 {
|
||||
cost = 0.001 // prevent division by zero for free local models
|
||||
cost = 0.001
|
||||
}
|
||||
|
||||
return (quality * value) / cost
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
||||
if t.store != nil {
|
||||
preSave, _ = t.store.List("")
|
||||
}
|
||||
_ = preSave // used in Task 4 for ResultFilePaths diff
|
||||
|
||||
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
|
||||
if err != nil {
|
||||
@@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
||||
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
|
||||
}
|
||||
|
||||
// Report outcome to router for quality feedback
|
||||
// Attribute /tmp result files produced during this elf's run
|
||||
if t.store != nil {
|
||||
postSave, _ := t.store.List("")
|
||||
preSet := make(map[string]bool, len(preSave))
|
||||
for _, f := range preSave {
|
||||
preSet[f.Path] = true
|
||||
}
|
||||
for _, f := range postSave {
|
||||
if !preSet[f.Path] {
|
||||
result.ResultFilePaths = append(result.ResultFilePaths, f.Path)
|
||||
}
|
||||
}
|
||||
}
|
||||
t.manager.ReportResult(result)
|
||||
|
||||
// Send done signal — stays in tree until turn completes
|
||||
|
||||
@@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
||||
if t.store != nil {
|
||||
preSave, _ = t.store.List("")
|
||||
}
|
||||
_ = preSave // used in Task 4
|
||||
|
||||
// Spawn all elfs with slight stagger to avoid rate limit bursts
|
||||
type elfEntry struct {
|
||||
@@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
||||
}
|
||||
}
|
||||
|
||||
// Report outcome to router
|
||||
// For batch elfs, attribute all new /tmp files produced during the batch
|
||||
if t.store != nil {
|
||||
postSave, _ := t.store.List("")
|
||||
preSet := make(map[string]bool, len(preSave))
|
||||
for _, f := range preSave {
|
||||
preSet[f.Path] = true
|
||||
}
|
||||
var newPaths []string
|
||||
for _, f := range postSave {
|
||||
if !preSet[f.Path] {
|
||||
newPaths = append(newPaths, f.Path)
|
||||
}
|
||||
}
|
||||
results[idx].ResultFilePaths = newPaths
|
||||
}
|
||||
t.manager.ReportResult(results[idx])
|
||||
|
||||
// Send done progress
|
||||
|
||||
Reference in New Issue
Block a user