feat: QualityTracker — EMA router feedback from elf outcomes, ResultFilePaths tracking
This commit is contained in:
@@ -49,6 +49,7 @@ type Result struct {
|
|||||||
Output string // final text output
|
Output string // final text output
|
||||||
Error error
|
Error error
|
||||||
Duration time.Duration
|
Duration time.Duration
|
||||||
|
ResultFilePaths []string // paths to /tmp results produced by this elf's tools
|
||||||
}
|
}
|
||||||
|
|
||||||
// Elf is a sub-agent with its own engine and conversation history.
|
// Elf is a sub-agent with its own engine and conversation history.
|
||||||
|
|||||||
@@ -140,6 +140,7 @@ func (m *Manager) ReportResult(result Result) {
|
|||||||
Success: result.Status == StatusCompleted,
|
Success: result.Status == StatusCompleted,
|
||||||
Tokens: int(result.Usage.TotalTokens()),
|
Tokens: int(result.Usage.TotalTokens()),
|
||||||
Duration: result.Duration,
|
Duration: result.Duration,
|
||||||
|
ResultFilePaths: result.ResultFilePaths,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
67
internal/router/feedback.go
Normal file
67
internal/router/feedback.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package router
|
||||||
|
|
||||||
|
import "sync"
|
||||||
|
|
||||||
|
const (
	// qualityAlpha is the EMA smoothing factor; higher values weight recent
	// observations more heavily (~3-sample effective memory at 0.3).
	qualityAlpha = 0.3

	// minObservations is the minimum number of recorded samples before the
	// observed EMA score is trusted over the static heuristic.
	minObservations = 3
)
|
||||||
|
|
||||||
|
// EMAScore tracks an exponential moving average quality score.
type EMAScore struct {
	Value float64 // current EMA; seeded with the first raw observation
	Count int     // total observations recorded so far
}
|
||||||
|
|
||||||
|
// QualityTracker records per-arm, per-task-type EMA quality scores from elf
// outcomes. All methods are safe for concurrent use.
type QualityTracker struct {
	mu     sync.RWMutex
	scores map[ArmID]map[TaskType]*EMAScore // guarded by mu
}
|
||||||
|
|
||||||
|
// NewQualityTracker returns an empty QualityTracker.
|
||||||
|
func NewQualityTracker() *QualityTracker {
|
||||||
|
return &QualityTracker{
|
||||||
|
scores: make(map[ArmID]map[TaskType]*EMAScore),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record updates the EMA score for the given arm and task type.
|
||||||
|
func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) {
|
||||||
|
observation := 0.0
|
||||||
|
if success {
|
||||||
|
observation = 1.0
|
||||||
|
}
|
||||||
|
qt.mu.Lock()
|
||||||
|
defer qt.mu.Unlock()
|
||||||
|
if qt.scores[armID] == nil {
|
||||||
|
qt.scores[armID] = make(map[TaskType]*EMAScore)
|
||||||
|
}
|
||||||
|
s := qt.scores[armID][taskType]
|
||||||
|
if s == nil {
|
||||||
|
s = &EMAScore{}
|
||||||
|
qt.scores[armID][taskType] = s
|
||||||
|
}
|
||||||
|
if s.Count == 0 {
|
||||||
|
s.Value = observation
|
||||||
|
} else {
|
||||||
|
s.Value = qualityAlpha*observation + (1-qualityAlpha)*s.Value
|
||||||
|
}
|
||||||
|
s.Count++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quality returns the observed EMA score for an arm+task combination.
|
||||||
|
// Returns (0, false) when fewer than minObservations have been recorded.
|
||||||
|
func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) {
|
||||||
|
qt.mu.RLock()
|
||||||
|
defer qt.mu.RUnlock()
|
||||||
|
m, ok := qt.scores[armID]
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
s, ok := m[taskType]
|
||||||
|
if !ok || s.Count < minObservations {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return s.Value, true
|
||||||
|
}
|
||||||
58
internal/router/feedback_test.go
Normal file
58
internal/router/feedback_test.go
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
package router_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"somegit.dev/Owlibou/gnoma/internal/router"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
_, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||||
|
if hasData {
|
||||||
|
t.Error("expected no data for unobserved arm")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_RecordUpdatesEMA(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
qt.Record("arm:model", router.TaskGeneration, true)
|
||||||
|
}
|
||||||
|
score, hasData := qt.Quality("arm:model", router.TaskGeneration)
|
||||||
|
if !hasData {
|
||||||
|
t.Fatal("expected data after 3 observations")
|
||||||
|
}
|
||||||
|
if score <= 0 || score > 1 {
|
||||||
|
t.Errorf("score out of range [0,1]: %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_AllFailuresLowScore(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
qt.Record("arm:model", router.TaskDebug, false)
|
||||||
|
}
|
||||||
|
score, _ := qt.Quality("arm:model", router.TaskDebug)
|
||||||
|
if score > 0.3 {
|
||||||
|
t.Errorf("expected low score after all failures, got %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQualityTracker_ConcurrentSafe(t *testing.T) {
|
||||||
|
qt := router.NewQualityTracker()
|
||||||
|
done := make(chan struct{})
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
go func(success bool) {
|
||||||
|
qt.Record("arm:model", router.TaskReview, success)
|
||||||
|
done <- struct{}{}
|
||||||
|
}(i%2 == 0)
|
||||||
|
}
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
<-done
|
||||||
|
}
|
||||||
|
score, _ := qt.Quality("arm:model", router.TaskReview)
|
||||||
|
if score < 0 || score > 1 {
|
||||||
|
t.Errorf("invalid score after concurrent writes: %f", score)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -22,6 +22,8 @@ type Router struct {
|
|||||||
forcedArm ArmID
|
forcedArm ArmID
|
||||||
// When true, only local arms are considered (incognito mode)
|
// When true, only local arms are considered (incognito mode)
|
||||||
localOnly bool
|
localOnly bool
|
||||||
|
|
||||||
|
quality *QualityTracker
|
||||||
}
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
@@ -36,6 +38,7 @@ func New(cfg Config) *Router {
|
|||||||
return &Router{
|
return &Router{
|
||||||
arms: make(map[ArmID]*Arm),
|
arms: make(map[ArmID]*Arm),
|
||||||
logger: logger,
|
logger: logger,
|
||||||
|
quality: NewQualityTracker(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Select best
|
// Select best
|
||||||
best := selectBest(feasible, task)
|
best := selectBest(r.quality, feasible, task)
|
||||||
if best == nil {
|
if best == nil {
|
||||||
return RoutingDecision{Error: fmt.Errorf("selection failed")}
|
return RoutingDecision{Error: fmt.Errorf("selection failed")}
|
||||||
}
|
}
|
||||||
@@ -145,20 +148,30 @@ type Outcome struct {
|
|||||||
Success bool
|
Success bool
|
||||||
Tokens int
|
Tokens int
|
||||||
Duration time.Duration
|
Duration time.Duration
|
||||||
|
ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReportOutcome records a task execution result for quality tracking.
// The outcome feeds the per-arm EMA consulted by arm selection; the debug
// log line mirrors the recorded fields for observability.
func (r *Router) ReportOutcome(o Outcome) {
	r.quality.Record(o.ArmID, o.TaskType, o.Success)
	r.logger.Debug("outcome recorded",
		"arm", o.ArmID,
		"task", o.TaskType,
		"success", o.Success,
		"tokens", o.Tokens,
		"duration", o.Duration,
		// Log the count only; the paths themselves may be numerous.
		"result_files", len(o.ResultFilePaths),
	)
}
|
||||||
|
|
||||||
|
// LookupArm returns the arm with the given ID, if registered.
|
||||||
|
func (r *Router) LookupArm(id ArmID) (*Arm, bool) {
|
||||||
|
r.mu.RLock()
|
||||||
|
defer r.mu.RUnlock()
|
||||||
|
arm, ok := r.arms[id]
|
||||||
|
return arm, ok
|
||||||
|
}
|
||||||
|
|
||||||
// Arms returns all registered arms.
|
// Arms returns all registered arms.
|
||||||
func (r *Router) Arms() []*Arm {
|
func (r *Router) Arms() []*Arm {
|
||||||
r.mu.RLock()
|
r.mu.RLock()
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
|
task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal}
|
||||||
best := selectBest([]*Arm{withoutTools, withTools}, task)
|
best := selectBest(nil, []*Arm{withoutTools, withTools}, task)
|
||||||
|
|
||||||
if best.ID != "a/with-tools" {
|
if best.ID != "a/with-tools" {
|
||||||
t.Errorf("should prefer arm with tool support, got %s", best.ID)
|
t.Errorf("should prefer arm with tool support, got %s", best.ID)
|
||||||
@@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
|
task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000}
|
||||||
best := selectBest([]*Arm{noThinking, thinking}, task)
|
best := selectBest(nil, []*Arm{noThinking, thinking}, task)
|
||||||
|
|
||||||
if best.ID != "a/thinking" {
|
if best.ID != "a/thinking" {
|
||||||
t.Errorf("should prefer thinking model for planning, got %s", best.ID)
|
t.Errorf("should prefer thinking model for planning, got %s", best.ID)
|
||||||
|
|||||||
@@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// selectBest picks the highest-scoring feasible arm using heuristic scoring.
|
// selectBest picks the highest-scoring feasible arm, blending heuristic and
|
||||||
// No bandit learning — that's M9. Just smart defaults based on model size,
|
// observed EMA quality when enough data is available.
|
||||||
// locality, task type, cost, and pool scarcity.
|
func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm {
|
||||||
func selectBest(arms []*Arm, task Task) *Arm {
|
|
||||||
if len(arms) == 0 {
|
if len(arms) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -48,7 +47,7 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
|||||||
bestScore := math.Inf(-1)
|
bestScore := math.Inf(-1)
|
||||||
|
|
||||||
for _, arm := range arms {
|
for _, arm := range arms {
|
||||||
score := scoreArm(arm, task)
|
score := scoreArm(qt, arm, task)
|
||||||
if score > bestScore {
|
if score > bestScore {
|
||||||
bestScore = score
|
bestScore = score
|
||||||
best = arm
|
best = arm
|
||||||
@@ -58,17 +57,23 @@ func selectBest(arms []*Arm, task Task) *Arm {
|
|||||||
return best
|
return best
|
||||||
}
|
}
|
||||||
|
|
||||||
// scoreArm computes a heuristic quality/cost score for an arm.
|
// scoreArm computes a quality/cost score for an arm.
|
||||||
|
// When the quality tracker has sufficient observations, blends observed EMA
|
||||||
|
// (70%) with heuristic (30%). Falls back to pure heuristic otherwise.
|
||||||
// Score = (quality × value) / effective_cost
|
// Score = (quality × value) / effective_cost
|
||||||
func scoreArm(arm *Arm, task Task) float64 {
|
func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 {
|
||||||
quality := heuristicQuality(arm, task)
|
hq := heuristicQuality(arm, task)
|
||||||
|
quality := hq
|
||||||
|
if qt != nil {
|
||||||
|
if observed, hasData := qt.Quality(arm.ID, task.Type); hasData {
|
||||||
|
quality = 0.7*observed + 0.3*hq
|
||||||
|
}
|
||||||
|
}
|
||||||
value := task.ValueScore()
|
value := task.ValueScore()
|
||||||
cost := effectiveCost(arm, task)
|
cost := effectiveCost(arm, task)
|
||||||
|
|
||||||
if cost <= 0 {
|
if cost <= 0 {
|
||||||
cost = 0.001 // prevent division by zero for free local models
|
cost = 0.001
|
||||||
}
|
}
|
||||||
|
|
||||||
return (quality * value) / cost
|
return (quality * value) / cost
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
|||||||
if t.store != nil {
|
if t.store != nil {
|
||||||
preSave, _ = t.store.List("")
|
preSave, _ = t.store.List("")
|
||||||
}
|
}
|
||||||
_ = preSave // used in Task 4 for ResultFilePaths diff
|
|
||||||
|
|
||||||
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
|
e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result,
|
|||||||
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
|
return tool.Result{Output: "Elf timed out after 5 minutes"}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report outcome to router for quality feedback
|
// Attribute /tmp result files produced during this elf's run
|
||||||
|
if t.store != nil {
|
||||||
|
postSave, _ := t.store.List("")
|
||||||
|
preSet := make(map[string]bool, len(preSave))
|
||||||
|
for _, f := range preSave {
|
||||||
|
preSet[f.Path] = true
|
||||||
|
}
|
||||||
|
for _, f := range postSave {
|
||||||
|
if !preSet[f.Path] {
|
||||||
|
result.ResultFilePaths = append(result.ResultFilePaths, f.Path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
t.manager.ReportResult(result)
|
t.manager.ReportResult(result)
|
||||||
|
|
||||||
// Send done signal — stays in tree until turn completes
|
// Send done signal — stays in tree until turn completes
|
||||||
|
|||||||
@@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
|||||||
if t.store != nil {
|
if t.store != nil {
|
||||||
preSave, _ = t.store.List("")
|
preSave, _ = t.store.List("")
|
||||||
}
|
}
|
||||||
_ = preSave // used in Task 4
|
|
||||||
|
|
||||||
// Spawn all elfs with slight stagger to avoid rate limit bursts
|
// Spawn all elfs with slight stagger to avoid rate limit bursts
|
||||||
type elfEntry struct {
|
type elfEntry struct {
|
||||||
@@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report outcome to router
|
// For batch elfs, attribute all new /tmp files produced during the batch
|
||||||
|
if t.store != nil {
|
||||||
|
postSave, _ := t.store.List("")
|
||||||
|
preSet := make(map[string]bool, len(preSave))
|
||||||
|
for _, f := range preSave {
|
||||||
|
preSet[f.Path] = true
|
||||||
|
}
|
||||||
|
var newPaths []string
|
||||||
|
for _, f := range postSave {
|
||||||
|
if !preSet[f.Path] {
|
||||||
|
newPaths = append(newPaths, f.Path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
results[idx].ResultFilePaths = newPaths
|
||||||
|
}
|
||||||
t.manager.ReportResult(results[idx])
|
t.manager.ReportResult(results[idx])
|
||||||
|
|
||||||
// Send done progress
|
// Send done progress
|
||||||
|
|||||||
Reference in New Issue
Block a user