fd327107df
DiscoverOllama() interpreted a nil probeCache as 'skip probing entirely' rather than 'probe but don't cache.' cmd/gnoma/main.go's synchronous discovery path passes nil, so every ollama-discovered model got SupportsTools=false (the Go zero value), regardless of what ollama actually reported in its capabilities field. The symptom: filterFeasible rejected every ollama arm for any tool-requiring task with reason=tools_required_but_unsupported, even when ollama itself reported the model as tool-capable. Verified via curl: qwen3:14b advertises capabilities=[completion, tools, thinking] and has 'tools' in its template, but the gnoma arm shipped with tool_use_capability=false. Fix: always run probeOllamaModel; treat probeCache as an optional memoisation aid only. nil cache now means 'no caching across calls' not 'no probing.' For users with many models, passing a real cache still avoids redundant HTTP calls — semantics for that path are unchanged. Surfaced via the new filterFeasible Debug logging from the previous commit, which made the per-arm rejection reasons visible.
535 lines
16 KiB
Go
535 lines
16 KiB
Go
package router
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"somegit.dev/Owlibou/gnoma/internal/provider"
|
|
)
|
|
|
|
// discoveryTimeout is the deadline applied to one discovery pass against a
// single local provider.
const discoveryTimeout = 5 * time.Second

// Fallback context windows, in tokens, used when a provider's probe does not
// report a concrete num_ctx / n_ctx. The router reads ContextWindow == 0 as
// "tiny" (which forces two-stage tool routing), so an unprobed model must
// never be left at 0 or routing for every local arm would be skewed.
const (
	defaultOllamaContextSize   = 32 * 1024
	defaultLlamaCppContextSize = 8 * 1024
)
|
|
|
|
// DiscoveredModel describes one model reported by a local provider during
// discovery, together with the capabilities that were probed for it.
type DiscoveredModel struct {
	// ID is the provider's identifier for the model.
	ID string
	// Name is the display name of the model.
	Name string
	// Provider is "ollama" or "llamacpp".
	Provider string
	// Size is the model size in bytes, if available.
	Size int64
	// SupportsTools reports whether the model supports function/tool calling.
	SupportsTools bool
	// SupportsVision reports whether the model accepts image inputs
	// (multimodal).
	SupportsVision bool
	// ContextSize is the context window in tokens. It is always populated:
	// a provider-specific default is used if the probe was inconclusive.
	ContextSize int
}
|
|
|
|
// OllamaProbeResult holds everything learned about one model from a single
// /api/show request. Results are cached per model name so that discovery
// cycles do not re-probe every model.
//
// SupportsVision was introduced together with SupportsTools; callers that
// still keep a `map[string]bool` cache should move to
// `map[string]OllamaProbeResult`.
type OllamaProbeResult struct {
	// SupportsTools is true when the probe concluded the model can do
	// tool/function calling.
	SupportsTools bool
	// SupportsVision is true when the probe concluded the model accepts
	// image inputs.
	SupportsVision bool
	// ContextSize is the probed context window in tokens; 0 means the probe
	// did not report one.
	ContextSize int
}
|
|
|
|
// DiscoverOllama polls the local Ollama instance for available models.
|
|
// probeCache caches /api/show probe results per model name to avoid N
|
|
// requests per discovery cycle. Pass nil to probe every model
|
|
// unconditionally. The caller owns the cache and should pass the same
|
|
// map across cycles.
|
|
func DiscoverOllama(ctx context.Context, baseURL string, probeCache map[string]OllamaProbeResult) ([]DiscoveredModel, error) {
|
|
if baseURL == "" {
|
|
baseURL = "http://localhost:11434"
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, discoveryTimeout)
|
|
defer cancel()
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", baseURL+"/api/tags", nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
resp, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("ollama not reachable at %s: %w", baseURL, err)
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
var data struct {
|
|
Models []struct {
|
|
Name string `json:"name"`
|
|
Size int64 `json:"size"`
|
|
} `json:"models"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
discovered := make([]DiscoveredModel, 0, len(data.Models))
|
|
currentModels := make(map[string]bool, len(data.Models))
|
|
for _, m := range data.Models {
|
|
currentModels[m.Name] = true
|
|
dm := DiscoveredModel{
|
|
ID: m.Name,
|
|
Name: m.Name,
|
|
Provider: "ollama",
|
|
Size: m.Size,
|
|
}
|
|
|
|
// Always probe; the cache is optional. Previously nil-cache was
|
|
// treated as "skip probing entirely", which left SupportsTools
|
|
// at its zero value (false) for every model — every ollama-
|
|
// discovered arm then got marked as tool-unsupported and
|
|
// rejected by filterFeasible for any tool-requiring task. main.go
|
|
// passes nil from the synchronous discovery path; we still want
|
|
// real probe data there.
|
|
var result OllamaProbeResult
|
|
if probeCache != nil {
|
|
if cached, ok := probeCache[m.Name]; ok {
|
|
result = cached
|
|
} else {
|
|
result = probeOllamaModel(ctx, baseURL, m.Name)
|
|
probeCache[m.Name] = result
|
|
}
|
|
} else {
|
|
result = probeOllamaModel(ctx, baseURL, m.Name)
|
|
}
|
|
dm.SupportsTools = result.SupportsTools
|
|
dm.SupportsVision = result.SupportsVision
|
|
dm.ContextSize = result.ContextSize
|
|
|
|
if dm.ContextSize == 0 {
|
|
dm.ContextSize = defaultOllamaContextSize
|
|
}
|
|
|
|
discovered = append(discovered, dm)
|
|
}
|
|
|
|
// Prune cache entries for models that have disappeared since the last
|
|
// poll. Without this, the cache grows unbounded and stale entries linger
|
|
// (a reappearing model would replay an out-of-date probe verdict).
|
|
for name := range probeCache {
|
|
if !currentModels[name] {
|
|
delete(probeCache, name)
|
|
}
|
|
}
|
|
return discovered, nil
|
|
}
|
|
|
|
func probeOllamaModel(ctx context.Context, baseURL, model string) OllamaProbeResult {
|
|
req, err := http.NewRequestWithContext(ctx, "POST", baseURL+"/api/show", strings.NewReader(fmt.Sprintf(`{"name":"%s"}`, model)))
|
|
if err != nil {
|
|
return OllamaProbeResult{}
|
|
}
|
|
resp, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
return OllamaProbeResult{}
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
if resp.StatusCode != 200 {
|
|
return OllamaProbeResult{}
|
|
}
|
|
var data struct {
|
|
Template string `json:"template"`
|
|
Parameters string `json:"parameters"`
|
|
Details struct {
|
|
Families []string `json:"families"`
|
|
Family string `json:"family"`
|
|
} `json:"details"`
|
|
Capabilities []string `json:"capabilities"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
|
|
return OllamaProbeResult{}
|
|
}
|
|
|
|
// Heuristic for tool support: many modern models that support tools
|
|
// have "call" or "tool" or "json" in their template or system prompt
|
|
// logic. More specifically, Ollama's own tool-calling models often
|
|
// include specific jinja templates. Newer Ollama versions also
|
|
// advertise capabilities via the "capabilities" field.
|
|
supportsTools := strings.Contains(data.Template, ".Tool") ||
|
|
strings.Contains(data.Template, "tools") ||
|
|
strings.Contains(data.Template, "json")
|
|
for _, cap := range data.Capabilities {
|
|
if cap == "tools" {
|
|
supportsTools = true
|
|
}
|
|
}
|
|
|
|
// Vision detection: CLIP/vision encoder families show up in
|
|
// details.families (e.g. "clip", "mllama"); newer Ollama also lists
|
|
// "vision" in the capabilities array. Fall back to a name-pattern
|
|
// match for releases that predate the capabilities field.
|
|
supportsVision := false
|
|
for _, fam := range data.Details.Families {
|
|
f := strings.ToLower(fam)
|
|
if f == "clip" || f == "mllama" || strings.HasSuffix(f, "vl") {
|
|
supportsVision = true
|
|
break
|
|
}
|
|
}
|
|
for _, cap := range data.Capabilities {
|
|
if cap == "vision" {
|
|
supportsVision = true
|
|
}
|
|
}
|
|
if !supportsVision && isKnownVisionModelName(model) {
|
|
supportsVision = true
|
|
}
|
|
|
|
// Context size heuristic from parameters
|
|
contextSize := 0
|
|
if strings.Contains(data.Parameters, "num_ctx") {
|
|
// Ollama parameters are often a block of text: "num_ctx 4096\nstop <|end|>"
|
|
lines := strings.Split(data.Parameters, "\n")
|
|
for _, l := range lines {
|
|
if strings.HasPrefix(l, "num_ctx") {
|
|
_, _ = fmt.Sscanf(l, "num_ctx %d", &contextSize)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
return OllamaProbeResult{
|
|
SupportsTools: supportsTools,
|
|
SupportsVision: supportsVision,
|
|
ContextSize: contextSize,
|
|
}
|
|
}
|
|
|
|
// knownVisionModelPrefixes lists Ollama model name prefixes that ship as
// multimodal models. Used as a fallback when the /api/show response is
// missing details.families or the capabilities array (older Ollama).
var knownVisionModelPrefixes = []string{
	"llava",
	"bakllava",
	"moondream",
	"qwen2-vl",
	"qwen2.5-vl",
	"qwen3-vl",
	"llama3.2-vision",
	"llama4-vision",
	"minicpm-v",
	"cogvlm",
	"pixtral",
	"gemma3",  // gemma3 multimodal variants
	"gemma4",  // gemma4 base + edge (e2b, e4b) variants
	"gemma-4", // hyphenated GGUF naming (gemma-4-e2b-it, gemma-4-e4b-it)
	"glm-ocr", // vision-language model specialized for OCR
}

// isKnownVisionModelName reports whether model's name starts with one of
// knownVisionModelPrefixes, ignoring case.
//
// The prefix is tested against both the full name and the part after the
// last "/", so a namespaced or registry-qualified name such as
// "someorg/llava:7b" is recognised just like plain "llava:7b". A prefix-only
// check on the full name would be defeated by any namespace.
func isKnownVisionModelName(model string) bool {
	low := strings.ToLower(model)
	base := low
	if i := strings.LastIndex(low, "/"); i >= 0 {
		base = low[i+1:]
	}
	for _, p := range knownVisionModelPrefixes {
		if strings.HasPrefix(low, p) || strings.HasPrefix(base, p) {
			return true
		}
	}
	return false
}
|
|
|
|
// nonChatModelPatterns holds lower-case substrings identifying models that
// must not be routed chat turns: embedding models, speech-to-text,
// text-to-speech, audio realtime, and rerankers. Discovery drops such models
// outright instead of registering chat arms that would fail at inference
// time.
//
// Matching is by substring rather than prefix, because a user namespace
// (e.g. "someorg/whisper-finetune") would slip past a prefix-only check.
var nonChatModelPatterns = []string{
	"whisper",
	"moonshine",
	"kokoros",
	"vibevoice",
	"-asr",
	"-tts",
	"-audio",
	"-embedding",
	"embedding-",
	"embeddinggemma",
	"-reranker",
	"lfm2",
}

// isNonChatModel reports whether the lower-cased model name contains any of
// the nonChatModelPatterns substrings.
func isNonChatModel(model string) bool {
	name := strings.ToLower(model)
	for i := range nonChatModelPatterns {
		if strings.Index(name, nonChatModelPatterns[i]) >= 0 {
			return true
		}
	}
	return false
}
|
|
|
|
// DiscoverLlamaCPP enumerates models served by a llama.cpp server.
|
|
//
|
|
// llama-server exposes /v1/models (OpenAI-compatible) — single-model
|
|
// deployments return one entry with the actual model ID; multi-model proxies
|
|
// (llama-swap, custom routers in front of llama-server) return many. We
|
|
// enumerate that list and share the context window read from /props across
|
|
// the entries, since llama-server is one process per context.
|
|
func DiscoverLlamaCPP(ctx context.Context, baseURL string) ([]DiscoveredModel, error) {
|
|
if baseURL == "" {
|
|
baseURL = "http://localhost:8080"
|
|
}
|
|
ctx, cancel := context.WithTimeout(ctx, discoveryTimeout)
|
|
defer cancel()
|
|
|
|
ids, err := fetchLlamaCppModelIDs(ctx, baseURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// /props is best-effort — if it fails or omits n_ctx, fall back to the
|
|
// documented default rather than aborting discovery.
|
|
ctxSize := fetchLlamaCppContextSize(ctx, baseURL)
|
|
if ctxSize == 0 {
|
|
ctxSize = defaultLlamaCppContextSize
|
|
}
|
|
|
|
models := make([]DiscoveredModel, 0, len(ids))
|
|
for _, id := range ids {
|
|
models = append(models, DiscoveredModel{
|
|
ID: id,
|
|
Name: id,
|
|
Provider: "llamacpp",
|
|
ContextSize: ctxSize,
|
|
SupportsTools: true, // assume true for modern llama.cpp
|
|
})
|
|
}
|
|
return models, nil
|
|
}
|
|
|
|
// fetchLlamaCppModelIDs GETs baseURL+"/v1/models" and returns the non-empty
// "id" values from the response's "data" array, in response order.
//
// It returns an error when the request cannot be built or sent, when the
// status is not 200, when the body is not valid JSON, when "data" is empty,
// or when every entry has an empty ID. On success the slice is never empty.
func fetchLlamaCppModelIDs(ctx context.Context, baseURL string) ([]string, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/v1/models", nil)
	if err != nil {
		return nil, err
	}
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("llama.cpp not reachable at %s: %w", baseURL, err)
	}
	defer func() { _ = response.Body.Close() }()
	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("llama.cpp returned status %d on /v1/models", response.StatusCode)
	}

	var listing struct {
		Data []struct {
			ID string `json:"id"`
		} `json:"data"`
	}
	if err := json.NewDecoder(response.Body).Decode(&listing); err != nil {
		return nil, fmt.Errorf("llama.cpp /v1/models parse: %w", err)
	}
	if len(listing.Data) == 0 {
		return nil, fmt.Errorf("llama.cpp /v1/models returned no entries")
	}

	// Entries without an ID are skipped, not treated as an error by
	// themselves.
	ids := make([]string, 0, len(listing.Data))
	for i := range listing.Data {
		if id := listing.Data[i].ID; id != "" {
			ids = append(ids, id)
		}
	}
	if len(ids) == 0 {
		return nil, fmt.Errorf("llama.cpp /v1/models returned only empty IDs")
	}
	return ids, nil
}
|
|
|
|
// fetchLlamaCppContextSize GETs baseURL+"/props" and returns
// default_generation_settings.n_ctx from the JSON response. Any failure
// (request construction, transport error, non-200 status, undecodable body)
// is reported as 0, as is a response that omits n_ctx; the caller is
// expected to substitute a default.
func fetchLlamaCppContextSize(ctx context.Context, baseURL string) int {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/props", nil)
	if err != nil {
		return 0
	}
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return 0
	}
	defer func() { _ = response.Body.Close() }()
	if response.StatusCode != http.StatusOK {
		return 0
	}

	var props struct {
		DefaultGenerationSettings struct {
			NCtx int `json:"n_ctx"`
		} `json:"default_generation_settings"`
	}
	if json.NewDecoder(response.Body).Decode(&props) != nil {
		return 0
	}
	return props.DefaultGenerationSettings.NCtx
}
|
|
|
|
// DiscoverLocalModels polls all known local providers.
|
|
func DiscoverLocalModels(ctx context.Context, logger *slog.Logger, ollamaURL, llamacppURL string, ollamaProbeCache map[string]OllamaProbeResult) []DiscoveredModel {
|
|
var all []DiscoveredModel
|
|
|
|
if models, err := DiscoverOllama(ctx, ollamaURL, ollamaProbeCache); err != nil {
|
|
logger.Debug("ollama discovery skipped", "error", err)
|
|
} else {
|
|
all = append(all, models...)
|
|
}
|
|
|
|
if models, err := DiscoverLlamaCPP(ctx, llamacppURL); err != nil {
|
|
logger.Debug("llama.cpp discovery skipped", "error", err)
|
|
} else {
|
|
all = append(all, models...)
|
|
}
|
|
|
|
return all
|
|
}
|
|
|
|
// StartDiscoveryLoop periodically polls for local models and reconciles with the router.
|
|
// onReconcile is called when the forced arm identity changes (may be nil).
|
|
func StartDiscoveryLoop(ctx context.Context, r *Router, logger *slog.Logger,
|
|
ollamaURL, llamacppURL string,
|
|
providerFactory func(name, model string) SecureProvider,
|
|
interval time.Duration,
|
|
onReconcile func(ArmID),
|
|
) {
|
|
go func() {
|
|
ollamaProbeCache := make(map[string]OllamaProbeResult)
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
models := DiscoverLocalModels(ctx, logger, ollamaURL, llamacppURL, ollamaProbeCache)
|
|
reconcileArms(r, models, providerFactory, logger, onReconcile)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// reconcileArms adds newly discovered models, removes disappeared ones, and
|
|
// reconciles the forced arm when discovery reveals its real model name.
|
|
// onReconcile is called (if non-nil) when the forced arm identity changes.
|
|
func reconcileArms(r *Router, discovered []DiscoveredModel, providerFactory func(name, model string) SecureProvider, logger *slog.Logger, onReconcile func(ArmID)) {
|
|
discoveredSet := make(map[ArmID]bool, len(discovered))
|
|
for _, m := range discovered {
|
|
discoveredSet[NewArmID(m.Provider, m.ID)] = true
|
|
}
|
|
|
|
// Reconcile forced arm if it uses a placeholder "default" model name
|
|
// and discovery found the real model name for the same provider.
|
|
forcedID := r.ForcedArm()
|
|
if forcedID != "" && forcedID.Model() == "default" {
|
|
provName := forcedID.Provider()
|
|
var candidates []DiscoveredModel
|
|
for _, m := range discovered {
|
|
if m.Provider == provName {
|
|
candidates = append(candidates, m)
|
|
}
|
|
}
|
|
if len(candidates) >= 1 {
|
|
chosen := candidates[0]
|
|
newID := NewArmID(provName, chosen.ID)
|
|
if len(candidates) > 1 {
|
|
logger.Warn("multiple models discovered for forced provider, using first",
|
|
"provider", provName, "chosen", chosen.ID, "total", len(candidates))
|
|
}
|
|
logger.Debug("reconciling forced arm identity", "old", forcedID, "new", newID)
|
|
r.reconcileForcedArm(forcedID, newID, chosen.ID)
|
|
if onReconcile != nil {
|
|
onReconcile(newID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Register new models
|
|
RegisterDiscoveredModels(r, discovered, providerFactory)
|
|
|
|
// Remove arms whose models have disappeared (only local arms).
|
|
// Never remove the forced arm — the user explicitly chose it.
|
|
currentForced := r.ForcedArm()
|
|
for _, arm := range r.Arms() {
|
|
if !arm.IsLocal {
|
|
continue
|
|
}
|
|
if arm.ID == currentForced {
|
|
continue
|
|
}
|
|
if !discoveredSet[arm.ID] {
|
|
logger.Debug("removing disappeared local arm", "id", arm.ID)
|
|
r.RemoveArm(arm.ID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// RegisterDiscoveredModels registers discovered local models as arms in the router.
|
|
func RegisterDiscoveredModels(r *Router, models []DiscoveredModel, providerFactory func(name, model string) SecureProvider) {
|
|
for _, m := range models {
|
|
// Skip non-chat models (embeddings, ASR, TTS, audio, rerankers).
|
|
// These would otherwise register as broken chat arms and fail at
|
|
// inference time when the router selected them.
|
|
if isNonChatModel(m.ID) {
|
|
continue
|
|
}
|
|
|
|
armID := NewArmID(m.Provider, m.ID)
|
|
|
|
// Skip if already registered
|
|
exists := false
|
|
for _, arm := range r.Arms() {
|
|
if arm.ID == armID {
|
|
exists = true
|
|
break
|
|
}
|
|
}
|
|
if exists {
|
|
continue
|
|
}
|
|
|
|
prov := providerFactory(m.Provider, m.ID)
|
|
if prov == nil {
|
|
continue
|
|
}
|
|
|
|
// Family-keyed defaults (Strengths, MaxComplexity, CostWeight,
|
|
// Disabled) are applied inside Router.RegisterArm — single source
|
|
// of truth so cloud-arm and local-arm registration paths agree.
|
|
// User-supplied [[arms]] config in TOML overrides defaults later
|
|
// via ApplyArmOverrides.
|
|
r.RegisterArm(&Arm{
|
|
ID: armID,
|
|
Provider: prov,
|
|
ModelName: m.ID,
|
|
IsLocal: true,
|
|
Capabilities: provider.Capabilities{
|
|
// Conservative default: don't assume tool support.
|
|
// Many small local models (phi, etc.) don't support
|
|
// function calling and will produce confused output if selected
|
|
// for tool-requiring tasks. Larger known models (mistral, llama3,
|
|
// qwen2.5-coder, tiny3.5) support tools. Vision is set from the
|
|
// /api/show probe (capabilities/families/name fallback).
|
|
ToolUse: m.SupportsTools,
|
|
Vision: m.SupportsVision,
|
|
ContextWindow: m.ContextSize,
|
|
},
|
|
})
|
|
}
|
|
}
|