feat: Ollama/gemma4 compat — /init flow, stream filter, safety fixes

provider/openai:
- Fix doubled tool call args (argsComplete flag): Ollama sends complete
  args in the first streaming chunk then repeats them as delta, causing
  doubled JSON and 400 errors in elfs
- Handle fs: prefix (gemma4 emits fs:grep instead of the expected sanitized fs_grep)
- Add Reasoning field support for Ollama thinking output

cmd/gnoma:
- Early TTY detection so logger is created with correct destination
  before any component gets a reference to it (fixes slog WARN bleed
  into TUI textarea)

permission:
- Exempt spawn_elfs and agent tools from safety scanner: elf prompt
  text may legitimately mention .env/.ssh/credentials patterns and
  should not be blocked

tui/app:
- /init retry chain: no-tool-calls → spawn_elfs nudge → write nudge
  (ask for plain text output) → TUI fallback write from streamBuf
- looksLikeAgentsMD + extractMarkdownDoc: validate and clean fallback
  content before writing (reject refusals, strip narrative preambles)
- Collapse thinking output to 3 lines; ctrl+o to expand (live stream
  and committed messages)
- Stream-level filter for model pseudo-tool-call blocks: suppresses
  <<tool_code>>...</tool_code>> and <<function_call>>...<tool_call|>
  from entering streamBuf across chunk boundaries
- sanitizeAssistantText regex covers both block formats
- Reset streamFilterClose at every turn start
This commit is contained in:
2026-04-05 19:24:51 +02:00
parent 14b88cadcc
commit cb2d63d06f
51 changed files with 2855 additions and 353 deletions

View File

@@ -0,0 +1,57 @@
package provider
import (
"context"
"sync"
"somegit.dev/Owlibou/gnoma/internal/stream"
)
// ConcurrentProvider wraps a Provider with a shared semaphore that limits the
// number of in-flight Stream calls. All engines sharing the same
// ConcurrentProvider instance share the same concurrency budget.
type ConcurrentProvider struct {
	Provider               // inner provider; everything except Stream passes through unchanged
	sem      chan struct{} // counting semaphore: one buffered token per available slot (pre-filled by WithConcurrency)
}
// WithConcurrency wraps p so that at most max Stream calls can be in-flight
// simultaneously. If max <= 0, p is returned unwrapped.
func WithConcurrency(p Provider, max int) Provider {
	if max <= 0 {
		return p
	}
	// Pre-fill the semaphore: each buffered token represents one free slot,
	// so Stream acquires by receiving and releases by sending.
	slots := make(chan struct{}, max)
	for i := 0; i < max; i++ {
		slots <- struct{}{}
	}
	return &ConcurrentProvider{Provider: p, sem: slots}
}
// Stream acquires a concurrency slot, calls the inner provider, and returns a
// stream that releases the slot when Close is called.
func (cp *ConcurrentProvider) Stream(ctx context.Context, req Request) (stream.Stream, error) {
	// Block until a slot token is available or the context is cancelled.
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-cp.sem:
	}

	inner, err := cp.Provider.Stream(ctx, req)
	if err != nil {
		// No stream was produced, so hand the token back immediately.
		cp.sem <- struct{}{}
		return nil, err
	}

	// Otherwise the token is returned only when the caller closes the stream.
	giveBack := func() { cp.sem <- struct{}{} }
	return &semStream{Stream: inner, release: giveBack}, nil
}
// semStream wraps a stream.Stream to release a semaphore slot on Close.
type semStream struct {
	stream.Stream         // underlying provider stream; all event reads delegate to it
	release func()        // returns the concurrency token to the owning ConcurrentProvider
	once    sync.Once     // guarantees the token is released at most once, even on repeated Close
}
// Close releases the concurrency slot exactly once (safe against double
// Close via sync.Once) and then closes the underlying stream, returning
// the inner stream's close error.
func (s *semStream) Close() error {
	s.once.Do(s.release)
	return s.Stream.Close()
}

View File

@@ -15,13 +15,20 @@ const defaultModel = "gpt-4o"
// Provider implements provider.Provider for the OpenAI API.
type Provider struct {
client *oai.Client
name string
model string
client *oai.Client
name string
model string
streamOpts []option.RequestOption // injected per-request (e.g. think:false for Ollama)
}
// New creates an OpenAI provider from config.
func New(cfg provider.ProviderConfig) (provider.Provider, error) {
return NewWithStreamOptions(cfg, nil)
}
// NewWithStreamOptions creates an OpenAI provider with extra per-request stream options.
// Use this for Ollama/llama.cpp adapters that need non-standard body fields.
func NewWithStreamOptions(cfg provider.ProviderConfig, streamOpts []option.RequestOption) (provider.Provider, error) {
if cfg.APIKey == "" {
return nil, fmt.Errorf("openai: api key required")
}
@@ -41,9 +48,10 @@ func New(cfg provider.ProviderConfig) (provider.Provider, error) {
}
return &Provider{
client: &client,
name: "openai",
model: model,
client: &client,
name: "openai",
model: model,
streamOpts: streamOpts,
}, nil
}
@@ -57,7 +65,7 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request) (stream.Str
params := translateRequest(req)
params.Model = model
raw := p.client.Chat.Completions.NewStreaming(ctx, params)
raw := p.client.Chat.Completions.NewStreaming(ctx, params, p.streamOpts...)
return newOpenAIStream(raw), nil
}

View File

@@ -25,9 +25,10 @@ type openaiStream struct {
}
type toolCallState struct {
id string
name string
args string
id string
name string
args string
argsComplete bool // true when args arrived in the initial chunk; skip subsequent deltas
}
func newOpenAIStream(raw *ssestream.Stream[oai.ChatCompletionChunk]) *openaiStream {
@@ -74,9 +75,10 @@ func (s *openaiStream) Next() bool {
if !ok {
// New tool call — capture initial arguments too
existing = &toolCallState{
id: tc.ID,
name: tc.Function.Name,
args: tc.Function.Arguments,
id: tc.ID,
name: tc.Function.Name,
args: tc.Function.Arguments,
argsComplete: tc.Function.Arguments != "",
}
s.toolCalls[tc.Index] = existing
s.hadToolCalls = true
@@ -91,8 +93,11 @@ func (s *openaiStream) Next() bool {
}
}
// Accumulate arguments (subsequent chunks)
if tc.Function.Arguments != "" && ok {
// Accumulate arguments (subsequent chunks).
// Skip if args were already provided in the initial chunk — some providers
// (e.g. Ollama) send complete args in the name chunk and then repeat them
// as a delta, which would cause doubled JSON and unmarshal failures.
if tc.Function.Arguments != "" && ok && !existing.argsComplete {
existing.args += tc.Function.Arguments
s.cur = stream.Event{
Type: stream.EventToolCallDelta,
@@ -113,6 +118,29 @@ func (s *openaiStream) Next() bool {
}
return true
}
// Ollama thinking content — non-standard "thinking" or "reasoning" field on the delta.
// Ollama uses "reasoning"; some other servers use "thinking".
// The openai-go struct drops unknown fields, so we read the raw JSON directly.
if raw := delta.RawJSON(); raw != "" {
var extra struct {
Thinking string `json:"thinking"`
Reasoning string `json:"reasoning"`
}
if json.Unmarshal([]byte(raw), &extra) == nil {
text := extra.Thinking
if text == "" {
text = extra.Reasoning
}
if text != "" {
s.cur = stream.Event{
Type: stream.EventThinkingDelta,
Text: text,
}
return true
}
}
}
}
// Stream ended — flush tool call Done events, then emit stop

View File

@@ -20,6 +20,10 @@ func unsanitizeToolName(name string) string {
if strings.HasPrefix(name, "fs_") {
return "fs." + name[3:]
}
// Some models (e.g. gemma4 via Ollama) use "fs:grep" instead of "fs_grep"
if strings.HasPrefix(name, "fs:") {
return "fs." + name[3:]
}
return name
}
@@ -127,6 +131,12 @@ func translateRequest(req provider.Request) oai.ChatCompletionNewParams {
IncludeUsage: param.NewOpt(true),
}
if req.ToolChoice != "" && len(params.Tools) > 0 {
params.ToolChoice = oai.ChatCompletionToolChoiceOptionUnionParam{
OfAuto: param.NewOpt(string(req.ToolChoice)),
}
}
return params
}

View File

@@ -8,6 +8,15 @@ import (
"somegit.dev/Owlibou/gnoma/internal/stream"
)
// ToolChoiceMode controls how the model selects tools.
type ToolChoiceMode string

const (
	// ToolChoiceAuto lets the model decide whether to call a tool.
	ToolChoiceAuto ToolChoiceMode = "auto"
	// ToolChoiceRequired instructs the provider that the model must call a tool.
	ToolChoiceRequired ToolChoiceMode = "required"
	// ToolChoiceNone instructs the provider that the model must not call tools.
	ToolChoiceNone ToolChoiceMode = "none"
)
// Request encapsulates everything needed for a single LLM API call.
type Request struct {
Model string
@@ -21,6 +30,7 @@ type Request struct {
StopSequences []string
Thinking *ThinkingConfig
ResponseFormat *ResponseFormat
ToolChoice ToolChoiceMode // "" = provider default (auto)
}
// ToolDefinition is the provider-agnostic tool schema.

View File

@@ -1,5 +1,7 @@
package provider
import "math"
// RateLimits describes the rate limits for a provider+model pair.
// Zero values mean "no limit" or "unknown".
type RateLimits struct {
@@ -13,6 +15,31 @@ type RateLimits struct {
SpendCap float64 // monthly spend cap in provider currency
}
// MaxConcurrent returns the maximum number of concurrent in-flight requests
// that this rate limit allows. Returns 0 when there is no meaningful concurrency
// constraint (provider has high or unknown limits).
func (rl RateLimits) MaxConcurrent() int {
	switch {
	case rl.RPS > 0:
		// One slot per request-per-second, rounded up, never below one.
		return max(int(math.Ceil(rl.RPS)), 1)
	case rl.RPM > 0:
		// Allow 1 concurrent slot per 30 RPM (conservative heuristic),
		// clamped to the range [1, 16].
		return min(max(rl.RPM/30, 1), 16)
	default:
		return 0
	}
}
// ProviderDefaults holds default rate limits keyed by model glob.
// The special key "*" matches any model not explicitly listed.
type ProviderDefaults struct {