feat: Ollama/gemma4 compat — /init flow, stream filter, safety fixes

provider/openai:
- Fix doubled tool call args (argsComplete flag): Ollama sends complete
  args in the first streaming chunk then repeats them as delta, causing
  doubled JSON and 400 errors in elfs
- Handle fs: prefix (gemma4 emits fs:grep instead of the expected sanitized fs_grep)
- Add Reasoning field support for Ollama thinking output

cmd/gnoma:
- Early TTY detection so logger is created with correct destination
  before any component gets a reference to it (fixes slog WARN bleed
  into TUI textarea)

permission:
- Exempt spawn_elfs and agent tools from safety scanner: elf prompt
  text may legitimately mention .env/.ssh/credentials patterns and
  should not be blocked

tui/app:
- /init retry chain: no-tool-calls → spawn_elfs nudge → write nudge
  (ask for plain text output) → TUI fallback write from streamBuf
- looksLikeAgentsMD + extractMarkdownDoc: validate and clean fallback
  content before writing (reject refusals, strip narrative preambles)
- Collapse thinking output to 3 lines; ctrl+o to expand (live stream
  and committed messages)
- Stream-level filter for model pseudo-tool-call blocks: suppresses
  <<tool_code>>...</tool_code>> and <<function_call>>...<tool_call|>
  from entering streamBuf across chunk boundaries
- sanitizeAssistantText regex covers both block formats
- Reset streamFilterClose at every turn start
This commit is contained in:
2026-04-05 19:24:51 +02:00
parent 14b88cadcc
commit cb2d63d06f
51 changed files with 2855 additions and 353 deletions

View File

@@ -0,0 +1,57 @@
package provider
import (
"context"
"sync"
"somegit.dev/Owlibou/gnoma/internal/stream"
)
// ConcurrentProvider wraps a Provider with a shared semaphore that limits the
// number of in-flight Stream calls. All engines sharing the same
// ConcurrentProvider instance share the same concurrency budget.
type ConcurrentProvider struct {
	Provider               // inner provider; everything except Stream passes through unchanged
	sem      chan struct{} // counting semaphore: one buffered token per available slot (pre-filled by WithConcurrency)
}
// WithConcurrency wraps p so that at most max Stream calls can be in-flight
// simultaneously. If max <= 0, p is returned unwrapped.
func WithConcurrency(p Provider, max int) Provider {
	if max <= 0 {
		return p
	}
	// Pre-fill the semaphore: each buffered token represents one free slot,
	// so Stream acquires by receiving and releases by sending.
	slots := make(chan struct{}, max)
	for i := 0; i < max; i++ {
		slots <- struct{}{}
	}
	return &ConcurrentProvider{Provider: p, sem: slots}
}
// Stream acquires a concurrency slot, calls the inner provider, and returns a
// stream that releases the slot when Close is called.
func (cp *ConcurrentProvider) Stream(ctx context.Context, req Request) (stream.Stream, error) {
	// Block until a slot token is available or the context is cancelled.
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-cp.sem:
	}

	inner, err := cp.Provider.Stream(ctx, req)
	if err != nil {
		// No stream was produced, so hand the token back immediately.
		cp.sem <- struct{}{}
		return nil, err
	}

	// Otherwise the token is returned only when the caller closes the stream.
	giveBack := func() { cp.sem <- struct{}{} }
	return &semStream{Stream: inner, release: giveBack}, nil
}
// semStream wraps a stream.Stream to release a semaphore slot on Close.
type semStream struct {
	stream.Stream         // underlying provider stream; all event reads delegate to it
	release func()        // returns the concurrency token to the owning ConcurrentProvider
	once    sync.Once     // guarantees the token is released at most once, even on repeated Close
}
// Close releases the concurrency slot exactly once (safe against double
// Close via sync.Once) and then closes the underlying stream, returning
// the inner stream's close error.
func (s *semStream) Close() error {
	s.once.Do(s.release)
	return s.Stream.Close()
}

View File

@@ -15,13 +15,20 @@ const defaultModel = "gpt-4o"
// Provider implements provider.Provider for the OpenAI API.
type Provider struct {
client *oai.Client
name string
model string
client *oai.Client
name string
model string
streamOpts []option.RequestOption // injected per-request (e.g. think:false for Ollama)
}
// New creates an OpenAI provider from config.
func New(cfg provider.ProviderConfig) (provider.Provider, error) {
return NewWithStreamOptions(cfg, nil)
}
// NewWithStreamOptions creates an OpenAI provider with extra per-request stream options.
// Use this for Ollama/llama.cpp adapters that need non-standard body fields.
func NewWithStreamOptions(cfg provider.ProviderConfig, streamOpts []option.RequestOption) (provider.Provider, error) {
if cfg.APIKey == "" {
return nil, fmt.Errorf("openai: api key required")
}
@@ -41,9 +48,10 @@ func New(cfg provider.ProviderConfig) (provider.Provider, error) {
}
return &Provider{
client: &client,
name: "openai",
model: model,
client: &client,
name: "openai",
model: model,
streamOpts: streamOpts,
}, nil
}
@@ -57,7 +65,7 @@ func (p *Provider) Stream(ctx context.Context, req provider.Request) (stream.Str
params := translateRequest(req)
params.Model = model
raw := p.client.Chat.Completions.NewStreaming(ctx, params)
raw := p.client.Chat.Completions.NewStreaming(ctx, params, p.streamOpts...)
return newOpenAIStream(raw), nil
}

View File

@@ -25,9 +25,10 @@ type openaiStream struct {
}
type toolCallState struct {
id string
name string
args string
id string
name string
args string
argsComplete bool // true when args arrived in the initial chunk; skip subsequent deltas
}
func newOpenAIStream(raw *ssestream.Stream[oai.ChatCompletionChunk]) *openaiStream {
@@ -74,9 +75,10 @@ func (s *openaiStream) Next() bool {
if !ok {
// New tool call — capture initial arguments too
existing = &toolCallState{
id: tc.ID,
name: tc.Function.Name,
args: tc.Function.Arguments,
id: tc.ID,
name: tc.Function.Name,
args: tc.Function.Arguments,
argsComplete: tc.Function.Arguments != "",
}
s.toolCalls[tc.Index] = existing
s.hadToolCalls = true
@@ -91,8 +93,11 @@ func (s *openaiStream) Next() bool {
}
}
// Accumulate arguments (subsequent chunks)
if tc.Function.Arguments != "" && ok {
// Accumulate arguments (subsequent chunks).
// Skip if args were already provided in the initial chunk — some providers
// (e.g. Ollama) send complete args in the name chunk and then repeat them
// as a delta, which would cause doubled JSON and unmarshal failures.
if tc.Function.Arguments != "" && ok && !existing.argsComplete {
existing.args += tc.Function.Arguments
s.cur = stream.Event{
Type: stream.EventToolCallDelta,
@@ -113,6 +118,29 @@ func (s *openaiStream) Next() bool {
}
return true
}
// Ollama thinking content — non-standard "thinking" or "reasoning" field on the delta.
// Ollama uses "reasoning"; some other servers use "thinking".
// The openai-go struct drops unknown fields, so we read the raw JSON directly.
if raw := delta.RawJSON(); raw != "" {
var extra struct {
Thinking string `json:"thinking"`
Reasoning string `json:"reasoning"`
}
if json.Unmarshal([]byte(raw), &extra) == nil {
text := extra.Thinking
if text == "" {
text = extra.Reasoning
}
if text != "" {
s.cur = stream.Event{
Type: stream.EventThinkingDelta,
Text: text,
}
return true
}
}
}
}
// Stream ended — flush tool call Done events, then emit stop

View File

@@ -20,6 +20,10 @@ func unsanitizeToolName(name string) string {
if strings.HasPrefix(name, "fs_") {
return "fs." + name[3:]
}
// Some models (e.g. gemma4 via Ollama) use "fs:grep" instead of "fs_grep"
if strings.HasPrefix(name, "fs:") {
return "fs." + name[3:]
}
return name
}
@@ -127,6 +131,12 @@ func translateRequest(req provider.Request) oai.ChatCompletionNewParams {
IncludeUsage: param.NewOpt(true),
}
if req.ToolChoice != "" && len(params.Tools) > 0 {
params.ToolChoice = oai.ChatCompletionToolChoiceOptionUnionParam{
OfAuto: param.NewOpt(string(req.ToolChoice)),
}
}
return params
}

View File

@@ -8,6 +8,15 @@ import (
"somegit.dev/Owlibou/gnoma/internal/stream"
)
// ToolChoiceMode controls how the model selects tools.
type ToolChoiceMode string

const (
	// ToolChoiceAuto lets the model decide whether to call a tool.
	ToolChoiceAuto ToolChoiceMode = "auto"
	// ToolChoiceRequired instructs the provider that the model must call a tool.
	ToolChoiceRequired ToolChoiceMode = "required"
	// ToolChoiceNone instructs the provider that the model must not call tools.
	ToolChoiceNone ToolChoiceMode = "none"
)
// Request encapsulates everything needed for a single LLM API call.
type Request struct {
Model string
@@ -21,6 +30,7 @@ type Request struct {
StopSequences []string
Thinking *ThinkingConfig
ResponseFormat *ResponseFormat
ToolChoice ToolChoiceMode // "" = provider default (auto)
}
// ToolDefinition is the provider-agnostic tool schema.

View File

@@ -1,5 +1,7 @@
package provider
import "math"
// RateLimits describes the rate limits for a provider+model pair.
// Zero values mean "no limit" or "unknown".
type RateLimits struct {
@@ -13,6 +15,31 @@ type RateLimits struct {
SpendCap float64 // monthly spend cap in provider currency
}
// MaxConcurrent returns the maximum number of concurrent in-flight requests
// that this rate limit allows. Returns 0 when there is no meaningful concurrency
// constraint (provider has high or unknown limits).
func (rl RateLimits) MaxConcurrent() int {
	switch {
	case rl.RPS > 0:
		// One slot per request-per-second, rounded up, never below one.
		return max(int(math.Ceil(rl.RPS)), 1)
	case rl.RPM > 0:
		// Allow 1 concurrent slot per 30 RPM (conservative heuristic),
		// clamped to the range [1, 16].
		return min(max(rl.RPM/30, 1), 16)
	default:
		return 0
	}
}
// ProviderDefaults holds default rate limits keyed by model glob.
// The special key "*" matches any model not explicitly listed.
type ProviderDefaults struct {