fix(slm): enforce JSON output + strip thinking-block prefixes

Two structural fixes for the SLM classifier's 100% failure rate:

(1) Pass ResponseFormat=json_object + Temperature=0 + TopP=1 +
MaxTokens=128 in the classifier Request. The provider type already
supports these but callSLM was leaving them unset, which meant ollama
(and any other backend) ran with default sampling and free-form text
output. format=json mode in particular makes ollama emit only valid
JSON at decoding time — eliminates the majority of parse failures.

(2) Harden extractJSON to strip common thinking-block tags before
hunting for the brace. Seen in the wild: <think>…</think> (Qwen3
distillations) and <Thought Process>…</Thought Process> (tiny3.5).
Defensive list also covers <reasoning>, <thoughts>. Unterminated
thinking blocks fall back to brace-search so we still have a shot.
Table-driven tests cover all variants plus the no-tag and
fenced-json paths to confirm no regression.

Even with format=json on a capable provider, the extractor is the
safety net for backends that don't enforce format strictly — same
defence-in-depth shape as the existing fence stripping.

Doesn't fix the deeper architecture question (encoder + bandit
preferred over decoder-SLM as classifier — see plan doc landing in
the same PR); fixes the immediate bug.
This commit is contained in:
2026-05-25 01:19:51 +02:00
parent f3c70bd802
commit c0c2e4bff5
2 changed files with 96 additions and 1 deletions
+54 -1
View File
@@ -102,9 +102,25 @@ func (c *Classifier) Classify(ctx context.Context, prompt string, history []mess
}
func (c *Classifier) callSLM(ctx context.Context, prompt string) (*classifyResponse, error) {
// Constrain the model toward valid, deterministic JSON output. Without
// these settings small models routinely ignore the JSON-only system
// prompt, emit reasoning blocks (<think>, <Thought Process>) or just
// answer the user's prompt in prose. ResponseFormat=json_object asks
// the provider to enforce JSON at decoding time where supported
// (ollama 'format=json', llama.cpp grammar, OpenAI json_object). Even
// when the provider can't enforce, the explicit signal nudges the
// adapter to set the right backend flag.
temp := 0.0
topP := 1.0
req := provider.Request{
Model: c.model,
SystemPrompt: classifySystemPrompt,
Temperature: &temp,
TopP: &topP,
MaxTokens: 128, // classification output is ~50 tokens; cap to prevent runaway reasoning
ResponseFormat: &provider.ResponseFormat{
Type: provider.ResponseJSON,
},
Messages: []message.Message{
{
Role: message.RoleUser,
@@ -138,10 +154,22 @@ func (c *Classifier) callSLM(ctx context.Context, prompt string) (*classifyRespo
return &resp, nil
}
// extractJSON pulls the first {...} substring from s, stripping markdown fences if present.
// extractJSON pulls the first {...} substring from s, stripping markdown
// fences and known thinking-block tags. Small models routinely violate
// the JSON-only system prompt by emitting reasoning tokens first, so
// the extractor must tolerate prefixes the model wasn't asked to emit.
func extractJSON(s string) string {
s = strings.TrimSpace(s)
// Strip known thinking-block tags. Order matters: longer/more-
// specific names first so a partial match doesn't shadow a real
// one. Seen in the wild on Qwen3 (<think>) and tiny3.5
// (<Thought Process>); the others are defensive against similar
// fine-tunes.
for _, tag := range []string{"Thought Process", "thinking", "reasoning", "thoughts", "think"} {
s = stripTagBlock(s, tag)
}
// Strip ```json ... ``` fences.
if strings.HasPrefix(s, "```") {
end := strings.LastIndex(s, "```")
@@ -171,3 +199,28 @@ func extractJSON(s string) string {
}
return s[start:]
}
// stripTagBlock removes <tag>...</tag> blocks (case-insensitive on the
// tag name) from the start of s. Returns the original string if the tag
// is not at the start. Idempotent; safe to call repeatedly.
func stripTagBlock(s, tag string) string {
trimmed := strings.TrimSpace(s)
open := "<" + tag
lower := strings.ToLower(trimmed)
if !strings.HasPrefix(lower, strings.ToLower(open)) {
return s
}
// Find the matching closing tag, case-insensitive.
close := "</" + tag + ">"
closeIdx := strings.Index(strings.ToLower(trimmed), strings.ToLower(close))
if closeIdx < 0 {
// Unterminated thinking block — strip up to the first '{'
// so we still have a shot at extracting JSON that follows.
braceIdx := strings.IndexByte(trimmed, '{')
if braceIdx > 0 {
return strings.TrimSpace(trimmed[braceIdx:])
}
return s
}
return strings.TrimSpace(trimmed[closeIdx+len(close):])
}
+42
View File
@@ -215,3 +215,45 @@ func TestClassifier_ContextPassedToHistory(t *testing.T) {
t.Errorf("Type = %s, want Explain", task.Type)
}
}
func TestExtractJSON_StripsThinkingTags(t *testing.T) {
cases := []struct {
name string
in string
want string
}{
{
name: "qwen-think-block",
in: `<think>Let me decide</think>{"task_type":"Debug","complexity":0.5,"requires_tools":true}`,
want: `{"task_type":"Debug","complexity":0.5,"requires_tools":true}`,
},
{
name: "tiny3.5-thought-process",
in: "<Thought Process>\nUser wants debugging help.\n</Thought Process>\n{\"task_type\":\"Debug\",\"complexity\":0.4,\"requires_tools\":true}",
want: `{"task_type":"Debug","complexity":0.4,"requires_tools":true}`,
},
{
name: "unterminated-think-falls-back-to-brace",
in: `<think>incomplete reasoning {"task_type":"Explain","complexity":0.2,"requires_tools":false}`,
want: `{"task_type":"Explain","complexity":0.2,"requires_tools":false}`,
},
{
name: "no-tags-still-works",
in: `{"task_type":"Generation","complexity":0.6,"requires_tools":false}`,
want: `{"task_type":"Generation","complexity":0.6,"requires_tools":false}`,
},
{
name: "fenced-json-still-works",
in: "```json\n{\"task_type\":\"Refactor\",\"complexity\":0.5,\"requires_tools\":true}\n```",
want: `{"task_type":"Refactor","complexity":0.5,"requires_tools":true}`,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := extractJSON(tc.in)
if got != tc.want {
t.Errorf("extractJSON(...)\n got: %q\n want: %q", got, tc.want)
}
})
}
}