fix(provider): preserve Z.ai GLM thinking traces

Z.ai's current GLM-5.2 coding-agent docs confirm that the actual thinking efforts are high and max, with ultracode mapped to max. Z.ai's Chat Completion API documents thinking.type, thinking.clear_thinking, and reasoning_content rather than a raw reasoning_effort scalar, so direct Z.ai requests now send the documented thinking object and preserve reasoning_content for coding-agent continuity.

Also accepts ultracode as a max-effort alias across settings, routing, and Responses mapping.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hunter B
2026-06-13 08:10:37 -07:00
parent 6ac93e81c7
commit f6867e65bd
7 changed files with 78 additions and 11 deletions
+46 -5
View File
@@ -1258,7 +1258,8 @@ pub(super) fn apply_reasoning_effort(
| ApiProvider::Sglang
| ApiProvider::Volcengine
| ApiProvider::Together
| ApiProvider::Atlascloud => {
| ApiProvider::Atlascloud
| ApiProvider::Zai => {
body["thinking"] = json!({ "type": "disabled" });
}
ApiProvider::OpenaiCodex => {
@@ -1304,7 +1305,7 @@ pub(super) fn apply_reasoning_effort(
ApiProvider::Minimax => {
body["thinking"] = json!({ "type": "disabled" });
}
ApiProvider::Zai | ApiProvider::Stepfun => {}
ApiProvider::Stepfun => {}
},
"low" | "minimal" | "medium" | "mid" | "high" | "" => match provider {
// DeepSeek compatibility: low/medium both map to high
@@ -1381,9 +1382,15 @@ pub(super) fn apply_reasoning_effort(
ApiProvider::Minimax => {
body["thinking"] = json!({ "type": "adaptive" });
}
ApiProvider::Zai | ApiProvider::Stepfun => {}
ApiProvider::Zai => {
body["thinking"] = json!({
"type": "enabled",
"clear_thinking": false,
});
}
ApiProvider::Stepfun => {}
},
"xhigh" | "max" | "highest" => match provider {
"xhigh" | "max" | "highest" | "ultracode" => match provider {
ApiProvider::Deepseek
| ApiProvider::DeepseekCN
| ApiProvider::Siliconflow
@@ -1438,7 +1445,13 @@ pub(super) fn apply_reasoning_effort(
ApiProvider::Minimax => {
body["thinking"] = json!({ "type": "adaptive" });
}
ApiProvider::Zai | ApiProvider::Stepfun => {}
ApiProvider::Zai => {
body["thinking"] = json!({
"type": "enabled",
"clear_thinking": false,
});
}
ApiProvider::Stepfun => {}
},
_ => {}
}
@@ -2882,6 +2895,34 @@ mod tests {
assert_eq!(body, json!({ "reasoning_split": true }));
}
#[test]
fn reasoning_effort_zai_uses_documented_thinking_shape() {
    // Each enabled tier checked here must produce the same thinking object:
    // thinking switched on, with `clear_thinking` set to false.
    for effort in ["high", "max", "ultracode"] {
        let mut request = json!({});
        apply_reasoning_effort(&mut request, Some(effort), ApiProvider::Zai);
        let expected = json!({ "thinking": { "type": "enabled", "clear_thinking": false } });
        assert_eq!(request, expected);
    }

    // Turning reasoning off sends only the disabled marker, with no
    // `clear_thinking` key.
    let mut request = json!({});
    apply_reasoning_effort(&mut request, Some("off"), ApiProvider::Zai);
    let expected = json!({ "thinking": { "type": "disabled" } });
    assert_eq!(request, expected);
}
#[test]
fn chat_parser_accepts_nvidia_nim_reasoning_field() -> Result<()> {
let response = parse_chat_message(&json!({
+11
View File
@@ -2056,6 +2056,7 @@ fn provider_accepts_reasoning_content(provider: ApiProvider) -> bool {
| ApiProvider::Arcee
| ApiProvider::Minimax
| ApiProvider::Sglang
| ApiProvider::Zai
| ApiProvider::Moonshot // #3016: Kimi thinking traces use reasoning_content
)
}
@@ -3952,6 +3953,7 @@ mod alias_thinking_detection_tests {
assert!(provider_accepts_reasoning_content(ApiProvider::XiaomiMimo));
assert!(provider_accepts_reasoning_content(ApiProvider::Arcee));
assert!(provider_accepts_reasoning_content(ApiProvider::Minimax));
assert!(provider_accepts_reasoning_content(ApiProvider::Zai));
// #3016: Moonshot's native endpoint streams Kimi thinking as
// reasoning_content.
assert!(provider_accepts_reasoning_content(ApiProvider::Moonshot));
@@ -3987,6 +3989,11 @@ mod alias_thinking_detection_tests {
"MiniMax-M3",
None,
));
assert!(should_replay_reasoning_content_for_provider(
ApiProvider::Zai,
"GLM-5.2",
None,
));
assert!(!should_replay_reasoning_content_for_provider(
ApiProvider::Moonshot,
"kimi-for-coding",
@@ -4096,6 +4103,10 @@ mod alias_thinking_detection_tests {
is_reasoning_model_for_stream(ApiProvider::Arcee, "trinity-large-thinking"),
"trinity-large-thinking should stream reasoning as thinking on direct Arcee"
);
assert!(
is_reasoning_model_for_stream(ApiProvider::Zai, "GLM-5.2"),
"GLM-5.2 should stream reasoning_content as thinking on direct Z.ai"
);
for model in [
"arcee-ai/trinity-large-thinking",
"minimax/minimax-m3",
+2 -1
View File
@@ -609,7 +609,7 @@ fn codex_responses_reasoning_effort(raw: &str) -> Option<&'static str> {
"minimal" => Some("low"),
"low" => Some("low"),
"high" => Some("high"),
"xhigh" | "max" | "maximum" => Some("xhigh"),
"xhigh" | "max" | "maximum" | "ultracode" => Some("xhigh"),
_ => Some("medium"),
}
}
@@ -707,6 +707,7 @@ mod tests {
assert_eq!(codex_responses_reasoning_effort("max"), Some("xhigh"));
assert_eq!(codex_responses_reasoning_effort("maximum"), Some("xhigh"));
assert_eq!(codex_responses_reasoning_effort("xhigh"), Some("xhigh"));
assert_eq!(codex_responses_reasoning_effort("ultracode"), Some("xhigh"));
assert_eq!(codex_responses_reasoning_effort("high"), Some("high"));
assert_eq!(codex_responses_reasoning_effort("medium"), Some("medium"));
assert_eq!(codex_responses_reasoning_effort("minimal"), Some("low"));
+7 -1
View File
@@ -315,7 +315,7 @@ fn parse_auto_route_reasoning_effort(effort: &str) -> Option<ReasoningEffort> {
"off" | "disabled" | "none" | "false" => Some(ReasoningEffort::Off),
"low" | "minimal" | "medium" | "mid" => Some(ReasoningEffort::High),
"high" => Some(ReasoningEffort::High),
"max" | "maximum" | "xhigh" => Some(ReasoningEffort::Max),
"max" | "maximum" | "xhigh" | "ultracode" => Some(ReasoningEffort::Max),
_ => None,
}
}
@@ -622,6 +622,12 @@ mod tests {
assert_eq!(rec.model, "deepseek-v4-pro");
assert_eq!(rec.reasoning_effort, Some(ReasoningEffort::Max));
let rec = parse_auto_route_recommendation(
r#"{"model":"deepseek-v4-pro","reasoning_effort":"ultracode"}"#,
)
.expect("ultracode should parse as max");
assert_eq!(rec.reasoning_effort, Some(ReasoningEffort::Max));
}
#[test]
+6 -2
View File
@@ -1116,10 +1116,10 @@ fn normalize_reasoning_effort_setting(value: &str) -> Result<Option<String>> {
"medium" | "mid" => "medium",
"high" => "high",
"auto" | "automatic" => "auto",
"max" | "maximum" | "xhigh" => "max",
"max" | "maximum" | "xhigh" | "ultracode" => "max",
_ => {
anyhow::bail!(
"Failed to update setting: invalid reasoning_effort '{value}'. Expected: auto, off, low, medium, high, max, xhigh, or default."
"Failed to update setting: invalid reasoning_effort '{value}'. Expected: auto, off, low, medium, high, max, xhigh, ultracode, or default."
);
}
};
@@ -1388,6 +1388,10 @@ mod tests {
.set("reasoning_effort", "xhigh")
.expect("normalize xhigh");
assert_eq!(settings.reasoning_effort.as_deref(), Some("max"));
settings
.set("reasoning_effort", "ultracode")
.expect("normalize ultracode");
assert_eq!(settings.reasoning_effort.as_deref(), Some("max"));
settings
.set("reasoning_effort", "default")
.expect("clear effort");
+5 -1
View File
@@ -207,7 +207,7 @@ impl ReasoningEffort {
"medium" | "mid" => Self::Medium,
"high" => Self::High,
"auto" | "automatic" => Self::Auto,
"max" | "maximum" | "xhigh" => Self::Max,
"max" | "maximum" | "xhigh" | "ultracode" => Self::Max,
_ => Self::default(),
}
}
@@ -5664,6 +5664,10 @@ mod tests {
ReasoningEffort::Max.api_value_for_provider(ApiProvider::Deepseek),
Some("max")
);
assert_eq!(
ReasoningEffort::from_setting("ultracode"),
ReasoningEffort::Max
);
}
#[test]
+1 -1
View File
@@ -958,7 +958,7 @@ If you are upgrading from older releases:
- `path_suffix` (string, optional provider-table key): override the chat-completions path for OpenAI-compatible gateways that do not serve `/v1/chat/completions`. For example, `[providers.openai] path_suffix = "/chat/completions"` sends chat requests to the unversioned base URL plus `/chat/completions`; `models` and `beta/*` requests keep their normal routing.
- `insecure_skip_tls_verify` (bool, optional provider-table key): disabled by default. When true on the active provider table, only the LLM provider HTTP client skips TLS certificate verification. Prefer `SSL_CERT_FILE` for corporate or private CA bundles; `codewhale doctor` reports this setting when enabled.
- `default_text_model` (string, optional): defaults to `deepseek-v4-pro` for DeepSeek and generic OpenAI-compatible endpoints, `deepseek-ai/deepseek-v4-pro` for NVIDIA NIM, `deepseek-ai/deepseek-v4-flash` for AtlasCloud, `deepseek-reasoner` for Wanjie Ark, `DeepSeek-V4-Pro` for Volcengine Ark, `deepseek/deepseek-v4-pro` for OpenRouter and Novita, `mimo-v2.5-pro` for Xiaomi MiMo, `accounts/fireworks/models/deepseek-v4-pro` for Fireworks, `deepseek-ai/DeepSeek-V4-Pro` for SiliconFlow, `trinity-large-thinking` for Arcee AI, `kimi-k2.7-code` for Moonshot, `MiniMax-M3` for MiniMax, `GLM-5.1` for Z.ai, `step-3.7-flash` for StepFun, `deepseek-ai/DeepSeek-V4-Pro` for SGLang/vLLM, and `deepseek-coder:1.3b` for Ollama. Hugging Face and Together AI both default to `deepseek-ai/DeepSeek-V4-Pro`. Current public DeepSeek IDs are `deepseek-v4-pro` and `deepseek-v4-flash`, both with 1M context windows, 384K max output, and thinking mode enabled by default. Legacy `deepseek-chat` and `deepseek-reasoner` remain compatibility aliases for `deepseek-v4-flash` until July 24, 2026, except SiliconFlow maps `deepseek-reasoner` and `deepseek-r1` to its Pro model while `deepseek-chat` and `deepseek-v3` map to Flash. Provider-specific mappings translate `deepseek-v4-pro` / `deepseek-v4-flash` to each provider's model ID where supported. 
OpenRouter also recognizes recent large IDs such as `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `minimax/minimax-2.7`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-flash`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-max-preview`, `qwen/qwen3.6-27b`, `qwen/qwen3.6-plus`, `qwen/qwen3.7-max`, `google/gemma-4-31b-it`, `moonshotai/kimi-k2.7-code`, `moonshotai/kimi-k2.6`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free`, and `nvidia/nemotron-3-ultra-550b-a55b`; direct Arcee uses bare IDs such as `trinity-large-thinking` and `trinity-large-preview`; direct Moonshot recognizes `kimi-k2.7-code`, `kimi-k2.6`, and Kimi Code's stable `kimi-for-coding`; direct MiniMax recognizes `MiniMax-M3` and the documented M2.x chat model IDs; direct Xiaomi MiMo recognizes chat IDs `mimo-v2.5-pro` and `mimo-v2.5`, while TTS IDs are selected through `codewhale speech` / `tts`. Generic `openai`, `atlascloud`, `wanjie-ark`, `xiaomi-mimo`, `arcee`, `moonshot`, `minimax`, `zai`, `stepfun`, and Ollama model IDs are passed through unchanged after known aliases are normalized. OpenRouter and SiliconFlow provider configs with a custom `base_url` also preserve explicit model values, which lets OpenAI-compatible gateways accept bare model IDs. Use `/models` or `codewhale models` to discover live IDs from your configured endpoint. `CODEWHALE_MODEL` overrides this for a single process; `DEEPSEEK_MODEL` is the legacy alias.
- `reasoning_effort` (string, optional): `off`, `low`, `medium`, `high`, `max`, or `xhigh`; defaults to the configured UI tier. DeepSeek Platform receives top-level `thinking` / `reasoning_effort` fields. OpenAI Codex normalizes stale `off` to `low` and sends `max` as Responses `xhigh`. NVIDIA NIM receives equivalent settings through `chat_template_kwargs`.
- `reasoning_effort` (string, optional): `auto`, `off`, `low`, `medium`, `high`, `max`, `xhigh`, or `ultracode` (an alias for `max`); defaults to the configured UI tier. DeepSeek Platform receives top-level `thinking` / `reasoning_effort` fields. OpenAI Codex normalizes stale `off` to `low` and sends `max` / `ultracode` as Responses `xhigh`. Z.ai receives the documented `thinking` object instead of a `reasoning_effort` scalar: `off` sends `type: "disabled"`, while `low` through `ultracode` all send `type: "enabled"` with `clear_thinking: false`, so enabled thinking is treated as the GLM coding high/max lane. NVIDIA NIM receives equivalent settings through `chat_template_kwargs`.
- `verbosity` (string, optional): `normal` or `concise`. `normal` keeps the
default conversational prompt. `concise` appends a prompt discipline block
for direct, low-chatter output; CLI noninteractive commands (`exec`, `eval`,