fix(provider): preserve Z.ai GLM thinking traces
Z.ai's current GLM-5.2 coding-agent docs confirm that the actual thinking efforts are high and max, with ultracode mapped to max. The Z.ai Chat Completion API documents thinking.type, thinking.clear_thinking, and reasoning_content rather than a raw reasoning_effort scalar, so direct Z.ai requests now send the documented thinking object and preserve reasoning_content for coding-agent continuity. This commit also accepts ultracode as a max-effort alias across settings, routing, and Responses mapping. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1258,7 +1258,8 @@ pub(super) fn apply_reasoning_effort(
|
||||
| ApiProvider::Sglang
|
||||
| ApiProvider::Volcengine
|
||||
| ApiProvider::Together
|
||||
| ApiProvider::Atlascloud => {
|
||||
| ApiProvider::Atlascloud
|
||||
| ApiProvider::Zai => {
|
||||
body["thinking"] = json!({ "type": "disabled" });
|
||||
}
|
||||
ApiProvider::OpenaiCodex => {
|
||||
@@ -1304,7 +1305,7 @@ pub(super) fn apply_reasoning_effort(
|
||||
ApiProvider::Minimax => {
|
||||
body["thinking"] = json!({ "type": "disabled" });
|
||||
}
|
||||
ApiProvider::Zai | ApiProvider::Stepfun => {}
|
||||
ApiProvider::Stepfun => {}
|
||||
},
|
||||
"low" | "minimal" | "medium" | "mid" | "high" | "" => match provider {
|
||||
// DeepSeek compatibility: low/medium both map to high
|
||||
@@ -1381,9 +1382,15 @@ pub(super) fn apply_reasoning_effort(
|
||||
ApiProvider::Minimax => {
|
||||
body["thinking"] = json!({ "type": "adaptive" });
|
||||
}
|
||||
ApiProvider::Zai | ApiProvider::Stepfun => {}
|
||||
ApiProvider::Zai => {
|
||||
body["thinking"] = json!({
|
||||
"type": "enabled",
|
||||
"clear_thinking": false,
|
||||
});
|
||||
}
|
||||
ApiProvider::Stepfun => {}
|
||||
},
|
||||
"xhigh" | "max" | "highest" => match provider {
|
||||
"xhigh" | "max" | "highest" | "ultracode" => match provider {
|
||||
ApiProvider::Deepseek
|
||||
| ApiProvider::DeepseekCN
|
||||
| ApiProvider::Siliconflow
|
||||
@@ -1438,7 +1445,13 @@ pub(super) fn apply_reasoning_effort(
|
||||
ApiProvider::Minimax => {
|
||||
body["thinking"] = json!({ "type": "adaptive" });
|
||||
}
|
||||
ApiProvider::Zai | ApiProvider::Stepfun => {}
|
||||
ApiProvider::Zai => {
|
||||
body["thinking"] = json!({
|
||||
"type": "enabled",
|
||||
"clear_thinking": false,
|
||||
});
|
||||
}
|
||||
ApiProvider::Stepfun => {}
|
||||
},
|
||||
_ => {}
|
||||
}
|
||||
@@ -2882,6 +2895,34 @@ mod tests {
|
||||
assert_eq!(body, json!({ "reasoning_split": true }));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reasoning_effort_zai_uses_documented_thinking_shape() {
|
||||
let mut body = json!({});
|
||||
apply_reasoning_effort(&mut body, Some("high"), ApiProvider::Zai);
|
||||
assert_eq!(
|
||||
body,
|
||||
json!({ "thinking": { "type": "enabled", "clear_thinking": false } })
|
||||
);
|
||||
|
||||
let mut body = json!({});
|
||||
apply_reasoning_effort(&mut body, Some("max"), ApiProvider::Zai);
|
||||
assert_eq!(
|
||||
body,
|
||||
json!({ "thinking": { "type": "enabled", "clear_thinking": false } })
|
||||
);
|
||||
|
||||
let mut body = json!({});
|
||||
apply_reasoning_effort(&mut body, Some("ultracode"), ApiProvider::Zai);
|
||||
assert_eq!(
|
||||
body,
|
||||
json!({ "thinking": { "type": "enabled", "clear_thinking": false } })
|
||||
);
|
||||
|
||||
let mut body = json!({});
|
||||
apply_reasoning_effort(&mut body, Some("off"), ApiProvider::Zai);
|
||||
assert_eq!(body, json!({ "thinking": { "type": "disabled" } }));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_parser_accepts_nvidia_nim_reasoning_field() -> Result<()> {
|
||||
let response = parse_chat_message(&json!({
|
||||
|
||||
@@ -2056,6 +2056,7 @@ fn provider_accepts_reasoning_content(provider: ApiProvider) -> bool {
|
||||
| ApiProvider::Arcee
|
||||
| ApiProvider::Minimax
|
||||
| ApiProvider::Sglang
|
||||
| ApiProvider::Zai
|
||||
| ApiProvider::Moonshot // #3016: Kimi thinking traces use reasoning_content
|
||||
)
|
||||
}
|
||||
@@ -3952,6 +3953,7 @@ mod alias_thinking_detection_tests {
|
||||
assert!(provider_accepts_reasoning_content(ApiProvider::XiaomiMimo));
|
||||
assert!(provider_accepts_reasoning_content(ApiProvider::Arcee));
|
||||
assert!(provider_accepts_reasoning_content(ApiProvider::Minimax));
|
||||
assert!(provider_accepts_reasoning_content(ApiProvider::Zai));
|
||||
// #3016: Moonshot's native endpoint streams Kimi thinking as
|
||||
// reasoning_content.
|
||||
assert!(provider_accepts_reasoning_content(ApiProvider::Moonshot));
|
||||
@@ -3987,6 +3989,11 @@ mod alias_thinking_detection_tests {
|
||||
"MiniMax-M3",
|
||||
None,
|
||||
));
|
||||
assert!(should_replay_reasoning_content_for_provider(
|
||||
ApiProvider::Zai,
|
||||
"GLM-5.2",
|
||||
None,
|
||||
));
|
||||
assert!(!should_replay_reasoning_content_for_provider(
|
||||
ApiProvider::Moonshot,
|
||||
"kimi-for-coding",
|
||||
@@ -4096,6 +4103,10 @@ mod alias_thinking_detection_tests {
|
||||
is_reasoning_model_for_stream(ApiProvider::Arcee, "trinity-large-thinking"),
|
||||
"trinity-large-thinking should stream reasoning as thinking on direct Arcee"
|
||||
);
|
||||
assert!(
|
||||
is_reasoning_model_for_stream(ApiProvider::Zai, "GLM-5.2"),
|
||||
"GLM-5.2 should stream reasoning_content as thinking on direct Z.ai"
|
||||
);
|
||||
for model in [
|
||||
"arcee-ai/trinity-large-thinking",
|
||||
"minimax/minimax-m3",
|
||||
|
||||
@@ -609,7 +609,7 @@ fn codex_responses_reasoning_effort(raw: &str) -> Option<&'static str> {
|
||||
"minimal" => Some("low"),
|
||||
"low" => Some("low"),
|
||||
"high" => Some("high"),
|
||||
"xhigh" | "max" | "maximum" => Some("xhigh"),
|
||||
"xhigh" | "max" | "maximum" | "ultracode" => Some("xhigh"),
|
||||
_ => Some("medium"),
|
||||
}
|
||||
}
|
||||
@@ -707,6 +707,7 @@ mod tests {
|
||||
assert_eq!(codex_responses_reasoning_effort("max"), Some("xhigh"));
|
||||
assert_eq!(codex_responses_reasoning_effort("maximum"), Some("xhigh"));
|
||||
assert_eq!(codex_responses_reasoning_effort("xhigh"), Some("xhigh"));
|
||||
assert_eq!(codex_responses_reasoning_effort("ultracode"), Some("xhigh"));
|
||||
assert_eq!(codex_responses_reasoning_effort("high"), Some("high"));
|
||||
assert_eq!(codex_responses_reasoning_effort("medium"), Some("medium"));
|
||||
assert_eq!(codex_responses_reasoning_effort("minimal"), Some("low"));
|
||||
|
||||
@@ -315,7 +315,7 @@ fn parse_auto_route_reasoning_effort(effort: &str) -> Option<ReasoningEffort> {
|
||||
"off" | "disabled" | "none" | "false" => Some(ReasoningEffort::Off),
|
||||
"low" | "minimal" | "medium" | "mid" => Some(ReasoningEffort::High),
|
||||
"high" => Some(ReasoningEffort::High),
|
||||
"max" | "maximum" | "xhigh" => Some(ReasoningEffort::Max),
|
||||
"max" | "maximum" | "xhigh" | "ultracode" => Some(ReasoningEffort::Max),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -622,6 +622,12 @@ mod tests {
|
||||
|
||||
assert_eq!(rec.model, "deepseek-v4-pro");
|
||||
assert_eq!(rec.reasoning_effort, Some(ReasoningEffort::Max));
|
||||
|
||||
let rec = parse_auto_route_recommendation(
|
||||
r#"{"model":"deepseek-v4-pro","reasoning_effort":"ultracode"}"#,
|
||||
)
|
||||
.expect("ultracode should parse as max");
|
||||
assert_eq!(rec.reasoning_effort, Some(ReasoningEffort::Max));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1116,10 +1116,10 @@ fn normalize_reasoning_effort_setting(value: &str) -> Result<Option<String>> {
|
||||
"medium" | "mid" => "medium",
|
||||
"high" => "high",
|
||||
"auto" | "automatic" => "auto",
|
||||
"max" | "maximum" | "xhigh" => "max",
|
||||
"max" | "maximum" | "xhigh" | "ultracode" => "max",
|
||||
_ => {
|
||||
anyhow::bail!(
|
||||
"Failed to update setting: invalid reasoning_effort '{value}'. Expected: auto, off, low, medium, high, max, xhigh, or default."
|
||||
"Failed to update setting: invalid reasoning_effort '{value}'. Expected: auto, off, low, medium, high, max, xhigh, ultracode, or default."
|
||||
);
|
||||
}
|
||||
};
|
||||
@@ -1388,6 +1388,10 @@ mod tests {
|
||||
.set("reasoning_effort", "xhigh")
|
||||
.expect("normalize xhigh");
|
||||
assert_eq!(settings.reasoning_effort.as_deref(), Some("max"));
|
||||
settings
|
||||
.set("reasoning_effort", "ultracode")
|
||||
.expect("normalize ultracode");
|
||||
assert_eq!(settings.reasoning_effort.as_deref(), Some("max"));
|
||||
settings
|
||||
.set("reasoning_effort", "default")
|
||||
.expect("clear effort");
|
||||
|
||||
@@ -207,7 +207,7 @@ impl ReasoningEffort {
|
||||
"medium" | "mid" => Self::Medium,
|
||||
"high" => Self::High,
|
||||
"auto" | "automatic" => Self::Auto,
|
||||
"max" | "maximum" | "xhigh" => Self::Max,
|
||||
"max" | "maximum" | "xhigh" | "ultracode" => Self::Max,
|
||||
_ => Self::default(),
|
||||
}
|
||||
}
|
||||
@@ -5664,6 +5664,10 @@ mod tests {
|
||||
ReasoningEffort::Max.api_value_for_provider(ApiProvider::Deepseek),
|
||||
Some("max")
|
||||
);
|
||||
assert_eq!(
|
||||
ReasoningEffort::from_setting("ultracode"),
|
||||
ReasoningEffort::Max
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -958,7 +958,7 @@ If you are upgrading from older releases:
|
||||
- `path_suffix` (string, optional provider-table key): override the chat-completions path for OpenAI-compatible gateways that do not serve `/v1/chat/completions`. For example, `[providers.openai] path_suffix = "/chat/completions"` sends chat requests to the unversioned base URL plus `/chat/completions`; `models` and `beta/*` requests keep their normal routing.
|
||||
- `insecure_skip_tls_verify` (bool, optional provider-table key): disabled by default. When true on the active provider table, only the LLM provider HTTP client skips TLS certificate verification. Prefer `SSL_CERT_FILE` for corporate or private CA bundles; `codewhale doctor` reports this setting when enabled.
|
||||
- `default_text_model` (string, optional): defaults to `deepseek-v4-pro` for DeepSeek and generic OpenAI-compatible endpoints, `deepseek-ai/deepseek-v4-pro` for NVIDIA NIM, `deepseek-ai/deepseek-v4-flash` for AtlasCloud, `deepseek-reasoner` for Wanjie Ark, `DeepSeek-V4-Pro` for Volcengine Ark, `deepseek/deepseek-v4-pro` for OpenRouter and Novita, `mimo-v2.5-pro` for Xiaomi MiMo, `accounts/fireworks/models/deepseek-v4-pro` for Fireworks, `deepseek-ai/DeepSeek-V4-Pro` for SiliconFlow, `trinity-large-thinking` for Arcee AI, `kimi-k2.7-code` for Moonshot, `MiniMax-M3` for MiniMax, `GLM-5.1` for Z.ai, `step-3.7-flash` for StepFun, `deepseek-ai/DeepSeek-V4-Pro` for SGLang/vLLM, and `deepseek-coder:1.3b` for Ollama. Hugging Face and Together AI both default to `deepseek-ai/DeepSeek-V4-Pro`. Current public DeepSeek IDs are `deepseek-v4-pro` and `deepseek-v4-flash`, both with 1M context windows, 384K max output, and thinking mode enabled by default. Legacy `deepseek-chat` and `deepseek-reasoner` remain compatibility aliases for `deepseek-v4-flash` until July 24, 2026, except SiliconFlow maps `deepseek-reasoner` and `deepseek-r1` to its Pro model while `deepseek-chat` and `deepseek-v3` map to Flash. Provider-specific mappings translate `deepseek-v4-pro` / `deepseek-v4-flash` to each provider's model ID where supported. 
OpenRouter also recognizes recent large IDs such as `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `minimax/minimax-2.7`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-flash`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-max-preview`, `qwen/qwen3.6-27b`, `qwen/qwen3.6-plus`, `qwen/qwen3.7-max`, `google/gemma-4-31b-it`, `moonshotai/kimi-k2.7-code`, `moonshotai/kimi-k2.6`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free`, and `nvidia/nemotron-3-ultra-550b-a55b`; direct Arcee uses bare IDs such as `trinity-large-thinking` and `trinity-large-preview`; direct Moonshot recognizes `kimi-k2.7-code`, `kimi-k2.6`, and Kimi Code's stable `kimi-for-coding`; direct MiniMax recognizes `MiniMax-M3` and the documented M2.x chat model IDs; direct Xiaomi MiMo recognizes chat IDs `mimo-v2.5-pro` and `mimo-v2.5`, while TTS IDs are selected through `codewhale speech` / `tts`. Generic `openai`, `atlascloud`, `wanjie-ark`, `xiaomi-mimo`, `arcee`, `moonshot`, `minimax`, `zai`, `stepfun`, and Ollama model IDs are passed through unchanged after known aliases are normalized. OpenRouter and SiliconFlow provider configs with a custom `base_url` also preserve explicit model values, which lets OpenAI-compatible gateways accept bare model IDs. Use `/models` or `codewhale models` to discover live IDs from your configured endpoint. `CODEWHALE_MODEL` overrides this for a single process; `DEEPSEEK_MODEL` is the legacy alias.
|
||||
- `reasoning_effort` (string, optional): `off`, `low`, `medium`, `high`, `max`, or `xhigh`; defaults to the configured UI tier. DeepSeek Platform receives top-level `thinking` / `reasoning_effort` fields. OpenAI Codex normalizes stale `off` to `low` and sends `max` as Responses `xhigh`. NVIDIA NIM receives equivalent settings through `chat_template_kwargs`.
|
||||
- `reasoning_effort` (string, optional): `off`, `low`, `medium`, `high`, `max`, `xhigh`, or `ultracode`; defaults to the configured UI tier. DeepSeek Platform receives top-level `thinking` / `reasoning_effort` fields. OpenAI Codex normalizes stale `off` to `low` and sends `max` / `ultracode` as Responses `xhigh`. Z.ai receives its documented `thinking` object instead of a `reasoning_effort` scalar: `off` sends `thinking.type = "disabled"`, while `low` through `max` / `ultracode` all send `thinking.type = "enabled"` with `clear_thinking = false`, so every enabled tier is treated as the GLM coding high/max lane. NVIDIA NIM receives equivalent settings through `chat_template_kwargs`.
|
||||
- `verbosity` (string, optional): `normal` or `concise`. `normal` keeps the
|
||||
default conversational prompt. `concise` appends a prompt discipline block
|
||||
for direct, low-chatter output; CLI noninteractive commands (`exec`, `eval`,
|
||||
|
||||
Reference in New Issue
Block a user