diff --git a/CHANGELOG.md b/CHANGELOG.md index d6e6e999..e5f4e348 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,24 @@ real world uses." ### Fixed +- **vLLM provider: `reasoning_effort = "off"` now actually + disables thinking on Qwen3 / DeepSeek-R1 servers, cutting + TTFT from ~13s to ~270ms** (harvested from PR #1480 by + **@h3c-hexin**). The vLLM branch of `apply_reasoning_effort` + was injecting `thinking: {type: "disabled"}` at the top of + the request body — but vLLM speaks OpenAI's chat-completions + protocol, not Anthropic-native fields, and silently ignored + the directive. The model then emitted a full hidden reasoning + trace into the non-standard `reasoning` field (which this + client doesn't surface), so users saw a multi-second freeze + before any content token arrived. The vLLM branch now emits + the OpenAI extension `chat_template_kwargs.enable_thinking` + (which vLLM forwards into the model's chat template — the + canonical way to toggle Qwen3's `<think>...</think>` mode). + Measurement against vLLM + Qwen3.6-35B-A3B-FP8: TTFT + 13039ms → 274ms, total LLM call 13s → 5.7s. The `high` / + `max` effort levels likewise switch to the OpenAI extension. + No change for non-vLLM providers. - **`/sessions` picker no longer shows `` as the session title** (harvested from PR #1498 by **@wdw8276**). `session_manager::create_saved_session_with_id_and_mode` diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index b7ec2145..3b5875ff 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -888,10 +888,23 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["thinking"] = json!({ "type": "disabled" }); } + // vLLM is an OpenAI-protocol server, not an Anthropic-protocol one.
+ // For Qwen3 / DeepSeek-R1 / other reasoning models hosted via vLLM, + // the canonical OpenAI extension to disable thinking is + // `chat_template_kwargs.enable_thinking`. The old + // `thinking: {type: disabled}` field is Anthropic-native and + // silently ignored by vLLM — the model still emits a full + // reasoning trace into the `reasoning` field (which this client + // doesn't surface), causing 10+ seconds of perceived "freeze" + // before the first content token (PR #1480 by @h3c-hexin). + ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": false, + }); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({ @@ -905,11 +918,16 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["reasoning_effort"] = json!("high"); body["thinking"] = json!({ "type": "enabled" }); } + ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": true, + }); + body["reasoning_effort"] = json!("high"); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({ @@ -924,11 +942,16 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["reasoning_effort"] = json!("max"); body["thinking"] = json!({ "type": "enabled" }); } + ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": true, + }); + body["reasoning_effort"] = json!("max"); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({