From dcc2c448ebe3fd450f672e1f341fa8fc9e6d48e5 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Tue, 12 May 2026 01:25:16 -0500 Subject: [PATCH] fix(client): vLLM uses chat_template_kwargs to toggle reasoning, not the Anthropic field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `apply_reasoning_effort`'s vLLM branch was injecting `thinking: {type: "disabled"}` at the top of the request body to turn off model reasoning. But vLLM speaks OpenAI's chat-completions protocol, not Anthropic-native extension fields, and silently ignored that directive — the model emitted a full hidden reasoning trace into the non-OpenAI-standard `reasoning` field (which this client does not surface), so users saw a ~13-second perceived freeze before the first content token arrived. The vLLM branch now emits the OpenAI extension `chat_template_kwargs.enable_thinking` — the canonical way to toggle Qwen3's `<think>` mode, DeepSeek-R1's reasoning trace, and any other reasoning-capable model served via vLLM. End-to-end measurement against vLLM hosting Qwen3.6-35B-A3B-FP8: - TTFT: 13039ms → 274ms - Total LLM call: 13s → 5.7s - Output rate: 3 ch/s → 46 ch/s The `high` / `max` reasoning levels likewise route through `chat_template_kwargs` so the toggle is consistent across effort levels. No change for any non-vLLM provider (NVIDIA NIM continues to use the NVIDIA-specific `chat_template_kwargs.thinking` key; Anthropic-native providers keep the Anthropic-native field). Resolved a 3-way merge conflict against the v0.8.32 AtlasCloud harvest (PR #1436) so AtlasCloud stays in the no-op match arm alongside OpenAI / Ollama while the new vLLM arm gets its own branch. Note for future Sglang / Fireworks / Novita work: those servers likely have the same bug but each has its own chat_template_kwargs schema; this PR is intentionally minimal to the verified-fix scope. 
Harvested from PR #1480 by @h3c-hexin Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 18 ++++++++++++++++++ crates/tui/src/client.rs | 35 +++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6e6e999..e5f4e348 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,24 @@ real world uses." ### Fixed +- **vLLM provider: `reasoning_effort = "off"` now actually + disables thinking on Qwen3 / DeepSeek-R1 servers, cutting + TTFT from ~13s to ~270ms** (harvested from PR #1480 by + **@h3c-hexin**). The vLLM branch of `apply_reasoning_effort` + was injecting `thinking: {type: "disabled"}` at the top of + the request body — but vLLM speaks OpenAI's chat-completions + protocol, not Anthropic-native fields, and silently ignored + the directive. The model then emitted a full hidden reasoning + trace into the non-standard `reasoning` field (which this + client doesn't surface), so users saw a multi-second freeze + before any content token arrived. The vLLM branch now emits + the OpenAI extension `chat_template_kwargs.enable_thinking` + (which vLLM forwards into the model's chat template — the + canonical way to toggle Qwen3's `<think>...</think>` mode). + Measurement against vLLM + Qwen3.6-35B-A3B-FP8: TTFT + 13039ms → 274ms, total LLM call 13s → 5.7s. The `high` / + `max` effort levels likewise switch to the OpenAI extension. + No change for non-vLLM providers. - **`/sessions` picker no longer shows `` as the session title** (harvested from PR #1498 by **@wdw8276**). 
`session_manager::create_saved_session_with_id_and_mode` diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index b7ec2145..3b5875ff 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -888,10 +888,23 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["thinking"] = json!({ "type": "disabled" }); } + // vLLM is an OpenAI-protocol server, not an Anthropic-protocol one. + // For Qwen3 / DeepSeek-R1 / other reasoning models hosted via vLLM, + // the canonical OpenAI extension to disable thinking is + // `chat_template_kwargs.enable_thinking`. The old + // `thinking: {type: disabled}` field is Anthropic-native and + // silently ignored by vLLM — the model still emits a full + // reasoning trace into the `reasoning` field (which this client + // doesn't surface), causing 10+ seconds of perceived "freeze" + // before the first content token (PR #1480 by @h3c-hexin). 
+ ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": false, + }); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({ @@ -905,11 +918,16 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["reasoning_effort"] = json!("high"); body["thinking"] = json!({ "type": "enabled" }); } + ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": true, + }); + body["reasoning_effort"] = json!("high"); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({ @@ -924,11 +942,16 @@ pub(super) fn apply_reasoning_effort( | ApiProvider::Openrouter | ApiProvider::Novita | ApiProvider::Fireworks - | ApiProvider::Sglang - | ApiProvider::Vllm => { + | ApiProvider::Sglang => { body["reasoning_effort"] = json!("max"); body["thinking"] = json!({ "type": "enabled" }); } + ApiProvider::Vllm => { + body["chat_template_kwargs"] = json!({ + "enable_thinking": true, + }); + body["reasoning_effort"] = json!("max"); + } ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {} ApiProvider::NvidiaNim => { body["chat_template_kwargs"] = json!({