From dcc2c448ebe3fd450f672e1f341fa8fc9e6d48e5 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Tue, 12 May 2026 01:25:16 -0500
Subject: [PATCH] fix(client): vLLM uses chat_template_kwargs to toggle
 reasoning, not the Anthropic field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`apply_reasoning_effort`'s vLLM branch was injecting
`thinking: {type: "disabled"}` at the top of the request body to
turn off model reasoning. But vLLM speaks OpenAI's
chat-completions protocol, not Anthropic-native extension fields,
and silently ignored that directive — the model emitted a full
hidden reasoning trace into the non-OpenAI-standard `reasoning`
field (which this client does not surface), so users saw a
~13-second perceived freeze before the first content token
arrived.

The vLLM branch now emits the OpenAI extension
`chat_template_kwargs.enable_thinking` — the canonical way to
toggle Qwen3's `<think>` mode, DeepSeek-R1's reasoning trace, and
any other reasoning-capable model served via vLLM. End-to-end
measurement against vLLM hosting Qwen3.6-35B-A3B-FP8:

  - TTFT:           13039ms → 274ms
  - Total LLM call: 13s     → 5.7s
  - Output rate:    3 ch/s  → 46 ch/s

The `high` / `max` reasoning levels likewise route through
`chat_template_kwargs` so the toggle is consistent across effort
levels. No change for any non-vLLM provider (NVIDIA NIM continues
to use the NVIDIA-specific `chat_template_kwargs.thinking` key;
Anthropic-native providers keep the Anthropic-native field).

Resolved a 3-way merge conflict against the v0.8.32 AtlasCloud
harvest (PR #1436) so AtlasCloud stays in the no-op match arm
alongside OpenAI / Ollama while the new vLLM arm gets its own
branch. Note for future Sglang / Fireworks / Novita work: those
servers likely have the same bug but each has its own
chat_template_kwargs schema; this PR is intentionally minimal
to the verified-fix scope.

Harvested from PR #1480 by @h3c-hexin

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md             | 18 ++++++++++++++++++
 crates/tui/src/client.rs | 35 +++++++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6e6e999..e5f4e348 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,24 @@ real world uses."
 
 ### Fixed
 
+- **vLLM provider: `reasoning_effort = "off"` now actually
+  disables thinking on Qwen3 / DeepSeek-R1 servers, cutting
+  TTFT from ~13s to ~270ms** (harvested from PR #1480 by
+  **@h3c-hexin**). The vLLM branch of `apply_reasoning_effort`
+  was injecting `thinking: {type: "disabled"}` at the top of
+  the request body — but vLLM speaks OpenAI's chat-completions
+  protocol, not Anthropic-native fields, and silently ignored
+  the directive. The model then emitted a full hidden reasoning
+  trace into the non-standard `reasoning` field (which this
+  client doesn't surface), so users saw a multi-second freeze
+  before any content token arrived. The vLLM branch now emits
+  the OpenAI extension `chat_template_kwargs.enable_thinking`
+  (which vLLM forwards into the model's chat template — the
+  canonical way to toggle Qwen3's `<think>...</think>` mode).
+  Measurement against vLLM + Qwen3.6-35B-A3B-FP8: TTFT
+  13039ms → 274ms, total LLM call 13s → 5.7s. The `high` /
+  `max` effort levels likewise switch to the OpenAI extension.
+  No change for non-vLLM providers.
 - **`/sessions` picker no longer shows `<turn_meta>` as the
   session title** (harvested from PR #1498 by **@wdw8276**).
   `session_manager::create_saved_session_with_id_and_mode`
diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index b7ec2145..3b5875ff 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -888,10 +888,23 @@ pub(super) fn apply_reasoning_effort(
             | ApiProvider::Openrouter
             | ApiProvider::Novita
             | ApiProvider::Fireworks
-            | ApiProvider::Sglang
-            | ApiProvider::Vllm => {
+            | ApiProvider::Sglang => {
                 body["thinking"] = json!({ "type": "disabled" });
             }
+            // vLLM is an OpenAI-protocol server, not an Anthropic-protocol one.
+            // For Qwen3 / DeepSeek-R1 / other reasoning models hosted via vLLM,
+            // the canonical OpenAI extension to disable thinking is
+            // `chat_template_kwargs.enable_thinking`. The old
+            // `thinking: {type: disabled}` field is Anthropic-native and
+            // silently ignored by vLLM — the model still emits a full
+            // reasoning trace into the `reasoning` field (which this client
+            // doesn't surface), causing 10+ seconds of perceived "freeze"
+            // before the first content token (PR #1480 by @h3c-hexin).
+            ApiProvider::Vllm => {
+                body["chat_template_kwargs"] = json!({
+                    "enable_thinking": false,
+                });
+            }
             ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
             ApiProvider::NvidiaNim => {
                 body["chat_template_kwargs"] = json!({
@@ -905,11 +918,16 @@ pub(super) fn apply_reasoning_effort(
             | ApiProvider::Openrouter
             | ApiProvider::Novita
             | ApiProvider::Fireworks
-            | ApiProvider::Sglang
-            | ApiProvider::Vllm => {
+            | ApiProvider::Sglang => {
                 body["reasoning_effort"] = json!("high");
                 body["thinking"] = json!({ "type": "enabled" });
             }
+            ApiProvider::Vllm => {
+                body["chat_template_kwargs"] = json!({
+                    "enable_thinking": true,
+                });
+                body["reasoning_effort"] = json!("high");
+            }
             ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
             ApiProvider::NvidiaNim => {
                 body["chat_template_kwargs"] = json!({
@@ -924,11 +942,16 @@ pub(super) fn apply_reasoning_effort(
             | ApiProvider::Openrouter
             | ApiProvider::Novita
             | ApiProvider::Fireworks
-            | ApiProvider::Sglang
-            | ApiProvider::Vllm => {
+            | ApiProvider::Sglang => {
                 body["reasoning_effort"] = json!("max");
                 body["thinking"] = json!({ "type": "enabled" });
             }
+            ApiProvider::Vllm => {
+                body["chat_template_kwargs"] = json!({
+                    "enable_thinking": true,
+                });
+                body["reasoning_effort"] = json!("max");
+            }
             ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
             ApiProvider::NvidiaNim => {
                 body["chat_template_kwargs"] = json!({