diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6e6e999..e5f4e348 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,24 @@ real world uses."
### Fixed
+- **vLLM provider: `reasoning_effort = "off"` now actually
+ disables thinking on Qwen3 / DeepSeek-R1 servers, cutting
+ TTFT from ~13s to ~270ms** (harvested from PR #1480 by
+ **@h3c-hexin**). The vLLM branch of `apply_reasoning_effort`
+ was injecting `thinking: {type: "disabled"}` at the top level
+ of the request body — but vLLM speaks OpenAI's
+ chat-completions protocol and does not recognize that
+ Anthropic-style field, so it silently ignored the directive.
+ The model then emitted a full hidden reasoning
+ trace into the non-standard `reasoning` field (which this
+ client doesn't surface), so users saw a multi-second freeze
+ before any content token arrived. The vLLM branch now emits
+ the OpenAI extension `chat_template_kwargs.enable_thinking`
+ (which vLLM forwards into the model's chat template — the
+ canonical way to toggle Qwen3's `<think>...</think>` mode).
+ Measurement against vLLM + Qwen3.6-35B-A3B-FP8: TTFT
+ 13039ms → 274ms, total LLM call 13s → 5.7s. The `high` /
+ `max` effort levels likewise switch to the OpenAI extension.
+ No change for non-vLLM providers.
- **`/sessions` picker no longer shows `` as the
session title** (harvested from PR #1498 by **@wdw8276**).
`session_manager::create_saved_session_with_id_and_mode`
diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index b7ec2145..3b5875ff 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -888,10 +888,23 @@ pub(super) fn apply_reasoning_effort(
| ApiProvider::Openrouter
| ApiProvider::Novita
| ApiProvider::Fireworks
- | ApiProvider::Sglang
- | ApiProvider::Vllm => {
+ | ApiProvider::Sglang => {
body["thinking"] = json!({ "type": "disabled" });
}
+ // vLLM is an OpenAI-protocol server, not an Anthropic-protocol one.
+ // For Qwen3 / DeepSeek-R1 / other reasoning models hosted via vLLM,
+ // the canonical OpenAI extension to disable thinking is
+ // `chat_template_kwargs.enable_thinking`. The old
+ // `thinking: {type: disabled}` field is Anthropic-native and
+ // silently ignored by vLLM — the model still emits a full
+ // reasoning trace into the `reasoning` field (which this client
+ // doesn't surface), causing 10+ seconds of perceived "freeze"
+ // before the first content token (PR #1480 by @h3c-hexin).
+ ApiProvider::Vllm => {
+ body["chat_template_kwargs"] = json!({
+ "enable_thinking": false,
+ });
+ }
ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
ApiProvider::NvidiaNim => {
body["chat_template_kwargs"] = json!({
@@ -905,11 +918,16 @@ pub(super) fn apply_reasoning_effort(
| ApiProvider::Openrouter
| ApiProvider::Novita
| ApiProvider::Fireworks
- | ApiProvider::Sglang
- | ApiProvider::Vllm => {
+ | ApiProvider::Sglang => {
body["reasoning_effort"] = json!("high");
body["thinking"] = json!({ "type": "enabled" });
}
+ ApiProvider::Vllm => {
+ body["chat_template_kwargs"] = json!({
+ "enable_thinking": true,
+ });
+ body["reasoning_effort"] = json!("high");
+ }
ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
ApiProvider::NvidiaNim => {
body["chat_template_kwargs"] = json!({
@@ -924,11 +942,16 @@ pub(super) fn apply_reasoning_effort(
| ApiProvider::Openrouter
| ApiProvider::Novita
| ApiProvider::Fireworks
- | ApiProvider::Sglang
- | ApiProvider::Vllm => {
+ | ApiProvider::Sglang => {
body["reasoning_effort"] = json!("max");
body["thinking"] = json!({ "type": "enabled" });
}
+ ApiProvider::Vllm => {
+ body["chat_template_kwargs"] = json!({
+ "enable_thinking": true,
+ });
+ body["reasoning_effort"] = json!("max");
+ }
ApiProvider::Openai | ApiProvider::Atlascloud | ApiProvider::Ollama => {}
ApiProvider::NvidiaNim => {
body["chat_template_kwargs"] = json!({