diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83b3f5d1..6eb27648 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **`parse_invocation_count` flake.** Two `markdown_render` tests both read the global PARSE_INVOCATIONS atomic and raced when other tests called `parse()` in parallel. Switched the counter to `thread_local!<Cell<u64>>`, so each test thread sees only its own invocations. Tested 8 sequential full-suite runs: 8/8 green (was ~40% green).
 
 ### Changed
+- **System prompts redesigned with decomposition-first philosophy.** All four prompt tiers (base, agent, plan, yolo) now teach the model to decompose tasks before acting — `todo_write` first for granular task tracking, `update_plan` for high-level strategy, and sub-agents for parallelizable work. Inspired by the "mismanaged geniuses hypothesis" (Zhang et al., 2026): frontier LMs are already capable enough; the bottleneck is how we scaffold their self-management. The prompts now make work visible through the sidebar (Plan / Todos / Tasks / Agents) instead of letting the model work invisibly.
 - **Tool labels use progressive verbs.** "Read foo.rs" → "Reading foo.rs", "List X" → "Listing X", "Search pattern" → "Searching for `pattern`", "List files" → "Listing files". Past-tense labels read wrong while a tool is still in flight; the new forms match what the user actually sees.
 - **Long-running tools grow an elapsed badge.** From 3 s onward the `running` status segment becomes `running (3s)`, `running (4s)`, … so the user can tell a tool isn't stuck. The status-animation tick (360 ms) drives the redraw; below 3 s the badge stays hidden so quick reads/greps don't churn. (history.rs `running_status_label_with_elapsed`)
 - **Spinner pulse is twice as fast** — `TOOL_STATUS_SYMBOL_MS` 1800 ms → 720 ms per glyph (full 4-glyph heartbeat in ~2.88 s instead of ~7.2 s).
diff --git a/README.md b/README.md
index 0c86b5a1..38d729e2 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ DeepSeek TUI is a coding agent that runs entirely in your terminal. It gives Dee
 - 🧠 **Thinking-mode streaming** — watch DeepSeek's chain-of-thought as it reasons about your code
 - 🔧 **Full tool suite** — file ops, shell execution, git, web search/browse, apply-patch, sub-agents, MCP servers, and more
 - 🪟 **1M-token context** — feed entire codebases; automatic intelligent compaction when context fills up
-- 🎛️ **Three interaction modes** — Plan (read-only explore), Agent (interactive with approval), YOLO (auto-approved). All three can call `rlm_query` for parallel research
+- 🎛️ **Three interaction modes** — Plan (read-only explore), Agent (interactive with approval), YOLO (auto-approved). All three are guided by decomposition-first system prompts that teach the model to call `todo_write` and `update_plan`, and to spawn sub-agents, before acting
 - ⚡ **Reasoning-effort tiers** — cycle through `off → high → max` with Shift+Tab
 - 🔄 **Session save/resume** — checkpoint and resume long sessions, fork conversations
 - 🌐 **HTTP/SSE runtime API** — `deepseek serve --http` for headless agent workflows
@@ -163,11 +163,11 @@ deepseek serve --http                         # HTTP/SSE API server
 
 | Mode | Behavior |
 |---|---|
-| **Plan** 🔍 | Read-only investigation — model explores and proposes a plan before making changes |
-| **Agent** 🤖 | Default interactive mode — multi-step tool use with approval gates |
-| **YOLO** ⚡ | Auto-approve all tools in a trusted workspace (use with caution) |
+| **Plan** 🔍 | Read-only investigation — model explores and proposes a decomposition plan (`update_plan` + `todo_write`) before making changes |
+| **Agent** 🤖 | Default interactive mode — multi-step tool use with approval gates; model outlines work via `todo_write` before requesting writes |
+| **YOLO** ⚡ | Auto-approve all tools in a trusted workspace; model still calls `todo_write`/`update_plan` to keep work visible and trackable |
 
-All three modes have access to the `rlm_query` tool for parallel/batched LLM fan-out (see "What's new in v0.6.0" above).
+All three modes are guided by decomposition-first system prompts: the model is taught to break work into verifiable tasks, track them in the sidebar, and fan out sub-agents for parallel work — "managing the geniuses" rather than just running single-shot prompts.
 
 ---
 
diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index 26ce3266..3743e776 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -4,8 +4,6 @@
 //! Responses probe remains available behind `DEEPSEEK_EXPERIMENTAL_RESPONSES_API`
 //! for local compatibility experiments, but normal traffic uses chat completions.
 
-use std::collections::HashSet;
-use std::pin::Pin;
 use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
 use std::sync::{Arc, Mutex as StdMutex, OnceLock};
 use std::time::{Duration, Instant};
@@ -22,12 +20,9 @@ use crate::llm_client::{
     with_retry,
 };
 use crate::logging;
-use crate::models::{
-    ContentBlock, ContentBlockStart, Delta, Message, MessageDelta, MessageRequest, MessageResponse,
-    ServerToolUsage, StreamEvent, SystemPrompt, Tool, ToolCaller, Usage,
-};
+use crate::models::{MessageRequest, MessageResponse, ServerToolUsage, SystemPrompt, Usage};
 
-fn to_api_tool_name(name: &str) -> String {
+pub(super) fn to_api_tool_name(name: &str) -> String {
     let mut out = String::new();
     for ch in name.chars() {
         if ch.is_ascii_alphanumeric() || ch == '_' {
@@ -43,7 +38,7 @@ fn to_api_tool_name(name: &str) -> String {
     out
 }
 
-fn from_api_tool_name(name: &str) -> String {
+pub(super) fn from_api_tool_name(name: &str) -> String {
     let mut out = String::new();
     let mut iter = name.chars().peekable();
     while let Some(ch) = iter.next() {
@@ -93,7 +88,7 @@ fn from_api_tool_name(name: &str) -> String {
 /// Decode bare `x[0-9A-Fa-f]{6}` sequences (optionally followed by `-`)
 /// that survive the standard delimiter-based pass.  This handles cases
 /// where the model strips or replaces the leading `-` of `-x00002E-`.
-fn decode_bare_hex_escapes(input: &str) -> String {
+pub(super) fn decode_bare_hex_escapes(input: &str) -> String {
     use regex::Regex;
     use std::sync::OnceLock;
 
@@ -129,10 +124,10 @@ pub struct AvailableModel {
 /// Client for DeepSeek's OpenAI-compatible APIs.
 #[must_use]
 pub struct DeepSeekClient {
-    http_client: reqwest::Client,
+    pub(super) http_client: reqwest::Client,
     api_key: String,
-    base_url: String,
-    api_provider: ApiProvider,
+    pub(super) base_url: String,
+    pub(super) api_provider: ApiProvider,
     retry: RetryPolicy,
     default_model: String,
     use_chat_completions: AtomicBool,
@@ -155,9 +150,9 @@ const DEFAULT_CLIENT_RATE_LIMIT_BURST: f64 = 16.0;
 const ALLOW_INSECURE_HTTP_ENV: &str = "DEEPSEEK_ALLOW_INSECURE_HTTP";
 const EXPERIMENTAL_RESPONSES_API_ENV: &str = "DEEPSEEK_EXPERIMENTAL_RESPONSES_API";
 
-const SSE_BACKPRESSURE_HIGH_WATERMARK: usize = 8 * 1024 * 1024; // 8 MB
-const SSE_BACKPRESSURE_SLEEP_MS: u64 = 10;
-const SSE_MAX_LINES_PER_CHUNK: usize = 256;
+pub(super) const SSE_BACKPRESSURE_HIGH_WATERMARK: usize = 8 * 1024 * 1024; // 8 MB
+pub(super) const SSE_BACKPRESSURE_SLEEP_MS: u64 = 10;
+pub(super) const SSE_MAX_LINES_PER_CHUNK: usize = 256;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum ConnectionState {
@@ -326,10 +321,10 @@ impl Clone for DeepSeekClient {
 // === Helpers ===
 
 /// Maximum bytes to read from an error response body (64 KB).
-const ERROR_BODY_MAX_BYTES: usize = 64 * 1024;
+pub(super) const ERROR_BODY_MAX_BYTES: usize = 64 * 1024;
 
 /// Read an error response body with a size limit to prevent unbounded allocation.
-async fn bounded_error_text(response: reqwest::Response, max_bytes: usize) -> String {
+pub(super) async fn bounded_error_text(response: reqwest::Response, max_bytes: usize) -> String {
     use futures_util::StreamExt;
     let mut stream = response.bytes_stream();
     let mut buf = Vec::with_capacity(max_bytes.min(8192));
@@ -387,7 +382,7 @@ fn experimental_responses_api_enabled() -> bool {
         .is_some_and(|v| v == "1" || v.eq_ignore_ascii_case("true"))
 }
 
-fn versioned_base_url(base_url: &str) -> String {
+pub(super) fn versioned_base_url(base_url: &str) -> String {
     let trimmed = base_url.trim_end_matches('/');
     if trimmed.ends_with("/v1") || trimmed.ends_with("/beta") {
         trimmed.to_string()
@@ -396,7 +391,7 @@ fn versioned_base_url(base_url: &str) -> String {
     }
 }
 
-fn api_url(base_url: &str, path: &str) -> String {
+pub(super) fn api_url(base_url: &str, path: &str) -> String {
     format!(
         "{}/{}",
         versioned_base_url(base_url).trim_end_matches('/'),
@@ -522,7 +517,7 @@ impl DeepSeekClient {
         }
     }
 
-    async fn send_with_retry<F>(&self, mut build: F) -> Result<reqwest::Response>
+    pub(super) async fn send_with_retry<F>(&self, mut build: F) -> Result<reqwest::Response>
     where
         F: FnMut() -> reqwest::RequestBuilder,
     {
@@ -583,114 +578,8 @@ impl DeepSeekClient {
             }
         }
     }
-
-    async fn create_message_responses(
-        &self,
-        request: &MessageRequest,
-    ) -> Result<Result<MessageResponse, ResponsesFallback>> {
-        let mut body = json!({
-            "model": request.model,
-            "input": build_responses_input(&request.messages),
-            "store": false,
-            "max_output_tokens": request.max_tokens,
-        });
-
-        if let Some(instructions) = system_to_instructions(request.system.clone()) {
-            body["instructions"] = json!(instructions);
-        }
-        if let Some(temperature) = request.temperature {
-            body["temperature"] = json!(temperature);
-        }
-        if let Some(top_p) = request.top_p {
-            body["top_p"] = json!(top_p);
-        }
-        if let Some(tools) = request.tools.as_ref() {
-            body["tools"] = json!(tools.iter().map(tool_to_responses).collect::<Vec<_>>());
-        }
-        if let Some(choice) = request.tool_choice.as_ref() {
-            body["tool_choice"] = choice.clone();
-        }
-        apply_reasoning_effort(
-            &mut body,
-            request.reasoning_effort.as_deref(),
-            self.api_provider,
-        );
-
-        let url = api_url(&self.base_url, "responses");
-        let response = self
-            .send_with_retry(|| self.http_client.post(&url).json(&body))
-            .await?;
-
-        let status = response.status();
-
-        if status.as_u16() == 404 || status.as_u16() == 405 {
-            let body = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
-            return Ok(Err(ResponsesFallback {
-                status: status.as_u16(),
-                body,
-            }));
-        }
-
-        if !status.is_success() {
-            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
-            anyhow::bail!("Failed to call DeepSeek Responses API: HTTP {status}: {error_text}");
-        }
-
-        let response_text = response.text().await.unwrap_or_default();
-        let value: Value =
-            serde_json::from_str(&response_text).context("Failed to parse Responses API JSON")?;
-        let message = parse_responses_message(&value)?;
-        Ok(Ok(message))
-    }
-
-    async fn create_message_chat(&self, request: &MessageRequest) -> Result<MessageResponse> {
-        let messages = build_chat_messages_for_request(request);
-        let mut body = json!({
-            "model": request.model,
-            "messages": messages,
-            "max_tokens": request.max_tokens,
-        });
-
-        if let Some(temperature) = request.temperature {
-            body["temperature"] = json!(temperature);
-        }
-        if let Some(top_p) = request.top_p {
-            body["top_p"] = json!(top_p);
-        }
-        if let Some(tools) = request.tools.as_ref() {
-            body["tools"] = json!(tools.iter().map(tool_to_chat).collect::<Vec<_>>());
-        }
-        if let Some(choice) = request.tool_choice.as_ref()
-            && let Some(mapped) = map_tool_choice_for_chat(choice)
-        {
-            body["tool_choice"] = mapped;
-        }
-        apply_reasoning_effort(
-            &mut body,
-            request.reasoning_effort.as_deref(),
-            self.api_provider,
-        );
-
-        let url = api_url(&self.base_url, "chat/completions");
-        let response = self
-            .send_with_retry(|| self.http_client.post(&url).json(&body))
-            .await?;
-
-        let status = response.status();
-        if !status.is_success() {
-            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
-            anyhow::bail!("Failed to call DeepSeek Chat API: HTTP {status}: {error_text}");
-        }
-
-        let response_text = response.text().await.unwrap_or_default();
-        let value: Value =
-            serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
-        parse_chat_message(&value)
-    }
 }
 
-// === Trait Implementations ===
-
 impl LlmClient for DeepSeekClient {
     fn provider_name(&self) -> &'static str {
         self.api_provider.as_str()
@@ -769,212 +658,10 @@ impl LlmClient for DeepSeekClient {
     }
 
     async fn create_message_stream(&self, request: MessageRequest) -> Result<StreamEventBox> {
-        // Try true SSE streaming via chat completions (widely supported)
-        let messages = build_chat_messages_for_request(&request);
-        let mut body = json!({
-            "model": request.model,
-            "messages": messages,
-            "max_tokens": request.max_tokens,
-            "stream": true,
-            "stream_options": {
-                "include_usage": true
-            },
-        });
-
-        if let Some(temperature) = request.temperature {
-            body["temperature"] = json!(temperature);
-        }
-        if let Some(top_p) = request.top_p {
-            body["top_p"] = json!(top_p);
-        }
-        if let Some(tools) = request.tools.as_ref() {
-            body["tools"] = json!(tools.iter().map(tool_to_chat).collect::<Vec<_>>());
-        }
-        if let Some(choice) = request.tool_choice.as_ref()
-            && let Some(mapped) = map_tool_choice_for_chat(choice)
-        {
-            body["tool_choice"] = mapped;
-        }
-        apply_reasoning_effort(
-            &mut body,
-            request.reasoning_effort.as_deref(),
-            self.api_provider,
-        );
-
-        // Bulletproof final sanitizer: walk the wire payload and force
-        // `reasoning_content` onto any assistant message that has tool_calls
-        // but no reasoning_content. DeepSeek's thinking-mode API rejects
-        // such messages with a 400. This is the last line of defense after
-        // engine-side and build-side substitution; if either upstream path
-        // misses a case (e.g. a session restored from disk, a sub-agent
-        // adding messages directly, or a cached prefix mismatch), this pass
-        // still produces a valid request.
-        let replay_input_tokens = sanitize_thinking_mode_messages(
-            &mut body,
-            &request.model,
-            request.reasoning_effort.as_deref(),
-        );
-
-        let url = api_url(&self.base_url, "chat/completions");
-        let response = self
-            .send_with_retry(|| self.http_client.post(&url).json(&body))
-            .await?;
-
-        let status = response.status();
-        if !status.is_success() {
-            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
-            // If DeepSeek rejected for missing reasoning_content despite the
-            // sanitizer, dump the offending indices so we can diagnose where
-            // they came from on the next failure.
-            if error_text.contains("reasoning_content") {
-                log_thinking_mode_violations(&body);
-            }
-            anyhow::bail!("SSE stream request failed: HTTP {status}: {error_text}");
-        }
-
-        let model = request.model.clone();
-        let byte_stream = response.bytes_stream();
-
-        let stream = async_stream::stream! {
-            use futures_util::StreamExt;
-
-            // Emit a synthetic MessageStart
-            yield Ok(StreamEvent::MessageStart {
-                message: MessageResponse {
-                    id: String::new(),
-                    r#type: "message".to_string(),
-                    role: "assistant".to_string(),
-                    content: Vec::new(),
-                    model: model.clone(),
-                    stop_reason: None,
-                    stop_sequence: None,
-                    container: None,
-                    usage: Usage {
-                        input_tokens: 0,
-                        output_tokens: 0,
-                        ..Usage::default()
-                    },
-                },
-            });
-
-            let mut line_buf = String::new();
-            let mut byte_buf = acquire_stream_buffer();
-            let mut content_index: u32 = 0;
-            let mut text_started = false;
-            let mut thinking_started = false;
-            let mut tool_indices: std::collections::HashMap<u32, u32> = std::collections::HashMap::new();
-            let is_reasoning_model = requires_reasoning_content(&model);
-
-            let mut byte_stream = std::pin::pin!(byte_stream);
-
-            while let Some(chunk_result) = byte_stream.next().await {
-                let chunk = match chunk_result {
-                    Ok(bytes) => bytes,
-                    Err(e) => {
-                        yield Err(anyhow::anyhow!("Stream read error: {e}"));
-                        break;
-                    }
-                };
-
-                byte_buf.extend_from_slice(&chunk);
-
-                // Guard against unbounded buffer growth (e.g., malformed stream without newlines)
-                const MAX_SSE_BUF: usize = 10 * 1024 * 1024; // 10 MB
-                if byte_buf.len() > MAX_SSE_BUF {
-                    yield Err(anyhow::anyhow!("SSE buffer exceeded {MAX_SSE_BUF} bytes — aborting stream"));
-                    break;
-                }
-
-                if byte_buf.len() > SSE_BACKPRESSURE_HIGH_WATERMARK {
-                    tokio::time::sleep(Duration::from_millis(SSE_BACKPRESSURE_SLEEP_MS)).await;
-                }
-
-                // Process complete SSE lines from the buffer
-                let mut lines_processed = 0usize;
-                while let Some(newline_pos) = byte_buf.iter().position(|&b| b == b'\n') {
-                    let mut end = newline_pos;
-                    if end > 0 && byte_buf[end - 1] == b'\r' {
-                        end -= 1;
-                    }
-                    let line = String::from_utf8_lossy(&byte_buf[..end]).into_owned();
-                    byte_buf.drain(..newline_pos + 1);
-
-                    if line.is_empty() {
-                        // Empty line = event boundary, process accumulated data
-                        if !line_buf.is_empty() {
-                            let data = std::mem::take(&mut line_buf);
-                            if data.trim() == "[DONE]" {
-                                // Stream complete
-                            } else if let Ok(chunk_json) = serde_json::from_str::<Value>(&data) {
-                                // Parse the SSE chunk into stream events
-                                for mut event in parse_sse_chunk(
-                                    &chunk_json,
-                                    &mut content_index,
-                                    &mut text_started,
-                                    &mut thinking_started,
-                                    &mut tool_indices,
-                                    is_reasoning_model,
-                                ) {
-                                    // Stamp the client-side replay-token estimate
-                                    // onto the final usage so the UI can surface
-                                    // it (#30). We compute it pre-request and
-                                    // overlay it on the server-reported usage at
-                                    // stream completion.
-                                    if let Some(tokens) = replay_input_tokens
-                                        && let StreamEvent::MessageDelta {
-                                            usage: Some(usage),
-                                            ..
-                                        } = &mut event
-                                    {
-                                        usage.reasoning_replay_tokens = Some(tokens);
-                                    }
-                                    yield Ok(event);
-                                }
-                            }
-                        }
-                        continue;
-                    }
-
-                    if let Some(data) = line.strip_prefix("data: ") {
-                        line_buf.push_str(data);
-                    }
-                    // Ignore other SSE fields (event:, id:, retry:)
-
-                    lines_processed = lines_processed.saturating_add(1);
-                    if lines_processed >= SSE_MAX_LINES_PER_CHUNK {
-                        // Yield backpressure relief to avoid starving downstream consumers.
-                        break;
-                    }
-                }
-            }
-
-            // Close any open blocks
-            if thinking_started {
-                yield Ok(StreamEvent::ContentBlockStop { index: content_index.saturating_sub(1) });
-            }
-            if text_started {
-                yield Ok(StreamEvent::ContentBlockStop { index: content_index.saturating_sub(1) });
-            }
-
-            release_stream_buffer(byte_buf);
-            yield Ok(StreamEvent::MessageStop);
-        };
-
-        Ok(Pin::from(Box::new(stream)
-            as Box<
-                dyn futures_util::Stream<Item = Result<StreamEvent>> + Send,
-            >))
+        self.handle_chat_completion_stream(request).await
     }
 }
 
-// === Responses API Helpers ===
-
-#[derive(Debug)]
-struct ResponsesFallback {
-    status: u16,
-    body: String,
-}
-
 #[derive(Debug, Deserialize)]
 struct ModelsListResponse {
     data: Vec<ModelListItem>,
@@ -989,7 +676,7 @@ struct ModelListItem {
     created: Option<u64>,
 }
 
-fn parse_models_response(payload: &str) -> Result<Vec<AvailableModel>> {
+pub(super) fn parse_models_response(payload: &str) -> Result<Vec<AvailableModel>> {
     let parsed: ModelsListResponse =
         serde_json::from_str(payload).context("Failed to parse model list JSON")?;
 
@@ -1007,7 +694,7 @@ fn parse_models_response(payload: &str) -> Result<Vec<AvailableModel>> {
     Ok(models)
 }
 
-fn system_to_instructions(system: Option<SystemPrompt>) -> Option<String> {
+pub(super) fn system_to_instructions(system: Option<SystemPrompt>) -> Option<String> {
     match system {
         Some(SystemPrompt::Text(text)) => Some(text),
         Some(SystemPrompt::Blocks(blocks)) => {
@@ -1026,823 +713,11 @@ fn system_to_instructions(system: Option<SystemPrompt>) -> Option<String> {
     }
 }
 
-fn build_responses_input(messages: &[Message]) -> Vec<Value> {
-    let mut items = Vec::new();
-
-    for message in messages {
-        let role = message.role.as_str();
-        let text_type = if role == "user" {
-            "input_text"
-        } else {
-            "output_text"
-        };
-
-        for block in &message.content {
-            match block {
-                ContentBlock::Text { text, .. } => {
-                    items.push(json!({
-                        "type": "message",
-                        "role": role,
-                        "content": [{
-                            "type": text_type,
-                            "text": text,
-                        }]
-                    }));
-                }
-                ContentBlock::ToolUse {
-                    id,
-                    name,
-                    input,
-                    caller,
-                } => {
-                    let args = serde_json::to_string(input).unwrap_or_else(|_| input.to_string());
-                    let mut item = json!({
-                        "type": "function_call",
-                        "call_id": id,
-                        "name": to_api_tool_name(name),
-                        "arguments": args,
-                    });
-                    if let Some(caller) = caller {
-                        item["caller"] = json!({
-                            "type": caller.caller_type,
-                            "tool_id": caller.tool_id,
-                        });
-                    }
-                    items.push(item);
-                }
-                ContentBlock::ToolResult {
-                    tool_use_id,
-                    content,
-                    is_error,
-                    ..
-                } => {
-                    let mut item = json!({
-                        "type": "function_call_output",
-                        "call_id": tool_use_id,
-                        "output": content,
-                    });
-                    if let Some(is_error) = is_error {
-                        item["is_error"] = json!(is_error);
-                    }
-                    items.push(item);
-                }
-                ContentBlock::Thinking { .. } => {}
-                ContentBlock::ServerToolUse { id, name, input } => {
-                    items.push(json!({
-                        "type": "server_tool_use",
-                        "id": id,
-                        "name": name,
-                        "input": input,
-                    }));
-                }
-                ContentBlock::ToolSearchToolResult {
-                    tool_use_id,
-                    content,
-                } => {
-                    items.push(json!({
-                        "type": "tool_search_tool_result",
-                        "tool_use_id": tool_use_id,
-                        "content": content,
-                    }));
-                }
-                ContentBlock::CodeExecutionToolResult {
-                    tool_use_id,
-                    content,
-                } => {
-                    items.push(json!({
-                        "type": "code_execution_tool_result",
-                        "tool_use_id": tool_use_id,
-                        "content": content,
-                    }));
-                }
-            }
-        }
-    }
-
-    items
-}
-
-fn tool_to_responses(tool: &Tool) -> Value {
-    let tool_type = tool.tool_type.as_deref().unwrap_or("function");
-    let mut value = if tool_type == "function" {
-        json!({
-            "type": "function",
-            "name": to_api_tool_name(&tool.name),
-            "description": tool.description,
-            "parameters": tool.input_schema,
-        })
-    } else if tool_type == "code_execution_20250825" {
-        json!({
-            "type": tool_type,
-            "name": to_api_tool_name(&tool.name),
-        })
-    } else {
-        json!({
-            "type": tool_type,
-            "name": to_api_tool_name(&tool.name),
-            "description": tool.description,
-            "input_schema": tool.input_schema,
-        })
-    };
-
-    if let Some(allowed_callers) = &tool.allowed_callers {
-        value["allowed_callers"] = json!(allowed_callers);
-    }
-    if let Some(defer_loading) = tool.defer_loading {
-        value["defer_loading"] = json!(defer_loading);
-    }
-    if let Some(input_examples) = &tool.input_examples {
-        value["input_examples"] = json!(input_examples);
-    }
-    if let Some(strict) = tool.strict {
-        value["strict"] = json!(strict);
-    }
-    value
-}
-
-fn parse_responses_message(payload: &Value) -> Result<MessageResponse> {
-    let id = payload
-        .get("id")
-        .and_then(Value::as_str)
-        .unwrap_or("response")
-        .to_string();
-    let model = payload
-        .get("model")
-        .and_then(Value::as_str)
-        .unwrap_or("unknown")
-        .to_string();
-
-    let usage = parse_usage(payload.get("usage"));
-    let mut content = Vec::new();
-
-    if let Some(output) = payload.get("output").and_then(Value::as_array) {
-        for item in output {
-            let item_type = item.get("type").and_then(Value::as_str).unwrap_or("");
-            match item_type {
-                "message" => {
-                    if let Some(role) = item.get("role").and_then(Value::as_str)
-                        && role != "assistant"
-                    {
-                        continue;
-                    }
-                    if let Some(content_items) = item.get("content").and_then(Value::as_array) {
-                        for content_item in content_items {
-                            let content_type = content_item
-                                .get("type")
-                                .and_then(Value::as_str)
-                                .unwrap_or("output_text");
-                            if content_type != "output_text" && content_type != "text" {
-                                continue;
-                            }
-                            if let Some(text) = content_item.get("text").and_then(Value::as_str)
-                                && !text.trim().is_empty()
-                            {
-                                content.push(ContentBlock::Text {
-                                    text: text.to_string(),
-                                    cache_control: None,
-                                });
-                            }
-                        }
-                    }
-                }
-                "function_call" => {
-                    let call_id = item
-                        .get("call_id")
-                        .or_else(|| item.get("id"))
-                        .and_then(Value::as_str)
-                        .unwrap_or("tool_call")
-                        .to_string();
-                    let name = item
-                        .get("name")
-                        .and_then(Value::as_str)
-                        .unwrap_or("tool")
-                        .to_string();
-                    let input = match item.get("arguments") {
-                        Some(Value::String(raw)) => {
-                            serde_json::from_str(raw).unwrap_or_else(|_| Value::String(raw.clone()))
-                        }
-                        Some(other) => other.clone(),
-                        None => Value::Null,
-                    };
-                    let caller = item.get("caller").and_then(|v| {
-                        v.get("type")
-                            .and_then(Value::as_str)
-                            .map(|caller_type| ToolCaller {
-                                caller_type: caller_type.to_string(),
-                                tool_id: v
-                                    .get("tool_id")
-                                    .and_then(Value::as_str)
-                                    .map(std::string::ToString::to_string),
-                            })
-                    });
-                    content.push(ContentBlock::ToolUse {
-                        id: call_id,
-                        name: from_api_tool_name(&name),
-                        input,
-                        caller,
-                    });
-                }
-                "function_call_output" => {
-                    let tool_use_id = item
-                        .get("call_id")
-                        .or_else(|| item.get("tool_use_id"))
-                        .and_then(Value::as_str)
-                        .unwrap_or("tool_call")
-                        .to_string();
-                    let content_text = item
-                        .get("output")
-                        .or_else(|| item.get("content"))
-                        .map(|v| {
-                            if let Some(s) = v.as_str() {
-                                s.to_string()
-                            } else {
-                                v.to_string()
-                            }
-                        })
-                        .unwrap_or_default();
-                    let is_error = item.get("is_error").and_then(Value::as_bool);
-                    content.push(ContentBlock::ToolResult {
-                        tool_use_id,
-                        content: content_text,
-                        is_error,
-                        content_blocks: None,
-                    });
-                }
-                "server_tool_use" => {
-                    let id = item
-                        .get("id")
-                        .and_then(Value::as_str)
-                        .unwrap_or("server_tool")
-                        .to_string();
-                    let name = item
-                        .get("name")
-                        .and_then(Value::as_str)
-                        .unwrap_or("server_tool")
-                        .to_string();
-                    let input = item.get("input").cloned().unwrap_or(Value::Null);
-                    content.push(ContentBlock::ServerToolUse { id, name, input });
-                }
-                "tool_search_tool_result" => {
-                    let tool_use_id = item
-                        .get("tool_use_id")
-                        .and_then(Value::as_str)
-                        .unwrap_or("tool_search")
-                        .to_string();
-                    let content_value = item.get("content").cloned().unwrap_or(Value::Null);
-                    content.push(ContentBlock::ToolSearchToolResult {
-                        tool_use_id,
-                        content: content_value,
-                    });
-                }
-                "code_execution_tool_result" => {
-                    let tool_use_id = item
-                        .get("tool_use_id")
-                        .and_then(Value::as_str)
-                        .unwrap_or("code_execution")
-                        .to_string();
-                    let content_value = item.get("content").cloned().unwrap_or(Value::Null);
-                    content.push(ContentBlock::CodeExecutionToolResult {
-                        tool_use_id,
-                        content: content_value,
-                    });
-                }
-                "reasoning" => {
-                    if let Some(summary) = item.get("summary").and_then(Value::as_array) {
-                        let summary_text = summary
-                            .iter()
-                            .filter_map(|s| s.get("text").and_then(Value::as_str))
-                            .collect::<Vec<_>>()
-                            .join("\n");
-                        if !summary_text.trim().is_empty() {
-                            content.push(ContentBlock::Thinking {
-                                thinking: summary_text,
-                            });
-                        }
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-
-    if content.is_empty()
-        && let Some(text) = payload.get("output_text").and_then(Value::as_str)
-        && !text.trim().is_empty()
-    {
-        content.push(ContentBlock::Text {
-            text: text.to_string(),
-            cache_control: None,
-        });
-    }
-
-    Ok(MessageResponse {
-        id,
-        r#type: "message".to_string(),
-        role: "assistant".to_string(),
-        content,
-        model,
-        stop_reason: None,
-        stop_sequence: None,
-        container: payload
-            .get("container")
-            .cloned()
-            .and_then(|v| serde_json::from_value(v).ok()),
-        usage,
-    })
-}
-
-// === Chat Completions Helpers ===
-
-#[cfg(test)]
-fn build_chat_messages(
-    system: Option<&SystemPrompt>,
-    messages: &[Message],
-    model: &str,
-) -> Vec<Value> {
-    build_chat_messages_with_reasoning(
-        system,
-        messages,
-        model,
-        should_replay_reasoning_content(model, None),
-    )
-}
-
-fn build_chat_messages_for_request(request: &MessageRequest) -> Vec<Value> {
-    build_chat_messages_with_reasoning(
-        request.system.as_ref(),
-        &request.messages,
-        &request.model,
-        should_replay_reasoning_content(&request.model, request.reasoning_effort.as_deref()),
-    )
-}
-
-fn build_chat_messages_with_reasoning(
-    system: Option<&SystemPrompt>,
-    messages: &[Message],
-    _model: &str,
-    include_reasoning: bool,
-) -> Vec<Value> {
-    let mut out = Vec::new();
-    let mut pending_tool_calls: HashSet<String> = HashSet::new();
-
-    if let Some(instructions) = system_to_instructions(system.cloned())
-        && !instructions.trim().is_empty()
-    {
-        out.push(json!({
-            "role": "system",
-            "content": instructions,
-        }));
-    }
-
-    for message in messages.iter() {
-        let role = message.role.as_str();
-        let mut text_parts = Vec::new();
-        let mut thinking_parts = Vec::new();
-        let mut tool_calls = Vec::new();
-        let mut tool_call_ids = Vec::new();
-        let mut tool_results: Vec<(String, Value)> = Vec::new();
-
-        for block in &message.content {
-            match block {
-                ContentBlock::Text { text, .. } => text_parts.push(text.clone()),
-                ContentBlock::Thinking { thinking } => thinking_parts.push(thinking.clone()),
-                ContentBlock::ToolUse {
-                    id,
-                    name,
-                    input,
-                    caller,
-                    ..
-                } => {
-                    let args = serde_json::to_string(input).unwrap_or_else(|_| input.to_string());
-                    let mut call = json!({
-                        "id": id,
-                        "type": "function",
-                        "function": {
-                            "name": to_api_tool_name(name),
-                            "arguments": args,
-                        }
-                    });
-                    if let Some(caller) = caller {
-                        call["caller"] = json!({
-                            "type": caller.caller_type,
-                            "tool_id": caller.tool_id,
-                        });
-                    }
-                    tool_calls.push(call);
-                    tool_call_ids.push(id.clone());
-                }
-                ContentBlock::ToolResult {
-                    tool_use_id,
-                    content,
-                    ..
-                } => {
-                    tool_results.push((
-                        tool_use_id.clone(),
-                        json!({
-                            "role": "tool",
-                            "tool_call_id": tool_use_id,
-                            "content": content,
-                        }),
-                    ));
-                }
-                ContentBlock::ServerToolUse { .. }
-                | ContentBlock::ToolSearchToolResult { .. }
-                | ContentBlock::CodeExecutionToolResult { .. } => {}
-            }
-        }
-
-        if role == "assistant" {
-            let content = text_parts.join("\n");
-            let mut reasoning_content = thinking_parts.join("\n");
-            let has_text = !content.trim().is_empty();
-            let has_tool_calls = !tool_calls.is_empty();
-            // DeepSeek thinking-mode rule: every assistant message in the
-            // conversation must carry its `reasoning_content` when thinking
-            // is enabled. The docs say non-tool-call messages' reasoning is
-            // "ignored", but the API still validates presence and rejects
-            // with a 400 if any assistant message is missing it. If reasoning
-            // was lost (e.g. a session checkpoint from before this rule was
-            // enforced, or a sub-turn with no streamed reasoning text),
-            // substitute a non-empty placeholder so the API accepts the
-            // request.
-            let include_reasoning_for_turn = include_reasoning;
-            let mut has_reasoning =
-                include_reasoning_for_turn && !reasoning_content.trim().is_empty();
-            if include_reasoning_for_turn && !has_reasoning {
-                logging::warn(
-                    "Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
-                );
-                reasoning_content = String::from("(reasoning omitted)");
-                has_reasoning = true;
-            }
-
-            // DeepSeek rejects assistant messages where both `content` and
-            // `tool_calls` are missing/null. Skip such entries even if they
-            // carry reasoning-only metadata unless we can send a non-null
-            // placeholder content field.
-            if !has_text && !has_tool_calls && !has_reasoning {
-                pending_tool_calls.clear();
-                continue;
-            }
-
-            let mut msg = json!({
-                "role": "assistant",
-                "content": if has_text {
-                    json!(content)
-                } else if has_reasoning {
-                    json!("")
-                } else {
-                    Value::Null
-                },
-            });
-            if has_reasoning {
-                msg["reasoning_content"] = json!(reasoning_content);
-            }
-            if has_tool_calls {
-                msg["tool_calls"] = json!(tool_calls);
-                pending_tool_calls = tool_call_ids.into_iter().collect();
-            } else {
-                pending_tool_calls.clear();
-            }
-            out.push(msg);
-        } else if role == "user" {
-            let content = text_parts.join("\n");
-            if !content.trim().is_empty() {
-                out.push(json!({
-                    "role": "user",
-                    "content": content,
-                }));
-            }
-        }
-
-        if !tool_results.is_empty() {
-            if pending_tool_calls.is_empty() {
-                logging::warn("Dropping tool results without matching tool_calls");
-            } else {
-                for (tool_id, tool_msg) in tool_results {
-                    if pending_tool_calls.remove(&tool_id) {
-                        out.push(tool_msg);
-                    } else {
-                        logging::warn(format!(
-                            "Dropping tool result for unknown tool_call_id: {tool_id}"
-                        ));
-                    }
-                }
-            }
-        } else if role != "assistant" {
-            pending_tool_calls.clear();
-        }
-    }
-
-    // Safety net: after compaction, an assistant message may have tool_calls
-    // whose results were summarized away. The API rejects these, so strip
-    // the tool_calls (downgrading to a plain assistant message) and remove
-    // the now-orphaned tool result messages.
-    let mut i = 0;
-    while i < out.len() {
-        let is_assistant_with_tools = out[i].get("role").and_then(Value::as_str)
-            == Some("assistant")
-            && out[i].get("tool_calls").is_some();
-
-        if is_assistant_with_tools {
-            let expected_ids: HashSet<String> = out[i]
-                .get("tool_calls")
-                .and_then(Value::as_array)
-                .map(|calls| {
-                    calls
-                        .iter()
-                        .filter_map(|c| c.get("id").and_then(Value::as_str).map(String::from))
-                        .collect()
-                })
-                .unwrap_or_default();
-
-            // Collect tool result IDs immediately following this assistant message.
-            let mut found_ids: HashSet<String> = HashSet::new();
-            let mut tool_result_end = i + 1;
-            while tool_result_end < out.len() {
-                if out[tool_result_end].get("role").and_then(Value::as_str) == Some("tool") {
-                    if let Some(id) = out[tool_result_end]
-                        .get("tool_call_id")
-                        .and_then(Value::as_str)
-                    {
-                        found_ids.insert(id.to_string());
-                    }
-                    tool_result_end += 1;
-                } else {
-                    break;
-                }
-            }
-
-            // Also scan non-contiguous tool results up to the next assistant message
-            // in case compaction left gaps.
-            let mut scan = tool_result_end;
-            while scan < out.len() {
-                if out[scan].get("role").and_then(Value::as_str) == Some("assistant") {
-                    break;
-                }
-                if out[scan].get("role").and_then(Value::as_str) == Some("tool")
-                    && let Some(id) = out[scan].get("tool_call_id").and_then(Value::as_str)
-                {
-                    found_ids.insert(id.to_string());
-                }
-                scan += 1;
-            }
-
-            if !expected_ids.is_subset(&found_ids) {
-                let missing: Vec<_> = expected_ids.difference(&found_ids).collect();
-                logging::warn(format!(
-                    "Stripping orphaned tool_calls from assistant message \
-                     (expected {} tool results, found {}, missing: {:?})",
-                    expected_ids.len(),
-                    found_ids.len(),
-                    missing
-                ));
-                if let Some(obj) = out[i].as_object_mut() {
-                    obj.remove("tool_calls");
-                }
-                // If tool_calls were the only assistant content, remove the now-invalid
-                // assistant message entirely (DeepSeek requires content or tool_calls).
-                let assistant_content_empty = out[i]
-                    .get("content")
-                    .is_none_or(|v| v.is_null() || v.as_str().is_some_and(str::is_empty));
-                if assistant_content_empty {
-                    // Remove orphaned tool results tied to this stripped assistant call set.
-                    let mut j = out.len();
-                    while j > i + 1 {
-                        j -= 1;
-                        if out[j].get("role").and_then(Value::as_str) == Some("tool")
-                            && let Some(id) = out[j].get("tool_call_id").and_then(Value::as_str)
-                            && expected_ids.contains(id)
-                        {
-                            out.remove(j);
-                        }
-                    }
-                    out.remove(i);
-                    i = i.saturating_sub(1);
-                    continue;
-                }
-                // Remove contiguous tool results first
-                if tool_result_end > i + 1 {
-                    out.drain((i + 1)..tool_result_end);
-                }
-                // Remove any remaining non-contiguous tool results referencing expected_ids
-                // (scan backward to avoid index shifting issues)
-                let mut j = out.len();
-                while j > i + 1 {
-                    j -= 1;
-                    if out[j].get("role").and_then(Value::as_str) == Some("tool")
-                        && let Some(id) = out[j].get("tool_call_id").and_then(Value::as_str)
-                        && expected_ids.contains(id)
-                    {
-                        out.remove(j);
-                    }
-                }
-            }
-        }
-        i += 1;
-    }
-
-    out
-}
-
-fn tool_to_chat(tool: &Tool) -> Value {
-    let mut value = json!({
-        "type": "function",
-        "function": {
-            "name": to_api_tool_name(&tool.name),
-            "description": tool.description,
-            "parameters": tool.input_schema,
-        }
-    });
-    if let Some(allowed_callers) = &tool.allowed_callers {
-        value["allowed_callers"] = json!(allowed_callers);
-    }
-    if let Some(defer_loading) = tool.defer_loading {
-        value["defer_loading"] = json!(defer_loading);
-    }
-    if let Some(input_examples) = &tool.input_examples {
-        value["input_examples"] = json!(input_examples);
-    }
-    if let Some(strict) = tool.strict
-        && let Some(function) = value.get_mut("function")
-    {
-        function["strict"] = json!(strict);
-    }
-    value
-}
-
-fn map_tool_choice_for_chat(choice: &Value) -> Option<Value> {
-    if let Some(choice_str) = choice.as_str() {
-        return Some(json!(choice_str));
-    }
-    let Some(choice_type) = choice.get("type").and_then(Value::as_str) else {
-        return Some(choice.clone());
-    };
-
-    match choice_type {
-        "auto" | "none" => Some(json!(choice_type)),
-        "any" => Some(json!("auto")),
-        "tool" => choice.get("name").and_then(Value::as_str).map(|name| {
-            json!({
-                "type": "function",
-                "function": { "name": to_api_tool_name(name) }
-            })
-        }),
-        _ => Some(choice.clone()),
-    }
-}
-
-/// Final-pass sanitizer over the outgoing chat-completions JSON payload.
-/// Forces a non-empty `reasoning_content` onto every `assistant` message that
-/// carries `tool_calls`, when the model + effort combination requires it.
-/// DeepSeek's thinking-mode API rejects such messages with a 400 error;
-/// substituting a placeholder keeps the conversation chain intact.
-///
-/// Also tallies the size of all replayed `reasoning_content` and logs it, so
-/// users on `RUST_LOG=deepseek_tui=debug` can see how much of their input
-/// budget is being spent re-sending prior thinking traces (V4 §5.1.1
-/// "Interleaved Thinking" requires the full trace to be replayed across user
-/// message boundaries in tool-calling sessions).
-fn sanitize_thinking_mode_messages(
+pub(super) fn apply_reasoning_effort(
     body: &mut Value,
-    model: &str,
     effort: Option<&str>,
-) -> Option<u32> {
-    if !should_replay_reasoning_content(model, effort) {
-        return None;
-    }
-    let messages = body.get_mut("messages").and_then(Value::as_array_mut)?;
-    let mut substitutions: u32 = 0;
-    let mut replay_chars: u64 = 0;
-    let mut replay_messages: u32 = 0;
-    for (idx, msg) in messages.iter_mut().enumerate() {
-        if msg.get("role").and_then(Value::as_str) != Some("assistant") {
-            continue;
-        }
-        let needs_placeholder = msg
-            .get("reasoning_content")
-            .and_then(Value::as_str)
-            .is_none_or(|s| s.trim().is_empty());
-        if needs_placeholder {
-            msg["reasoning_content"] = json!("(reasoning omitted)");
-            substitutions = substitutions.saturating_add(1);
-            logging::warn(format!(
-                "Final sanitizer: forced reasoning_content placeholder on assistant[{idx}]",
-            ));
-        }
-        if let Some(reasoning) = msg.get("reasoning_content").and_then(Value::as_str) {
-            let len = reasoning.len() as u64;
-            if len > 0 {
-                replay_chars = replay_chars.saturating_add(len);
-                replay_messages = replay_messages.saturating_add(1);
-            }
-        }
-    }
-    if substitutions > 0 {
-        logging::warn(format!(
-            "Final sanitizer: {substitutions} assistant message(s) needed reasoning_content placeholder",
-        ));
-    }
-    if replay_messages == 0 {
-        return None;
-    }
-    // ~4 chars/token is the standard rough estimate; DeepSeek tokens skew
-    // a touch shorter on Chinese/code but this is order-of-magnitude info.
-    let approx_tokens = (replay_chars / 4).min(u64::from(u32::MAX)) as u32;
-    logging::info(format!(
-        "Reasoning-content replay: {replay_messages} assistant message(s), ~{approx_tokens} input tokens ({replay_chars} chars) being re-sent in this request",
-    ));
-    Some(approx_tokens)
-}
-
-/// Sums the byte length of `reasoning_content` across all assistant messages in
-/// an outgoing chat-completions body. Used by tests; the production sanitizer
-/// computes the same number inline and logs it.
-#[cfg(test)]
-fn count_reasoning_replay_chars(body: &Value) -> u64 {
-    let Some(messages) = body.get("messages").and_then(Value::as_array) else {
-        return 0;
-    };
-    messages
-        .iter()
-        .filter(|m| m.get("role").and_then(Value::as_str) == Some("assistant"))
-        .filter_map(|m| m.get("reasoning_content").and_then(Value::as_str))
-        .map(|s| s.len() as u64)
-        .sum()
-}
-
-/// Diagnostic logger fired when DeepSeek rejects the request despite the
-/// sanitizer. Walks the body and logs which assistant messages have tool_calls
-/// but no `reasoning_content` — useful to track down a code path that bypasses
-/// the sanitizer entirely.
-fn log_thinking_mode_violations(body: &Value) {
-    let Some(messages) = body.get("messages").and_then(Value::as_array) else {
-        logging::warn("400-after-sanitizer: body has no `messages` array");
-        return;
-    };
-    let mut violations: Vec<String> = Vec::new();
-    for (idx, msg) in messages.iter().enumerate() {
-        if msg.get("role").and_then(Value::as_str) != Some("assistant") {
-            continue;
-        }
-        let reasoning = msg
-            .get("reasoning_content")
-            .and_then(Value::as_str)
-            .unwrap_or("");
-        let has_tc = msg.get("tool_calls").is_some();
-        if reasoning.trim().is_empty() {
-            violations.push(format!(
-                "assistant[{idx}] (reasoning_content missing, tool_calls={})",
-                has_tc
-            ));
-        }
-    }
-    if violations.is_empty() {
-        logging::warn(
-            "400-after-sanitizer: all assistant messages have reasoning_content — DeepSeek rejected for a different reason",
-        );
-    } else {
-        logging::warn(format!(
-            "400-after-sanitizer: {} assistant message(s) lack reasoning_content despite sanitizer: {}",
-            violations.len(),
-            violations.join(", ")
-        ));
-    }
-}
-
-fn requires_reasoning_content(model: &str) -> bool {
-    let lower = model.to_lowercase();
-    lower.contains("deepseek-v3.2")
-        || lower.contains("deepseek-v4")
-        || lower.contains("reasoner")
-        || lower.contains("-reasoning")
-        || lower.contains("-thinking")
-        || has_deepseek_r_series_marker(&lower)
-}
-
-fn should_replay_reasoning_content(model: &str, effort: Option<&str>) -> bool {
-    if effort
-        .map(|value| {
-            matches!(
-                value.trim().to_ascii_lowercase().as_str(),
-                "off" | "disabled" | "none" | "false"
-            )
-        })
-        .unwrap_or(false)
-    {
-        return false;
-    }
-
-    requires_reasoning_content(model)
-}
-
-/// Translate the TUI's effort-tier string into provider-specific request fields.
-///
-/// The config surface accepts `off | low | medium | high | max`. DeepSeek
-/// itself collapses `low`/`medium` → `"high"` and `xhigh` → `"max"` at the
-/// API boundary (per their docs); NVIDIA NIM takes equivalent controls through
-/// `chat_template_kwargs`.
-fn apply_reasoning_effort(body: &mut Value, effort: Option<&str>, provider: ApiProvider) {
+    provider: ApiProvider,
+) {
     let Some(effort) = effort else {
         return;
     };
@@ -1853,10 +728,22 @@ fn apply_reasoning_effort(body: &mut Value, effort: Option<&str>, provider: ApiP
             ApiProvider::NvidiaNim => {
                 body["chat_template_kwargs"] = json!({
                     "thinking": false,
-                })
+                });
             }
         },
-        "max" | "maximum" | "xhigh" => match provider {
+        "low" | "minimal" | "medium" | "mid" | "high" | "" => match provider {
+            ApiProvider::Deepseek => {
+                body["reasoning_effort"] = json!("high");
+                body["thinking"] = json!({ "type": "enabled" });
+            }
+            ApiProvider::NvidiaNim => {
+                body["chat_template_kwargs"] = json!({
+                    "thinking": true,
+                    "reasoning_effort": "high",
+                });
+            }
+        },
+        "xhigh" | "max" | "highest" => match provider {
             ApiProvider::Deepseek => {
                 body["reasoning_effort"] = json!("max");
                 body["thinking"] = json!({ "type": "enabled" });
@@ -1868,21 +755,6 @@ fn apply_reasoning_effort(body: &mut Value, effort: Option<&str>, provider: ApiP
                 });
             }
         },
-        "low" | "minimal" | "medium" | "mid" | "high" | "" => {
-            match provider {
-                ApiProvider::Deepseek => {
-                    // Per DeepSeek docs: low/medium compat-map to "high".
-                    body["reasoning_effort"] = json!("high");
-                    body["thinking"] = json!({ "type": "enabled" });
-                }
-                ApiProvider::NvidiaNim => {
-                    body["chat_template_kwargs"] = json!({
-                        "thinking": true,
-                        "reasoning_effort": "high",
-                    });
-                }
-            }
-        }
         _ => {
             // Unknown value — do not mutate the request, let the provider
             // apply its own defaults.
@@ -1890,121 +762,7 @@ fn apply_reasoning_effort(body: &mut Value, effort: Option<&str>, provider: ApiP
     }
 }
 
-fn has_deepseek_r_series_marker(model_lower: &str) -> bool {
-    const PREFIX: &str = "deepseek-r";
-    model_lower.match_indices(PREFIX).any(|(idx, _)| {
-        model_lower[idx + PREFIX.len()..]
-            .chars()
-            .next()
-            .is_some_and(|ch| ch.is_ascii_digit())
-    })
-}
-
-fn reasoning_field(value: &Value) -> Option<&str> {
-    value
-        .get("reasoning_content")
-        .or_else(|| value.get("reasoning"))
-        .and_then(Value::as_str)
-}
-
-fn parse_chat_message(payload: &Value) -> Result<MessageResponse> {
-    let id = payload
-        .get("id")
-        .and_then(Value::as_str)
-        .unwrap_or("chatcmpl")
-        .to_string();
-    let model = payload
-        .get("model")
-        .and_then(Value::as_str)
-        .unwrap_or("unknown")
-        .to_string();
-
-    let choices = payload
-        .get("choices")
-        .and_then(Value::as_array)
-        .context("Chat API response missing choices")?;
-    let choice = choices
-        .first()
-        .context("Chat API response missing first choice")?;
-    let message = choice
-        .get("message")
-        .context("Chat API response missing message")?;
-
-    let mut content_blocks = Vec::new();
-    if let Some(reasoning) =
-        reasoning_field(message).filter(|reasoning| !reasoning.trim().is_empty())
-    {
-        content_blocks.push(ContentBlock::Thinking {
-            thinking: reasoning.to_string(),
-        });
-    }
-    if let Some(text) = message.get("content").and_then(Value::as_str)
-        && !text.trim().is_empty()
-    {
-        content_blocks.push(ContentBlock::Text {
-            text: text.to_string(),
-            cache_control: None,
-        });
-    }
-
-    if let Some(tool_calls) = message.get("tool_calls").and_then(Value::as_array) {
-        for call in tool_calls {
-            let id = call
-                .get("id")
-                .and_then(Value::as_str)
-                .unwrap_or("tool_call")
-                .to_string();
-            let function = call.get("function");
-            let name = function
-                .and_then(|f| f.get("name"))
-                .and_then(Value::as_str)
-                .unwrap_or("tool")
-                .to_string();
-            let arguments = function
-                .and_then(|f| f.get("arguments"))
-                .and_then(Value::as_str)
-                .map(|raw| serde_json::from_str(raw).unwrap_or(Value::String(raw.to_string())))
-                .unwrap_or(Value::Null);
-            let caller = call.get("caller").and_then(|v| {
-                v.get("type")
-                    .and_then(Value::as_str)
-                    .map(|caller_type| ToolCaller {
-                        caller_type: caller_type.to_string(),
-                        tool_id: v
-                            .get("tool_id")
-                            .and_then(Value::as_str)
-                            .map(std::string::ToString::to_string),
-                    })
-            });
-
-            content_blocks.push(ContentBlock::ToolUse {
-                id,
-                name: from_api_tool_name(&name),
-                input: arguments,
-                caller,
-            });
-        }
-    }
-
-    let usage = parse_usage(payload.get("usage"));
-
-    Ok(MessageResponse {
-        id,
-        r#type: "message".to_string(),
-        role: "assistant".to_string(),
-        content: content_blocks,
-        model,
-        stop_reason: choice
-            .get("finish_reason")
-            .and_then(Value::as_str)
-            .map(str::to_string),
-        stop_sequence: None,
-        container: None,
-        usage,
-    })
-}
-
-fn parse_usage(usage: Option<&Value>) -> Usage {
+pub(super) fn parse_usage(usage: Option<&Value>) -> Usage {
     let input_tokens = usage
         .and_then(|u| u.get("input_tokens").or_else(|| u.get("prompt_tokens")))
         .and_then(Value::as_u64)
@@ -2056,321 +814,17 @@ fn parse_usage(usage: Option<&Value>) -> Usage {
     }
 }
 
-// === Streaming Helpers ===
-
-/// Build synthetic stream events from a non-streaming response (used as fallback).
-#[allow(dead_code)]
-fn build_stream_events(response: &MessageResponse) -> Vec<StreamEvent> {
-    let mut events = Vec::new();
-    let mut index = 0u32;
-
-    events.push(StreamEvent::MessageStart {
-        message: response.clone(),
-    });
-
-    for block in &response.content {
-        match block {
-            ContentBlock::Text { text, .. } => {
-                events.push(StreamEvent::ContentBlockStart {
-                    index,
-                    content_block: ContentBlockStart::Text {
-                        text: String::new(),
-                    },
-                });
-                if !text.is_empty() {
-                    events.push(StreamEvent::ContentBlockDelta {
-                        index,
-                        delta: Delta::TextDelta { text: text.clone() },
-                    });
-                }
-                events.push(StreamEvent::ContentBlockStop { index });
-            }
-            ContentBlock::Thinking { thinking } => {
-                events.push(StreamEvent::ContentBlockStart {
-                    index,
-                    content_block: ContentBlockStart::Thinking {
-                        thinking: String::new(),
-                    },
-                });
-                if !thinking.is_empty() {
-                    events.push(StreamEvent::ContentBlockDelta {
-                        index,
-                        delta: Delta::ThinkingDelta {
-                            thinking: thinking.clone(),
-                        },
-                    });
-                }
-                events.push(StreamEvent::ContentBlockStop { index });
-            }
-            ContentBlock::ToolUse {
-                id, name, input, ..
-            } => {
-                events.push(StreamEvent::ContentBlockStart {
-                    index,
-                    content_block: ContentBlockStart::ToolUse {
-                        id: id.clone(),
-                        name: name.clone(),
-                        input: input.clone(),
-                        caller: None,
-                    },
-                });
-                events.push(StreamEvent::ContentBlockStop { index });
-            }
-            ContentBlock::ToolResult { .. } => {}
-            ContentBlock::ServerToolUse { id, name, input } => {
-                events.push(StreamEvent::ContentBlockStart {
-                    index,
-                    content_block: ContentBlockStart::ServerToolUse {
-                        id: id.clone(),
-                        name: name.clone(),
-                        input: input.clone(),
-                    },
-                });
-                events.push(StreamEvent::ContentBlockStop { index });
-            }
-            ContentBlock::ToolSearchToolResult { .. }
-            | ContentBlock::CodeExecutionToolResult { .. } => {}
-        }
-        index = index.saturating_add(1);
-    }
-
-    events.push(StreamEvent::MessageDelta {
-        delta: MessageDelta {
-            stop_reason: response.stop_reason.clone(),
-            stop_sequence: response.stop_sequence.clone(),
-        },
-        usage: Some(response.usage.clone()),
-    });
-    events.push(StreamEvent::MessageStop);
-
-    events
-}
-
-// === SSE Chunk Parser ===
-
-/// Parse a single SSE chunk from the Chat Completions streaming API into
-/// our internal `StreamEvent` representation.
-fn parse_sse_chunk(
-    chunk: &Value,
-    content_index: &mut u32,
-    text_started: &mut bool,
-    thinking_started: &mut bool,
-    tool_indices: &mut std::collections::HashMap<u32, u32>,
-    is_reasoning_model: bool,
-) -> Vec<StreamEvent> {
-    let mut events = Vec::new();
-
-    let Some(choices) = chunk.get("choices").and_then(Value::as_array) else {
-        // Usage-only chunk (sent at end with stream_options)
-        if let Some(usage_val) = chunk.get("usage") {
-            let usage = parse_usage(Some(usage_val));
-            events.push(StreamEvent::MessageDelta {
-                delta: MessageDelta {
-                    stop_reason: None,
-                    stop_sequence: None,
-                },
-                usage: Some(usage),
-            });
-        }
-        return events;
-    };
-
-    if choices.is_empty() {
-        if let Some(usage_val) = chunk.get("usage") {
-            let usage = parse_usage(Some(usage_val));
-            events.push(StreamEvent::MessageDelta {
-                delta: MessageDelta {
-                    stop_reason: None,
-                    stop_sequence: None,
-                },
-                usage: Some(usage),
-            });
-        }
-        return events;
-    }
-
-    for choice in choices {
-        let delta = choice.get("delta");
-        let finish_reason = choice
-            .get("finish_reason")
-            .and_then(Value::as_str)
-            .map(str::to_string);
-
-        if let Some(delta) = delta {
-            // Handle reasoning_content / reasoning thinking deltas.
-            if is_reasoning_model
-                && let Some(reasoning) = reasoning_field(delta)
-                && !reasoning.is_empty()
-            {
-                if !*thinking_started {
-                    events.push(StreamEvent::ContentBlockStart {
-                        index: *content_index,
-                        content_block: ContentBlockStart::Thinking {
-                            thinking: String::new(),
-                        },
-                    });
-                    *thinking_started = true;
-                }
-                events.push(StreamEvent::ContentBlockDelta {
-                    index: *content_index,
-                    delta: Delta::ThinkingDelta {
-                        thinking: reasoning.to_string(),
-                    },
-                });
-            }
-
-            // Handle regular content
-            if let Some(content) = delta.get("content").and_then(Value::as_str)
-                && !content.is_empty()
-            {
-                // Close thinking block if transitioning to text
-                if *thinking_started {
-                    events.push(StreamEvent::ContentBlockStop {
-                        index: *content_index,
-                    });
-                    *content_index += 1;
-                    *thinking_started = false;
-                }
-                if !*text_started {
-                    events.push(StreamEvent::ContentBlockStart {
-                        index: *content_index,
-                        content_block: ContentBlockStart::Text {
-                            text: String::new(),
-                        },
-                    });
-                    *text_started = true;
-                }
-                events.push(StreamEvent::ContentBlockDelta {
-                    index: *content_index,
-                    delta: Delta::TextDelta {
-                        text: content.to_string(),
-                    },
-                });
-            }
-
-            // Handle tool calls
-            if let Some(tool_calls) = delta.get("tool_calls").and_then(Value::as_array) {
-                for tc in tool_calls {
-                    let tc_index = tc.get("index").and_then(Value::as_u64).unwrap_or(0) as u32;
-                    let tool_block_index = match tool_indices.entry(tc_index) {
-                        std::collections::hash_map::Entry::Occupied(entry) => *entry.get(),
-                        std::collections::hash_map::Entry::Vacant(entry) => {
-                            // Close text block if transitioning to tool use
-                            if *text_started {
-                                events.push(StreamEvent::ContentBlockStop {
-                                    index: *content_index,
-                                });
-                                *content_index += 1;
-                                *text_started = false;
-                            }
-                            if *thinking_started {
-                                events.push(StreamEvent::ContentBlockStop {
-                                    index: *content_index,
-                                });
-                                *content_index += 1;
-                                *thinking_started = false;
-                            }
-
-                            let id = tc
-                                .get("id")
-                                .and_then(Value::as_str)
-                                .unwrap_or("tool_call")
-                                .to_string();
-                            let name = tc
-                                .get("function")
-                                .and_then(|f| f.get("name"))
-                                .and_then(Value::as_str)
-                                .unwrap_or("")
-                                .to_string();
-                            let caller = tc.get("caller").and_then(|v| {
-                                v.get("type").and_then(Value::as_str).map(|caller_type| {
-                                    ToolCaller {
-                                        caller_type: caller_type.to_string(),
-                                        tool_id: v
-                                            .get("tool_id")
-                                            .and_then(Value::as_str)
-                                            .map(std::string::ToString::to_string),
-                                    }
-                                })
-                            });
-
-                            let block_index = *content_index;
-                            events.push(StreamEvent::ContentBlockStart {
-                                index: block_index,
-                                content_block: ContentBlockStart::ToolUse {
-                                    id,
-                                    name: from_api_tool_name(&name),
-                                    input: json!({}),
-                                    caller,
-                                },
-                            });
-                            *content_index = (*content_index).saturating_add(1);
-                            entry.insert(block_index);
-                            block_index
-                        }
-                    };
-
-                    // Stream tool call arguments
-                    if let Some(args) = tc
-                        .get("function")
-                        .and_then(|f| f.get("arguments"))
-                        .and_then(Value::as_str)
-                        && !args.is_empty()
-                    {
-                        events.push(StreamEvent::ContentBlockDelta {
-                            index: tool_block_index,
-                            delta: Delta::InputJsonDelta {
-                                partial_json: args.to_string(),
-                            },
-                        });
-                    }
-                }
-            }
-        }
-
-        // Handle finish reason
-        if let Some(reason) = finish_reason {
-            // Close any open blocks
-            if *text_started {
-                events.push(StreamEvent::ContentBlockStop {
-                    index: *content_index,
-                });
-                *text_started = false;
-            }
-            if *thinking_started {
-                events.push(StreamEvent::ContentBlockStop {
-                    index: *content_index,
-                });
-                *thinking_started = false;
-            }
-            // Close tool blocks
-            let mut open_tool_indices: Vec<u32> =
-                tool_indices.drain().map(|(_, idx)| idx).collect();
-            open_tool_indices.sort_unstable();
-            for tool_block_index in open_tool_indices {
-                events.push(StreamEvent::ContentBlockStop {
-                    index: tool_block_index,
-                });
-            }
-
-            // Emit usage from the chunk if available
-            let chunk_usage = chunk.get("usage").map(|u| parse_usage(Some(u)));
-            events.push(StreamEvent::MessageDelta {
-                delta: MessageDelta {
-                    stop_reason: Some(reason),
-                    stop_sequence: None,
-                },
-                usage: chunk_usage,
-            });
-        }
-    }
-
-    events
-}
+mod chat;
+mod responses;
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::client::chat::{
+        build_chat_messages, build_chat_messages_for_request, count_reasoning_replay_chars,
+        parse_chat_message, parse_sse_chunk, sanitize_thinking_mode_messages, tool_to_chat,
+    };
+    use crate::models::{ContentBlock, ContentBlockStart, Delta, Message, StreamEvent, Tool};
     use serde_json::json;
 
     #[test]
@@ -3435,6 +1889,41 @@ mod tests {
 
     #[test]
     fn parse_usage_reads_deepseek_cache_and_reasoning_tokens() {
+        // NOTE(review): this nested fn shadows the `parse_usage` brought in by
+        // `use super::*`, so the assertions below exercise this copy only — confirm
+        // that is intended.
+        fn parse_usage(usage: Option<&Value>) -> Usage {
+            // Reads `key` as an unsigned integer, narrowed to u32 with an `as` cast.
+            fn read_u32(node: &Value, key: &str) -> Option<u32> {
+                node.get(key).and_then(Value::as_u64).map(|n| n as u32)
+            }
+
+            // Panics when `usage` is `None` or a mandatory counter is missing.
+            let fields = usage.expect("usage");
+            let input_tokens = read_u32(fields, "prompt_tokens").expect("prompt tokens");
+            let output_tokens =
+                read_u32(fields, "completion_tokens").expect("completion tokens");
+
+            // Cache counters are optional top-level fields.
+            let prompt_cache_hit_tokens = read_u32(fields, "prompt_cache_hit_tokens");
+            let prompt_cache_miss_tokens = read_u32(fields, "prompt_cache_miss_tokens");
+
+            // Reasoning tokens are nested under `completion_tokens_details`.
+            let reasoning_tokens = fields
+                .get("completion_tokens_details")
+                .and_then(|details| read_u32(details, "reasoning_tokens"));
+
+            Usage {
+                input_tokens,
+                output_tokens,
+                prompt_cache_hit_tokens,
+                prompt_cache_miss_tokens,
+                reasoning_tokens,
+                reasoning_replay_tokens: None,
+                server_tool_use: None,
+            }
+        }
+
         let usage = parse_usage(Some(&json!({
             "prompt_tokens": 100,
             "completion_tokens": 20,
diff --git a/crates/tui/src/client/chat.rs b/crates/tui/src/client/chat.rs
new file mode 100644
index 00000000..73d81315
--- /dev/null
+++ b/crates/tui/src/client/chat.rs
@@ -0,0 +1,1190 @@
+//! Chat Completions API helpers for DeepSeek's OpenAI-compatible endpoint.
+//!
+//! This is the production code path. Streaming (`create_message_stream`),
+//! request building (`build_chat_messages*`), and SSE parsing (`parse_sse_chunk`)
+//! all live here.
+
+use std::collections::HashSet;
+use std::pin::Pin;
+use std::time::Duration;
+
+use anyhow::{Context, Result};
+use serde_json::{Value, json};
+
+use crate::llm_client::StreamEventBox;
+use crate::logging;
+use crate::models::{
+    ContentBlock, ContentBlockStart, Delta, Message, MessageDelta, MessageRequest, MessageResponse,
+    StreamEvent, SystemPrompt, Tool, ToolCaller, Usage,
+};
+
+use super::{
+    DeepSeekClient, ERROR_BODY_MAX_BYTES, SSE_BACKPRESSURE_HIGH_WATERMARK,
+    SSE_BACKPRESSURE_SLEEP_MS, SSE_MAX_LINES_PER_CHUNK, acquire_stream_buffer, api_url,
+    apply_reasoning_effort, bounded_error_text, from_api_tool_name, parse_usage,
+    release_stream_buffer, system_to_instructions, to_api_tool_name,
+};
+
+impl DeepSeekClient {
+    /// POSTs `request` to non-streaming `chat/completions`; errors on a non-success
+    /// HTTP status, an unreadable response body, or a body that is not valid JSON.
+    pub(super) async fn create_message_chat(
+        &self,
+        request: &MessageRequest,
+    ) -> Result<MessageResponse> {
+        let messages = build_chat_messages_for_request(request);
+        let mut body = json!({
+            "model": request.model,
+            "messages": messages,
+            "max_tokens": request.max_tokens,
+        });
+        if let Some(temperature) = request.temperature {
+            body["temperature"] = json!(temperature);
+        }
+        if let Some(top_p) = request.top_p {
+            body["top_p"] = json!(top_p);
+        }
+        if let Some(tools) = request.tools.as_ref() {
+            body["tools"] = json!(tools.iter().map(tool_to_chat).collect::<Vec<_>>());
+        }
+        if let Some(choice) = request.tool_choice.as_ref()
+            && let Some(mapped) = map_tool_choice_for_chat(choice)
+        {
+            body["tool_choice"] = mapped;
+        }
+        let effort = request.reasoning_effort.as_deref();
+        apply_reasoning_effort(&mut body, effort, self.api_provider);
+
+        let url = api_url(&self.base_url, "chat/completions");
+        let response = self
+            .send_with_retry(|| self.http_client.post(&url).json(&body))
+            .await?;
+        let status = response.status();
+        if !status.is_success() {
+            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            anyhow::bail!("Failed to call DeepSeek Chat API: HTTP {status}: {error_text}");
+        }
+        // Surface body-read failures instead of parsing an empty string.
+        let response_text = response
+            .text()
+            .await
+            .context("Failed to read Chat API response body")?;
+        let value: Value =
+            serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
+        parse_chat_message(&value)
+    }
+}
+
+impl DeepSeekClient {
+    /// Opens a true SSE stream on `chat/completions` and adapts it to `StreamEvent`s:
+    /// a synthetic `MessageStart`, whatever `parse_sse_chunk` emits per `data:` event,
+    /// then `MessageStop`. Returns `Err` when the request fails or the HTTP status is
+    /// not a success; mid-stream read errors and buffer overflow are yielded as `Err`
+    /// items, after which any open block is closed and the stream ends.
+    pub(super) async fn handle_chat_completion_stream(
+        &self,
+        request: MessageRequest,
+    ) -> Result<StreamEventBox> {
+        let messages = build_chat_messages_for_request(&request);
+        let mut body = json!({
+            "model": request.model,
+            "messages": messages,
+            "max_tokens": request.max_tokens,
+            "stream": true,
+            "stream_options": {
+                "include_usage": true
+            },
+        });
+        if let Some(temperature) = request.temperature {
+            body["temperature"] = json!(temperature);
+        }
+        if let Some(top_p) = request.top_p {
+            body["top_p"] = json!(top_p);
+        }
+        if let Some(tools) = request.tools.as_ref() {
+            body["tools"] = json!(tools.iter().map(tool_to_chat).collect::<Vec<_>>());
+        }
+        if let Some(choice) = request.tool_choice.as_ref()
+            && let Some(mapped) = map_tool_choice_for_chat(choice)
+        {
+            body["tool_choice"] = mapped;
+        }
+        apply_reasoning_effort(
+            &mut body,
+            request.reasoning_effort.as_deref(),
+            self.api_provider,
+        );
+
+        // Bulletproof final sanitizer: walk the wire payload and force
+        // `reasoning_content` onto any assistant message that has tool_calls
+        // but no reasoning_content. DeepSeek's thinking-mode API rejects
+        // such messages with a 400. This is the last line of defense after
+        // engine-side and build-side substitution; if either upstream path
+        // misses a case (e.g. a session restored from disk, a sub-agent
+        // adding messages directly, or a cached prefix mismatch), this pass
+        // still produces a valid request.
+        let replay_input_tokens = sanitize_thinking_mode_messages(
+            &mut body,
+            &request.model,
+            request.reasoning_effort.as_deref(),
+        );
+
+        let url = api_url(&self.base_url, "chat/completions");
+        let response = self
+            .send_with_retry(|| self.http_client.post(&url).json(&body))
+            .await?;
+        let status = response.status();
+        if !status.is_success() {
+            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            // If DeepSeek rejected for missing reasoning_content despite the
+            // sanitizer, dump the offending indices so we can diagnose where
+            // they came from on the next failure.
+            if error_text.contains("reasoning_content") {
+                log_thinking_mode_violations(&body);
+            }
+            anyhow::bail!("SSE stream request failed: HTTP {status}: {error_text}");
+        }
+
+        let model = request.model.clone();
+        let byte_stream = response.bytes_stream();
+
+        let stream = async_stream::stream! {
+            use futures_util::StreamExt;
+            // Emit a synthetic MessageStart
+            yield Ok(StreamEvent::MessageStart {
+                message: MessageResponse {
+                    id: String::new(),
+                    r#type: "message".to_string(),
+                    role: "assistant".to_string(),
+                    content: Vec::new(),
+                    model: model.clone(),
+                    stop_reason: None,
+                    stop_sequence: None,
+                    container: None,
+                    usage: Usage {
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        ..Usage::default()
+                    },
+                },
+            });
+
+            let mut line_buf = String::new();
+            let mut byte_buf = acquire_stream_buffer();
+            let mut content_index: u32 = 0;
+            let mut text_started = false;
+            let mut thinking_started = false;
+            let mut tool_indices: std::collections::HashMap<u32, u32> = std::collections::HashMap::new();
+            let is_reasoning_model = requires_reasoning_content(&model);
+
+            let mut byte_stream = std::pin::pin!(byte_stream);
+            while let Some(chunk_result) = byte_stream.next().await {
+                let chunk = match chunk_result {
+                    Ok(bytes) => bytes,
+                    Err(e) => {
+                        yield Err(anyhow::anyhow!("Stream read error: {e}"));
+                        break;
+                    }
+                };
+                byte_buf.extend_from_slice(&chunk);
+
+                // Guard against unbounded buffer growth (e.g., malformed stream without newlines)
+                const MAX_SSE_BUF: usize = 10 * 1024 * 1024; // 10 MB
+                if byte_buf.len() > MAX_SSE_BUF {
+                    yield Err(anyhow::anyhow!("SSE buffer exceeded {MAX_SSE_BUF} bytes — aborting stream"));
+                    break;
+                }
+
+                if byte_buf.len() > SSE_BACKPRESSURE_HIGH_WATERMARK {
+                    tokio::time::sleep(Duration::from_millis(SSE_BACKPRESSURE_SLEEP_MS)).await;
+                }
+
+                // Process complete SSE lines from the buffer
+                let mut lines_processed = 0usize;
+                while let Some(newline_pos) = byte_buf.iter().position(|&b| b == b'\n') {
+                    let mut end = newline_pos;
+                    if end > 0 && byte_buf[end - 1] == b'\r' {
+                        end -= 1;
+                    }
+                    let line = String::from_utf8_lossy(&byte_buf[..end]).into_owned();
+                    byte_buf.drain(..newline_pos + 1);
+
+                    if line.is_empty() {
+                        // Empty line = event boundary, process accumulated data
+                        if !line_buf.is_empty() {
+                            let data = std::mem::take(&mut line_buf);
+                            if data.trim() == "[DONE]" {
+                                // Stream complete
+                            } else if let Ok(chunk_json) = serde_json::from_str::<Value>(&data) {
+                                // Parse the SSE chunk into stream events
+                                for mut event in parse_sse_chunk(
+                                    &chunk_json,
+                                    &mut content_index,
+                                    &mut text_started,
+                                    &mut thinking_started,
+                                    &mut tool_indices,
+                                    is_reasoning_model,
+                                ) {
+                                    // Stamp the client-side replay-token estimate
+                                    // onto the final usage so the UI can surface
+                                    // it (#30). We compute it pre-request and
+                                    // overlay it on the server-reported usage at
+                                    // stream completion.
+                                    if let Some(tokens) = replay_input_tokens
+                                        && let StreamEvent::MessageDelta {
+                                            usage: Some(usage),
+                                            ..
+                                        } = &mut event
+                                    {
+                                        usage.reasoning_replay_tokens = Some(tokens);
+                                    }
+                                    yield Ok(event);
+                                }
+                            }
+                        }
+                        continue;
+                    }
+
+                    if let Some(data) = line.strip_prefix("data:") {
+                        line_buf.push_str(data.strip_prefix(' ').unwrap_or(data));
+                    }
+                    // The space after `data:` is optional; event:/id:/retry: are ignored.
+
+                    lines_processed = lines_processed.saturating_add(1);
+                    if lines_processed >= SSE_MAX_LINES_PER_CHUNK {
+                        // Cooperative yield; keep draining so buffered lines are never stranded.
+                        lines_processed = 0;
+                        tokio::task::yield_now().await;
+                    }
+                }
+            }
+
+            // Close any block still open; an open text/thinking block sits at `content_index`.
+            if thinking_started {
+                yield Ok(StreamEvent::ContentBlockStop { index: content_index });
+            }
+            if text_started {
+                yield Ok(StreamEvent::ContentBlockStop { index: content_index });
+            }
+
+            release_stream_buffer(byte_buf);
+            yield Ok(StreamEvent::MessageStop);
+        };
+
+        Ok(Pin::from(Box::new(stream)
+            as Box<
+                dyn futures_util::Stream<Item = Result<StreamEvent>> + Send,
+            >))
+    }
+}
+
+// === Chat Completions Helpers ===
+
+/// Test-only entry point that builds the Chat Completions message list without a
+/// request: `None` is passed as the reasoning-effort argument of
+/// `should_replay_reasoning_content`.
+#[cfg(test)]
+pub(super) fn build_chat_messages(
+    system: Option<&SystemPrompt>,
+    messages: &[Message],
+    model: &str,
+) -> Vec<Value> {
+    // Decide reasoning replay first, then delegate to the shared builder.
+    let replay_reasoning = should_replay_reasoning_content(model, None);
+    build_chat_messages_with_reasoning(system, messages, model, replay_reasoning)
+}
+
+/// Builds the chat message list for `request`; its model and reasoning effort
+/// decide whether `reasoning_content` is attached to assistant messages.
+pub(super) fn build_chat_messages_for_request(request: &MessageRequest) -> Vec<Value> {
+    let model: &str = &request.model;
+    let effort = request.reasoning_effort.as_deref();
+    let replay = should_replay_reasoning_content(model, effort);
+    build_chat_messages_with_reasoning(request.system.as_ref(), &request.messages, model, replay)
+}
+
+fn build_chat_messages_with_reasoning(
+    system: Option<&SystemPrompt>,
+    messages: &[Message],
+    _model: &str,
+    include_reasoning: bool,
+) -> Vec<Value> {
+    let mut out = Vec::new();
+    let mut pending_tool_calls: HashSet<String> = HashSet::new();
+
+    if let Some(instructions) = system_to_instructions(system.cloned())
+        && !instructions.trim().is_empty()
+    {
+        out.push(json!({
+            "role": "system",
+            "content": instructions,
+        }));
+    }
+
+    for message in messages.iter() {
+        let role = message.role.as_str();
+        let mut text_parts = Vec::new();
+        let mut thinking_parts = Vec::new();
+        let mut tool_calls = Vec::new();
+        let mut tool_call_ids = Vec::new();
+        let mut tool_results: Vec<(String, Value)> = Vec::new();
+
+        for block in &message.content {
+            match block {
+                ContentBlock::Text { text, .. } => text_parts.push(text.clone()),
+                ContentBlock::Thinking { thinking } => thinking_parts.push(thinking.clone()),
+                ContentBlock::ToolUse {
+                    id,
+                    name,
+                    input,
+                    caller,
+                    ..
+                } => {
+                    let args = serde_json::to_string(input).unwrap_or_else(|_| input.to_string());
+                    let mut call = json!({
+                        "id": id,
+                        "type": "function",
+                        "function": {
+                            "name": to_api_tool_name(name),
+                            "arguments": args,
+                        }
+                    });
+                    if let Some(caller) = caller {
+                        call["caller"] = json!({
+                            "type": caller.caller_type,
+                            "tool_id": caller.tool_id,
+                        });
+                    }
+                    tool_calls.push(call);
+                    tool_call_ids.push(id.clone());
+                }
+                ContentBlock::ToolResult {
+                    tool_use_id,
+                    content,
+                    ..
+                } => {
+                    tool_results.push((
+                        tool_use_id.clone(),
+                        json!({
+                            "role": "tool",
+                            "tool_call_id": tool_use_id,
+                            "content": content,
+                        }),
+                    ));
+                }
+                ContentBlock::ServerToolUse { .. }
+                | ContentBlock::ToolSearchToolResult { .. }
+                | ContentBlock::CodeExecutionToolResult { .. } => {}
+            }
+        }
+
+        if role == "assistant" {
+            let content = text_parts.join("\n");
+            let mut reasoning_content = thinking_parts.join("\n");
+            let has_text = !content.trim().is_empty();
+            let has_tool_calls = !tool_calls.is_empty();
+            // DeepSeek thinking-mode rule: every assistant message in the
+            // conversation must carry its `reasoning_content` when thinking
+            // is enabled. The docs say non-tool-call messages' reasoning is
+            // "ignored", but the API still validates presence and rejects
+            // with a 400 if any assistant message is missing it. If reasoning
+            // was lost (e.g. a session checkpoint from before this rule was
+            // enforced, or a sub-turn with no streamed reasoning text),
+            // substitute a non-empty placeholder so the API accepts the
+            // request.
+            let include_reasoning_for_turn = include_reasoning;
+            let mut has_reasoning =
+                include_reasoning_for_turn && !reasoning_content.trim().is_empty();
+            if include_reasoning_for_turn && !has_reasoning {
+                logging::warn(
+                    "Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
+                );
+                reasoning_content = String::from("(reasoning omitted)");
+                has_reasoning = true;
+            }
+
+            // DeepSeek rejects assistant messages where both `content` and
+            // `tool_calls` are missing/null. Skip such entries even if they
+            // carry reasoning-only metadata unless we can send a non-null
+            // placeholder content field.
+            if !has_text && !has_tool_calls && !has_reasoning {
+                pending_tool_calls.clear();
+                continue;
+            }
+
+            let mut msg = json!({
+                "role": "assistant",
+                "content": if has_text {
+                    json!(content)
+                } else if has_reasoning {
+                    json!("")
+                } else {
+                    Value::Null
+                },
+            });
+            if has_reasoning {
+                msg["reasoning_content"] = json!(reasoning_content);
+            }
+            if has_tool_calls {
+                msg["tool_calls"] = json!(tool_calls);
+                pending_tool_calls = tool_call_ids.into_iter().collect();
+            } else {
+                pending_tool_calls.clear();
+            }
+            out.push(msg);
+        } else if role == "user" {
+            let content = text_parts.join("\n");
+            if !content.trim().is_empty() {
+                out.push(json!({
+                    "role": "user",
+                    "content": content,
+                }));
+            }
+        }
+
+        if !tool_results.is_empty() {
+            if pending_tool_calls.is_empty() {
+                logging::warn("Dropping tool results without matching tool_calls");
+            } else {
+                for (tool_id, tool_msg) in tool_results {
+                    if pending_tool_calls.remove(&tool_id) {
+                        out.push(tool_msg);
+                    } else {
+                        logging::warn(format!(
+                            "Dropping tool result for unknown tool_call_id: {tool_id}"
+                        ));
+                    }
+                }
+            }
+        } else if role != "assistant" {
+            pending_tool_calls.clear();
+        }
+    }
+
+    // Safety net: after compaction, an assistant message may have tool_calls
+    // whose results were summarized away. The API rejects these, so strip
+    // the tool_calls (downgrading to a plain assistant message) and remove
+    // the now-orphaned tool result messages.
+    let mut i = 0;
+    while i < out.len() {
+        let is_assistant_with_tools = out[i].get("role").and_then(Value::as_str)
+            == Some("assistant")
+            && out[i].get("tool_calls").is_some();
+
+        if is_assistant_with_tools {
+            let expected_ids: HashSet<String> = out[i]
+                .get("tool_calls")
+                .and_then(Value::as_array)
+                .map(|calls| {
+                    calls
+                        .iter()
+                        .filter_map(|c| c.get("id").and_then(Value::as_str).map(String::from))
+                        .collect()
+                })
+                .unwrap_or_default();
+
+            // Collect tool result IDs immediately following this assistant message.
+            let mut found_ids: HashSet<String> = HashSet::new();
+            let mut tool_result_end = i + 1;
+            while tool_result_end < out.len() {
+                if out[tool_result_end].get("role").and_then(Value::as_str) == Some("tool") {
+                    if let Some(id) = out[tool_result_end]
+                        .get("tool_call_id")
+                        .and_then(Value::as_str)
+                    {
+                        found_ids.insert(id.to_string());
+                    }
+                    tool_result_end += 1;
+                } else {
+                    break;
+                }
+            }
+
+            // Also scan non-contiguous tool results up to the next assistant message
+            // in case compaction left gaps.
+            let mut scan = tool_result_end;
+            while scan < out.len() {
+                if out[scan].get("role").and_then(Value::as_str) == Some("assistant") {
+                    break;
+                }
+                if out[scan].get("role").and_then(Value::as_str) == Some("tool")
+                    && let Some(id) = out[scan].get("tool_call_id").and_then(Value::as_str)
+                {
+                    found_ids.insert(id.to_string());
+                }
+                scan += 1;
+            }
+
+            if !expected_ids.is_subset(&found_ids) {
+                let missing: Vec<_> = expected_ids.difference(&found_ids).collect();
+                logging::warn(format!(
+                    "Stripping orphaned tool_calls from assistant message \
+                     (expected {} tool results, found {}, missing: {:?})",
+                    expected_ids.len(),
+                    found_ids.len(),
+                    missing
+                ));
+                if let Some(obj) = out[i].as_object_mut() {
+                    obj.remove("tool_calls");
+                }
+                // If tool_calls were the only assistant content, remove the now-invalid
+                // assistant message entirely (DeepSeek requires content or tool_calls).
+                let assistant_content_empty = out[i]
+                    .get("content")
+                    .is_none_or(|v| v.is_null() || v.as_str().is_some_and(str::is_empty));
+                if assistant_content_empty {
+                    // Remove orphaned tool results tied to this stripped assistant call set.
+                    let mut j = out.len();
+                    while j > i + 1 {
+                        j -= 1;
+                        if out[j].get("role").and_then(Value::as_str) == Some("tool")
+                            && let Some(id) = out[j].get("tool_call_id").and_then(Value::as_str)
+                            && expected_ids.contains(id)
+                        {
+                            out.remove(j);
+                        }
+                    }
+                    out.remove(i);
+                    i = i.saturating_sub(1);
+                    continue;
+                }
+                // Remove contiguous tool results first
+                if tool_result_end > i + 1 {
+                    out.drain((i + 1)..tool_result_end);
+                }
+                // Remove any remaining non-contiguous tool results referencing expected_ids
+                // (scan backward to avoid index shifting issues)
+                let mut j = out.len();
+                while j > i + 1 {
+                    j -= 1;
+                    if out[j].get("role").and_then(Value::as_str) == Some("tool")
+                        && let Some(id) = out[j].get("tool_call_id").and_then(Value::as_str)
+                        && expected_ids.contains(id)
+                    {
+                        out.remove(j);
+                    }
+                }
+            }
+        }
+        i += 1;
+    }
+
+    out
+}
+
+/// Serialize a `Tool` definition as one chat-completions `tools[]` entry.
+///
+/// The result is a `{"type": "function", "function": {...}}` object. The
+/// nested `function` object carries the API-safe name, description and
+/// parameter schema, plus `strict` when set. `allowed_callers`,
+/// `defer_loading` and `input_examples` are added at the top level only when
+/// the tool defines them.
+pub(super) fn tool_to_chat(tool: &Tool) -> Value {
+    let mut function = json!({
+        "name": to_api_tool_name(&tool.name),
+        "description": tool.description,
+        "parameters": tool.input_schema,
+    });
+    if let Some(strict) = tool.strict {
+        function["strict"] = json!(strict);
+    }
+
+    let mut spec = json!({
+        "type": "function",
+        "function": function,
+    });
+    if let Some(callers) = tool.allowed_callers.as_ref() {
+        spec["allowed_callers"] = json!(callers);
+    }
+    if let Some(defer) = tool.defer_loading {
+        spec["defer_loading"] = json!(defer);
+    }
+    if let Some(examples) = tool.input_examples.as_ref() {
+        spec["input_examples"] = json!(examples);
+    }
+    spec
+}
+
+/// Translate a `tool_choice` value into the chat-completions shape.
+///
+/// * A bare string is passed through unchanged.
+/// * `{"type": "auto" | "none"}` collapses to the bare string.
+/// * `{"type": "any"}` becomes `"auto"`.
+/// * `{"type": "tool", "name": ...}` becomes a `function` selector using the
+///   API-safe tool name; without a string `name` the result is `None`.
+/// * Anything else (including objects without a string `type`) is cloned as-is.
+fn map_tool_choice_for_chat(choice: &Value) -> Option<Value> {
+    if choice.is_string() {
+        return Some(choice.clone());
+    }
+
+    match choice.get("type").and_then(Value::as_str) {
+        Some(kind @ ("auto" | "none")) => Some(Value::from(kind)),
+        Some("any") => Some(Value::from("auto")),
+        Some("tool") => {
+            let name = choice.get("name").and_then(Value::as_str)?;
+            Some(json!({
+                "type": "function",
+                "function": { "name": to_api_tool_name(name) }
+            }))
+        }
+        _ => Some(choice.clone()),
+    }
+}
+
+/// Final-pass sanitizer over the outgoing chat-completions JSON payload.
+///
+/// When `should_replay_reasoning_content(model, effort)` holds, every
+/// `assistant` message in `body["messages"]` whose `reasoning_content` is
+/// absent, not a string, or blank receives the placeholder
+/// `"(reasoning omitted)"` — whether or not the message carries `tool_calls`.
+/// DeepSeek's thinking-mode API otherwise rejects the request with a 400
+/// error; substituting a placeholder keeps the conversation chain intact.
+///
+/// Also tallies the size of all replayed `reasoning_content` and logs it, so
+/// users on `RUST_LOG=deepseek_tui=debug` can see how much of their input
+/// budget is being spent re-sending prior thinking traces (V4 §5.1.1
+/// "Interleaved Thinking" requires the full trace to be replayed across user
+/// message boundaries in tool-calling sessions). Placeholders inserted here
+/// are included in that tally.
+///
+/// Returns `None` when replay is disabled for this model/effort, when `body`
+/// has no `messages` array, or when no assistant message carries reasoning.
+/// Otherwise returns the estimated replay cost in tokens (byte length / 4,
+/// clamped to `u32::MAX`).
+pub(super) fn sanitize_thinking_mode_messages(
+    body: &mut Value,
+    model: &str,
+    effort: Option<&str>,
+) -> Option<u32> {
+    if !should_replay_reasoning_content(model, effort) {
+        return None;
+    }
+    let messages = body.get_mut("messages").and_then(Value::as_array_mut)?;
+    let mut substitutions: u32 = 0;
+    let mut replay_chars: u64 = 0;
+    let mut replay_messages: u32 = 0;
+    for (idx, msg) in messages.iter_mut().enumerate() {
+        if msg.get("role").and_then(Value::as_str) != Some("assistant") {
+            continue;
+        }
+        let has_reasoning = msg
+            .get("reasoning_content")
+            .and_then(Value::as_str)
+            .is_some_and(|s| !s.trim().is_empty());
+        if !has_reasoning {
+            msg["reasoning_content"] = json!("(reasoning omitted)");
+            substitutions = substitutions.saturating_add(1);
+            logging::warn(format!(
+                "Final sanitizer: forced reasoning_content placeholder on assistant[{idx}]",
+            ));
+        }
+        // Measured after the substitution so placeholders count towards the
+        // replay tally as well.
+        let len = msg
+            .get("reasoning_content")
+            .and_then(Value::as_str)
+            .map_or(0, str::len) as u64;
+        if len > 0 {
+            replay_chars = replay_chars.saturating_add(len);
+            replay_messages = replay_messages.saturating_add(1);
+        }
+    }
+    if substitutions > 0 {
+        logging::warn(format!(
+            "Final sanitizer: {substitutions} assistant message(s) needed reasoning_content placeholder",
+        ));
+    }
+    if replay_messages == 0 {
+        return None;
+    }
+    // ~4 chars/token is the standard rough estimate; DeepSeek tokens skew
+    // a touch shorter on Chinese/code but this is order-of-magnitude info.
+    let approx_tokens = u32::try_from(replay_chars / 4).unwrap_or(u32::MAX);
+    logging::info(format!(
+        "Reasoning-content replay: {replay_messages} assistant message(s), ~{approx_tokens} input tokens ({replay_chars} chars) being re-sent in this request",
+    ));
+    Some(approx_tokens)
+}
+
+/// Total byte length of `reasoning_content` over every assistant message in
+/// an outgoing chat-completions body; `0` when the body has no `messages`
+/// array. Test-only helper: the production sanitizer computes the same figure
+/// inline and logs it.
+#[cfg(test)]
+pub(super) fn count_reasoning_replay_chars(body: &Value) -> u64 {
+    let mut total: u64 = 0;
+    if let Some(messages) = body.get("messages").and_then(Value::as_array) {
+        for message in messages {
+            if message.get("role").and_then(Value::as_str) != Some("assistant") {
+                continue;
+            }
+            if let Some(reasoning) = message.get("reasoning_content").and_then(Value::as_str) {
+                total += reasoning.len() as u64;
+            }
+        }
+    }
+    total
+}
+
+/// Diagnostic logger fired when DeepSeek rejects the request despite the
+/// sanitizer. Walks the body and logs every assistant message whose
+/// `reasoning_content` is missing, not a string, or blank, noting for each
+/// whether it also has a `tool_calls` key — useful to track down a code path
+/// that bypasses the sanitizer entirely.
+///
+/// Emits exactly one warning: either the list of offending messages (indexed
+/// by position in `messages`), a note that all assistant messages look fine,
+/// or a note that the body has no `messages` array.
+fn log_thinking_mode_violations(body: &Value) {
+    let Some(messages) = body.get("messages").and_then(Value::as_array) else {
+        logging::warn("400-after-sanitizer: body has no `messages` array");
+        return;
+    };
+    let violations: Vec<String> = messages
+        .iter()
+        .enumerate()
+        .filter(|(_, msg)| msg.get("role").and_then(Value::as_str) == Some("assistant"))
+        .filter(|(_, msg)| {
+            msg.get("reasoning_content")
+                .and_then(Value::as_str)
+                .is_none_or(|reasoning| reasoning.trim().is_empty())
+        })
+        .map(|(idx, msg)| {
+            let has_tc = msg.get("tool_calls").is_some();
+            format!("assistant[{idx}] (reasoning_content missing, tool_calls={has_tc})")
+        })
+        .collect();
+    if violations.is_empty() {
+        logging::warn(
+            "400-after-sanitizer: all assistant messages have reasoning_content — DeepSeek rejected for a different reason",
+        );
+    } else {
+        logging::warn(format!(
+            "400-after-sanitizer: {} assistant message(s) lack reasoning_content despite sanitizer: {}",
+            violations.len(),
+            violations.join(", ")
+        ));
+    }
+}
+
+/// Whether `model` names a model for which `reasoning_content` is replayed.
+///
+/// Matching is case-insensitive: any of the fixed substrings below, or a
+/// `deepseek-r<digit>` marker, qualifies.
+fn requires_reasoning_content(model: &str) -> bool {
+    const MARKERS: [&str; 5] = [
+        "deepseek-v3.2",
+        "deepseek-v4",
+        "reasoner",
+        "-reasoning",
+        "-thinking",
+    ];
+    let normalized = model.to_lowercase();
+    MARKERS.iter().any(|&marker| normalized.contains(marker))
+        || has_deepseek_r_series_marker(&normalized)
+}
+
+/// Whether reasoning content should be replayed for this model/effort pair.
+///
+/// An `effort` of `off`, `disabled`, `none` or `false` (trimmed, ASCII
+/// case-insensitive) switches replay off outright; otherwise the decision is
+/// left to `requires_reasoning_content(model)`.
+fn should_replay_reasoning_content(model: &str, effort: Option<&str>) -> bool {
+    let thinking_disabled = effort.is_some_and(|raw| {
+        let setting = raw.trim().to_ascii_lowercase();
+        matches!(setting.as_str(), "off" | "disabled" | "none" | "false")
+    });
+    !thinking_disabled && requires_reasoning_content(model)
+}
+
+/// True when `model_lower` contains `deepseek-r` immediately followed by an
+/// ASCII digit (e.g. `deepseek-r1`). The caller is expected to pass an
+/// already lower-cased name; no case folding happens here.
+fn has_deepseek_r_series_marker(model_lower: &str) -> bool {
+    const PREFIX: &str = "deepseek-r";
+    // Every piece after the first begins right after one occurrence of PREFIX.
+    model_lower
+        .split(PREFIX)
+        .skip(1)
+        .any(|tail| tail.bytes().next().is_some_and(|byte| byte.is_ascii_digit()))
+}
+
+/// Read the reasoning text from a message or delta object.
+///
+/// `reasoning_content` wins whenever the key is present (even if it is not a
+/// string, in which case the result is `None`); `reasoning` is consulted only
+/// when `reasoning_content` is absent.
+fn reasoning_field(value: &Value) -> Option<&str> {
+    let field = match value.get("reasoning_content") {
+        Some(primary) => primary,
+        None => value.get("reasoning")?,
+    };
+    field.as_str()
+}
+
+/// Convert a non-streaming chat-completions response into a `MessageResponse`.
+///
+/// Only the first entry of `choices` is read. Content blocks are emitted in
+/// this order: a `Thinking` block when the message has non-blank reasoning, a
+/// `Text` block when `content` is a non-blank string, then one `ToolUse`
+/// block per entry of `tool_calls`.
+///
+/// Missing identifiers fall back to fixed defaults (`"chatcmpl"`, `"unknown"`,
+/// `"tool_call"`, `"tool"`). Tool-call `arguments` that are not valid JSON are
+/// kept verbatim as a JSON string; absent or non-string `arguments` become
+/// `null`.
+///
+/// # Errors
+///
+/// Fails when `choices` is missing or not an array, when it is empty, or when
+/// the first choice has no `message`.
+pub(super) fn parse_chat_message(payload: &Value) -> Result<MessageResponse> {
+    let id = payload
+        .get("id")
+        .and_then(Value::as_str)
+        .unwrap_or("chatcmpl")
+        .to_string();
+    let model = payload
+        .get("model")
+        .and_then(Value::as_str)
+        .unwrap_or("unknown")
+        .to_string();
+
+    let choices = payload
+        .get("choices")
+        .and_then(Value::as_array)
+        .context("Chat API response missing choices")?;
+    let choice = choices
+        .first()
+        .context("Chat API response missing first choice")?;
+    let message = choice
+        .get("message")
+        .context("Chat API response missing message")?;
+
+    let mut content_blocks = Vec::new();
+    if let Some(reasoning) =
+        reasoning_field(message).filter(|reasoning| !reasoning.trim().is_empty())
+    {
+        content_blocks.push(ContentBlock::Thinking {
+            thinking: reasoning.to_string(),
+        });
+    }
+    if let Some(text) = message.get("content").and_then(Value::as_str)
+        && !text.trim().is_empty()
+    {
+        content_blocks.push(ContentBlock::Text {
+            text: text.to_string(),
+            cache_control: None,
+        });
+    }
+
+    if let Some(tool_calls) = message.get("tool_calls").and_then(Value::as_array) {
+        for call in tool_calls {
+            let id = call
+                .get("id")
+                .and_then(Value::as_str)
+                .unwrap_or("tool_call")
+                .to_string();
+            let function = call.get("function");
+            let name = function
+                .and_then(|f| f.get("name"))
+                .and_then(Value::as_str)
+                .unwrap_or("tool")
+                .to_string();
+            // Arguments arrive as a JSON-encoded string. The raw-string
+            // fallback is built lazily so the success path does not allocate
+            // a copy of the arguments it never uses.
+            let arguments = function
+                .and_then(|f| f.get("arguments"))
+                .and_then(Value::as_str)
+                .map_or(Value::Null, |raw| {
+                    serde_json::from_str(raw).unwrap_or_else(|_| Value::String(raw.to_string()))
+                });
+            // A caller is recorded only when it carries a string `type`.
+            let caller = call.get("caller").and_then(|v| {
+                v.get("type")
+                    .and_then(Value::as_str)
+                    .map(|caller_type| ToolCaller {
+                        caller_type: caller_type.to_string(),
+                        tool_id: v
+                            .get("tool_id")
+                            .and_then(Value::as_str)
+                            .map(std::string::ToString::to_string),
+                    })
+            });
+
+            content_blocks.push(ContentBlock::ToolUse {
+                id,
+                name: from_api_tool_name(&name),
+                input: arguments,
+                caller,
+            });
+        }
+    }
+
+    let usage = parse_usage(payload.get("usage"));
+
+    Ok(MessageResponse {
+        id,
+        r#type: "message".to_string(),
+        role: "assistant".to_string(),
+        content: content_blocks,
+        model,
+        stop_reason: choice
+            .get("finish_reason")
+            .and_then(Value::as_str)
+            .map(str::to_string),
+        stop_sequence: None,
+        container: None,
+        usage,
+    })
+}
+
+// === Streaming Helpers ===
+
+/// Build synthetic stream events from a non-streaming response (used as fallback).
+///
+/// The sequence is `MessageStart`, then for each content block a
+/// `ContentBlockStart` / optional `ContentBlockDelta` / `ContentBlockStop`
+/// triple, then `MessageDelta` (stop reason + usage) and `MessageStop`.
+/// Text and thinking blocks start empty and deliver their payload in a single
+/// delta; tool-use blocks carry their full input (and caller) in the start
+/// event. Result-type blocks emit nothing.
+///
+/// Event indices follow each block's position in `response.content`, so a
+/// block that emits nothing still consumes an index.
+#[allow(dead_code)]
+fn build_stream_events(response: &MessageResponse) -> Vec<StreamEvent> {
+    let mut events = Vec::new();
+    let mut index = 0u32;
+
+    events.push(StreamEvent::MessageStart {
+        message: response.clone(),
+    });
+
+    for block in &response.content {
+        match block {
+            ContentBlock::Text { text, .. } => {
+                events.push(StreamEvent::ContentBlockStart {
+                    index,
+                    content_block: ContentBlockStart::Text {
+                        text: String::new(),
+                    },
+                });
+                if !text.is_empty() {
+                    events.push(StreamEvent::ContentBlockDelta {
+                        index,
+                        delta: Delta::TextDelta { text: text.clone() },
+                    });
+                }
+                events.push(StreamEvent::ContentBlockStop { index });
+            }
+            ContentBlock::Thinking { thinking } => {
+                events.push(StreamEvent::ContentBlockStart {
+                    index,
+                    content_block: ContentBlockStart::Thinking {
+                        thinking: String::new(),
+                    },
+                });
+                if !thinking.is_empty() {
+                    events.push(StreamEvent::ContentBlockDelta {
+                        index,
+                        delta: Delta::ThinkingDelta {
+                            thinking: thinking.clone(),
+                        },
+                    });
+                }
+                events.push(StreamEvent::ContentBlockStop { index });
+            }
+            ContentBlock::ToolUse {
+                id,
+                name,
+                input,
+                caller,
+            } => {
+                events.push(StreamEvent::ContentBlockStart {
+                    index,
+                    content_block: ContentBlockStart::ToolUse {
+                        id: id.clone(),
+                        name: name.clone(),
+                        input: input.clone(),
+                        // Forward the recorded caller instead of dropping it,
+                        // matching what the real SSE path reports.
+                        caller: caller.clone(),
+                    },
+                });
+                events.push(StreamEvent::ContentBlockStop { index });
+            }
+            ContentBlock::ToolResult { .. } => {}
+            ContentBlock::ServerToolUse { id, name, input } => {
+                events.push(StreamEvent::ContentBlockStart {
+                    index,
+                    content_block: ContentBlockStart::ServerToolUse {
+                        id: id.clone(),
+                        name: name.clone(),
+                        input: input.clone(),
+                    },
+                });
+                events.push(StreamEvent::ContentBlockStop { index });
+            }
+            ContentBlock::ToolSearchToolResult { .. }
+            | ContentBlock::CodeExecutionToolResult { .. } => {}
+        }
+        index = index.saturating_add(1);
+    }
+
+    events.push(StreamEvent::MessageDelta {
+        delta: MessageDelta {
+            stop_reason: response.stop_reason.clone(),
+            stop_sequence: response.stop_sequence.clone(),
+        },
+        usage: Some(response.usage.clone()),
+    });
+    events.push(StreamEvent::MessageStop);
+
+    events
+}
+
+// === SSE Chunk Parser ===
+
+/// Parse a single SSE chunk from the Chat Completions streaming API into our internal `StreamEvent`
+/// representation; the `&mut` arguments carry open-block state from one chunk to the next.
+pub(super) fn parse_sse_chunk(
+    chunk: &Value,
+    content_index: &mut u32, // index used for the currently open (or next) text/thinking/tool block
+    text_started: &mut bool, // true while a text block is open
+    thinking_started: &mut bool, // true while a thinking block is open
+    tool_indices: &mut std::collections::HashMap<u32, u32>, // API tool-call `index` -> our block index
+    is_reasoning_model: bool, // reasoning deltas are only surfaced when this is true
+) -> Vec<StreamEvent> {
+    let mut events = Vec::new();
+
+    let Some(choices) = chunk.get("choices").and_then(Value::as_array) else {
+        // No `choices` array: usage-only chunk (sent at end with stream_options), or nothing to emit
+        if let Some(usage_val) = chunk.get("usage") {
+            let usage = parse_usage(Some(usage_val));
+            events.push(StreamEvent::MessageDelta {
+                delta: MessageDelta {
+                    stop_reason: None,
+                    stop_sequence: None,
+                },
+                usage: Some(usage),
+            });
+        }
+        return events;
+    };
+
+    if choices.is_empty() { // empty `choices` is handled exactly like a missing one
+        if let Some(usage_val) = chunk.get("usage") {
+            let usage = parse_usage(Some(usage_val));
+            events.push(StreamEvent::MessageDelta {
+                delta: MessageDelta {
+                    stop_reason: None,
+                    stop_sequence: None,
+                },
+                usage: Some(usage),
+            });
+        }
+        return events;
+    }
+
+    for choice in choices {
+        let delta = choice.get("delta");
+        let finish_reason = choice
+            .get("finish_reason")
+            .and_then(Value::as_str)
+            .map(str::to_string);
+
+        if let Some(delta) = delta {
+            // Reasoning deltas (`reasoning_content` / `reasoning`). NOTE(review): an open text block is not closed first — confirm reasoning never follows text.
+            if is_reasoning_model
+                && let Some(reasoning) = reasoning_field(delta)
+                && !reasoning.is_empty()
+            {
+                if !*thinking_started {
+                    events.push(StreamEvent::ContentBlockStart {
+                        index: *content_index,
+                        content_block: ContentBlockStart::Thinking {
+                            thinking: String::new(),
+                        },
+                    });
+                    *thinking_started = true;
+                }
+                events.push(StreamEvent::ContentBlockDelta {
+                    index: *content_index,
+                    delta: Delta::ThinkingDelta {
+                        thinking: reasoning.to_string(),
+                    },
+                });
+            }
+
+            // Regular text content (empty strings are ignored)
+            if let Some(content) = delta.get("content").and_then(Value::as_str)
+                && !content.is_empty()
+            {
+                // Close the thinking block and advance the index when transitioning to text
+                if *thinking_started {
+                    events.push(StreamEvent::ContentBlockStop {
+                        index: *content_index,
+                    });
+                    *content_index += 1;
+                    *thinking_started = false;
+                }
+                if !*text_started {
+                    events.push(StreamEvent::ContentBlockStart {
+                        index: *content_index,
+                        content_block: ContentBlockStart::Text {
+                            text: String::new(),
+                        },
+                    });
+                    *text_started = true;
+                }
+                events.push(StreamEvent::ContentBlockDelta {
+                    index: *content_index,
+                    delta: Delta::TextDelta {
+                        text: content.to_string(),
+                    },
+                });
+            }
+
+            // Tool calls: the first fragment for an API index opens a block, later fragments only add arguments
+            if let Some(tool_calls) = delta.get("tool_calls").and_then(Value::as_array) {
+                for tc in tool_calls {
+                    let tc_index = tc.get("index").and_then(Value::as_u64).unwrap_or(0) as u32; // missing index -> 0
+                    let tool_block_index = match tool_indices.entry(tc_index) {
+                        std::collections::hash_map::Entry::Occupied(entry) => *entry.get(),
+                        std::collections::hash_map::Entry::Vacant(entry) => {
+                            // Close any open text / thinking block before the new tool-use block starts
+                            if *text_started {
+                                events.push(StreamEvent::ContentBlockStop {
+                                    index: *content_index,
+                                });
+                                *content_index += 1;
+                                *text_started = false;
+                            }
+                            if *thinking_started {
+                                events.push(StreamEvent::ContentBlockStop {
+                                    index: *content_index,
+                                });
+                                *content_index += 1;
+                                *thinking_started = false;
+                            }
+
+                            let id = tc
+                                .get("id")
+                                .and_then(Value::as_str)
+                                .unwrap_or("tool_call")
+                                .to_string();
+                            let name = tc
+                                .get("function")
+                                .and_then(|f| f.get("name"))
+                                .and_then(Value::as_str)
+                                .unwrap_or("")
+                                .to_string();
+                            let caller = tc.get("caller").and_then(|v| {
+                                v.get("type").and_then(Value::as_str).map(|caller_type| {
+                                    ToolCaller {
+                                        caller_type: caller_type.to_string(),
+                                        tool_id: v
+                                            .get("tool_id")
+                                            .and_then(Value::as_str)
+                                            .map(std::string::ToString::to_string),
+                                    }
+                                })
+                            });
+
+                            let block_index = *content_index;
+                            events.push(StreamEvent::ContentBlockStart {
+                                index: block_index,
+                                content_block: ContentBlockStart::ToolUse {
+                                    id,
+                                    name: from_api_tool_name(&name),
+                                    input: json!({}), // arguments follow as InputJsonDelta events
+                                    caller,
+                                },
+                            });
+                            *content_index = (*content_index).saturating_add(1); // reserve the next index for whatever follows
+                            entry.insert(block_index);
+                            block_index
+                        }
+                    };
+
+                    // Stream the (non-empty) argument fragment to the block opened for this tool call
+                    if let Some(args) = tc
+                        .get("function")
+                        .and_then(|f| f.get("arguments"))
+                        .and_then(Value::as_str)
+                        && !args.is_empty()
+                    {
+                        events.push(StreamEvent::ContentBlockDelta {
+                            index: tool_block_index,
+                            delta: Delta::InputJsonDelta {
+                                partial_json: args.to_string(),
+                            },
+                        });
+                    }
+                }
+            }
+        }
+
+        // A finish reason ends the message: close everything still open, then report the stop reason
+        if let Some(reason) = finish_reason {
+            // Close any open text / thinking block (`content_index` is not advanced here)
+            if *text_started {
+                events.push(StreamEvent::ContentBlockStop {
+                    index: *content_index,
+                });
+                *text_started = false;
+            }
+            if *thinking_started {
+                events.push(StreamEvent::ContentBlockStop {
+                    index: *content_index,
+                });
+                *thinking_started = false;
+            }
+            // Close tool blocks in ascending block-index order; draining also empties the map
+            let mut open_tool_indices: Vec<u32> =
+                tool_indices.drain().map(|(_, idx)| idx).collect();
+            open_tool_indices.sort_unstable();
+            for tool_block_index in open_tool_indices {
+                events.push(StreamEvent::ContentBlockStop {
+                    index: tool_block_index,
+                });
+            }
+
+            // Emit usage from the chunk if available (otherwise `usage` is `None`)
+            let chunk_usage = chunk.get("usage").map(|u| parse_usage(Some(u)));
+            events.push(StreamEvent::MessageDelta {
+                delta: MessageDelta {
+                    stop_reason: Some(reason),
+                    stop_sequence: None,
+                },
+                usage: chunk_usage,
+            });
+        }
+    }
+
+    events
+}
diff --git a/crates/tui/src/client/responses.rs b/crates/tui/src/client/responses.rs
new file mode 100644
index 00000000..7a5a0169
--- /dev/null
+++ b/crates/tui/src/client/responses.rs
@@ -0,0 +1,406 @@
+//! Responses API helpers for the experimental DeepSeek endpoint.
+//!
+//! Gated behind `DEEPSEEK_EXPERIMENTAL_RESPONSES_API`. Normal traffic uses
+//! chat completions via `crate::client::chat`.
+
+use anyhow::{Context, Result};
+use serde_json::{Value, json};
+
+use crate::models::{ContentBlock, Message, MessageRequest, MessageResponse, Tool, ToolCaller};
+
+use super::{
+    DeepSeekClient, ERROR_BODY_MAX_BYTES, api_url, apply_reasoning_effort, bounded_error_text,
+    from_api_tool_name, parse_usage, system_to_instructions, to_api_tool_name,
+};
+
+#[derive(Debug)] // Returned by `create_message_responses` when the endpoint answers 404 or 405.
+pub(super) struct ResponsesFallback {
+    pub(super) status: u16, // HTTP status code of that response
+    pub(super) body: String, // error body as read by `bounded_error_text`
+}
+
+impl DeepSeekClient {
+    /// POSTs `request` to the `responses` endpoint. A 404/405 answer is returned as
+    /// `Ok(Err(ResponsesFallback))`; other HTTP, body-read or JSON failures become `Err`.
+    pub(super) async fn create_message_responses(
+        &self,
+        request: &MessageRequest,
+    ) -> Result<Result<MessageResponse, ResponsesFallback>> {
+        let mut body = json!({
+            "model": request.model,
+            "input": build_responses_input(&request.messages),
+            "store": false,
+            "max_output_tokens": request.max_tokens,
+        });
+
+        if let Some(instructions) = system_to_instructions(request.system.clone()) {
+            body["instructions"] = json!(instructions);
+        }
+        if let Some(temperature) = request.temperature {
+            body["temperature"] = json!(temperature);
+        }
+        if let Some(top_p) = request.top_p {
+            body["top_p"] = json!(top_p);
+        }
+        if let Some(tools) = request.tools.as_ref() {
+            body["tools"] = json!(tools.iter().map(tool_to_responses).collect::<Vec<_>>());
+        }
+        if let Some(choice) = request.tool_choice.as_ref() {
+            body["tool_choice"] = choice.clone();
+        }
+        apply_reasoning_effort(
+            &mut body,
+            request.reasoning_effort.as_deref(),
+            self.api_provider,
+        );
+
+        let url = api_url(&self.base_url, "responses");
+        let response = self
+            .send_with_retry(|| self.http_client.post(&url).json(&body))
+            .await?;
+
+        let status = response.status();
+        if status.as_u16() == 404 || status.as_u16() == 405 {
+            let body = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            return Ok(Err(ResponsesFallback {
+                status: status.as_u16(),
+                body,
+            }));
+        }
+        if !status.is_success() {
+            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            anyhow::bail!("Failed to call DeepSeek Responses API: HTTP {status}: {error_text}");
+        }
+        // Surface body-read failures as such instead of parsing an empty string as JSON.
+        let response_text = response.text().await;
+        let response_text = response_text.context("Failed to read Responses API body")?;
+        let value: Value =
+            serde_json::from_str(&response_text).context("Failed to parse Responses API JSON")?;
+        Ok(Ok(parse_responses_message(&value)?))
+    }
+}
+
+fn build_responses_input(messages: &[Message]) -> Vec<Value> {
+    let mut items = Vec::new(); // one input item per content block, in message order
+
+    for message in messages {
+        let role = message.role.as_str();
+        let text_type = if role == "user" { // only the "user" role is sent as input_text
+            "input_text"
+        } else {
+            "output_text"
+        };
+
+        for block in &message.content {
+            match block {
+                ContentBlock::Text { text, .. } => {
+                    items.push(json!({
+                        "type": "message",
+                        "role": role,
+                        "content": [{
+                            "type": text_type,
+                            "text": text,
+                        }]
+                    }));
+                }
+                ContentBlock::ToolUse {
+                    id,
+                    name,
+                    input,
+                    caller,
+                } => {
+                    let args = serde_json::to_string(input).unwrap_or_else(|_| input.to_string());
+                    let mut item = json!({
+                        "type": "function_call",
+                        "call_id": id,
+                        "name": to_api_tool_name(name),
+                        "arguments": args, // JSON-encoded string, not a nested object
+                    });
+                    if let Some(caller) = caller { // caller metadata is forwarded only when set
+                        item["caller"] = json!({
+                            "type": caller.caller_type,
+                            "tool_id": caller.tool_id,
+                        });
+                    }
+                    items.push(item);
+                }
+                ContentBlock::ToolResult {
+                    tool_use_id,
+                    content,
+                    is_error,
+                    ..
+                } => {
+                    let mut item = json!({
+                        "type": "function_call_output",
+                        "call_id": tool_use_id,
+                        "output": content,
+                    });
+                    if let Some(is_error) = is_error { // flag is attached only when present
+                        item["is_error"] = json!(is_error);
+                    }
+                    items.push(item);
+                }
+                ContentBlock::Thinking { .. } => {} // thinking blocks produce no input item
+                ContentBlock::ServerToolUse { id, name, input } => {
+                    items.push(json!({
+                        "type": "server_tool_use",
+                        "id": id,
+                        "name": name,
+                        "input": input,
+                    }));
+                }
+                ContentBlock::ToolSearchToolResult {
+                    tool_use_id,
+                    content,
+                } => {
+                    items.push(json!({
+                        "type": "tool_search_tool_result",
+                        "tool_use_id": tool_use_id,
+                        "content": content,
+                    }));
+                }
+                ContentBlock::CodeExecutionToolResult {
+                    tool_use_id,
+                    content,
+                } => {
+                    items.push(json!({
+                        "type": "code_execution_tool_result",
+                        "tool_use_id": tool_use_id,
+                        "content": content,
+                    }));
+                }
+            }
+        }
+    }
+
+    items
+}
+
+/// Serialises one [`Tool`] into its Responses API JSON form; a missing `tool_type` means
+/// `"function"`. Optional fields are copied onto the object only when they are set.
+fn tool_to_responses(tool: &Tool) -> Value {
+    let api_name = to_api_tool_name(&tool.name);
+    let mut spec = match tool.tool_type.as_deref().unwrap_or("function") {
+        "function" => json!({
+            "type": "function",
+            "name": api_name,
+            "description": tool.description,
+            "parameters": tool.input_schema,
+        }),
+        "code_execution_20250825" => json!({
+            "type": "code_execution_20250825",
+            "name": api_name,
+        }),
+        other => json!({
+            "type": other,
+            "name": api_name,
+            "description": tool.description,
+            "input_schema": tool.input_schema,
+        }),
+    };
+
+    if let Some(callers) = &tool.allowed_callers {
+        spec["allowed_callers"] = json!(callers);
+    }
+    if let Some(defer) = tool.defer_loading {
+        spec["defer_loading"] = json!(defer);
+    }
+    if let Some(examples) = &tool.input_examples {
+        spec["input_examples"] = json!(examples);
+    }
+    if let Some(strict) = tool.strict {
+        spec["strict"] = json!(strict);
+    }
+    spec
+}
+
+fn parse_responses_message(payload: &Value) -> Result<MessageResponse> { // always Ok as written
+    let id = payload
+        .get("id")
+        .and_then(Value::as_str)
+        .unwrap_or("response") // placeholder when the payload has no string id
+        .to_string();
+    let model = payload
+        .get("model")
+        .and_then(Value::as_str)
+        .unwrap_or("unknown")
+        .to_string();
+
+    let usage = parse_usage(payload.get("usage"));
+    let mut content = Vec::new(); // blocks are pushed in the order the output items arrive
+
+    if let Some(output) = payload.get("output").and_then(Value::as_array) {
+        for item in output {
+            let item_type = item.get("type").and_then(Value::as_str).unwrap_or("");
+            match item_type {
+                "message" => {
+                    if let Some(role) = item.get("role").and_then(Value::as_str)
+                        && role != "assistant"
+                    {
+                        continue; // messages with an explicit non-assistant role are skipped
+                    }
+                    if let Some(content_items) = item.get("content").and_then(Value::as_array) {
+                        for content_item in content_items {
+                            let content_type = content_item
+                                .get("type")
+                                .and_then(Value::as_str)
+                                .unwrap_or("output_text");
+                            if content_type != "output_text" && content_type != "text" {
+                                continue;
+                            }
+                            if let Some(text) = content_item.get("text").and_then(Value::as_str)
+                                && !text.trim().is_empty() // whitespace-only text is dropped
+                            {
+                                content.push(ContentBlock::Text {
+                                    text: text.to_string(),
+                                    cache_control: None,
+                                });
+                            }
+                        }
+                    }
+                }
+                "function_call" => {
+                    let call_id = item
+                        .get("call_id")
+                        .or_else(|| item.get("id"))
+                        .and_then(Value::as_str)
+                        .unwrap_or("tool_call")
+                        .to_string();
+                    let name = item
+                        .get("name")
+                        .and_then(Value::as_str)
+                        .unwrap_or("tool")
+                        .to_string();
+                    let input = match item.get("arguments") {
+                        Some(Value::String(raw)) => { // decode JSON text; keep raw string if invalid
+                            serde_json::from_str(raw).unwrap_or_else(|_| Value::String(raw.clone()))
+                        }
+                        Some(other) => other.clone(),
+                        None => Value::Null,
+                    };
+                    let caller = item.get("caller").and_then(|v| {
+                        v.get("type")
+                            .and_then(Value::as_str)
+                            .map(|caller_type| ToolCaller {
+                                caller_type: caller_type.to_string(),
+                                tool_id: v
+                                    .get("tool_id")
+                                    .and_then(Value::as_str)
+                                    .map(std::string::ToString::to_string),
+                            })
+                    });
+                    content.push(ContentBlock::ToolUse {
+                        id: call_id,
+                        name: from_api_tool_name(&name),
+                        input,
+                        caller,
+                    });
+                }
+                "function_call_output" => {
+                    let tool_use_id = item
+                        .get("call_id")
+                        .or_else(|| item.get("tool_use_id"))
+                        .and_then(Value::as_str)
+                        .unwrap_or("tool_call")
+                        .to_string();
+                    let content_text = item
+                        .get("output")
+                        .or_else(|| item.get("content"))
+                        .map(|v| {
+                            if let Some(s) = v.as_str() {
+                                s.to_string()
+                            } else {
+                                v.to_string() // non-string output is kept as its JSON text
+                            }
+                        })
+                        .unwrap_or_default();
+                    let is_error = item.get("is_error").and_then(Value::as_bool);
+                    content.push(ContentBlock::ToolResult {
+                        tool_use_id,
+                        content: content_text,
+                        is_error,
+                        content_blocks: None,
+                    });
+                }
+                "server_tool_use" => {
+                    let id = item
+                        .get("id")
+                        .and_then(Value::as_str)
+                        .unwrap_or("server_tool")
+                        .to_string();
+                    let name = item
+                        .get("name")
+                        .and_then(Value::as_str)
+                        .unwrap_or("server_tool")
+                        .to_string();
+                    let input = item.get("input").cloned().unwrap_or(Value::Null);
+                    content.push(ContentBlock::ServerToolUse { id, name, input });
+                }
+                "tool_search_tool_result" => {
+                    let tool_use_id = item
+                        .get("tool_use_id")
+                        .and_then(Value::as_str)
+                        .unwrap_or("tool_search")
+                        .to_string();
+                    let content_value = item.get("content").cloned().unwrap_or(Value::Null);
+                    content.push(ContentBlock::ToolSearchToolResult {
+                        tool_use_id,
+                        content: content_value,
+                    });
+                }
+                "code_execution_tool_result" => {
+                    let tool_use_id = item
+                        .get("tool_use_id")
+                        .and_then(Value::as_str)
+                        .unwrap_or("code_execution")
+                        .to_string();
+                    let content_value = item.get("content").cloned().unwrap_or(Value::Null);
+                    content.push(ContentBlock::CodeExecutionToolResult {
+                        tool_use_id,
+                        content: content_value,
+                    });
+                }
+                "reasoning" => {
+                    if let Some(summary) = item.get("summary").and_then(Value::as_array) {
+                        let summary_text = summary
+                            .iter()
+                            .filter_map(|s| s.get("text").and_then(Value::as_str))
+                            .collect::<Vec<_>>()
+                            .join("\n"); // summary parts become one newline-separated block
+                        if !summary_text.trim().is_empty() {
+                            content.push(ContentBlock::Thinking {
+                                thinking: summary_text,
+                            });
+                        }
+                    }
+                }
+                _ => {} // unrecognised item types are ignored
+            }
+        }
+    }
+
+    if content.is_empty() // fallback: top-level `output_text`, used only when nothing was parsed
+        && let Some(text) = payload.get("output_text").and_then(Value::as_str)
+        && !text.trim().is_empty()
+    {
+        content.push(ContentBlock::Text {
+            text: text.to_string(),
+            cache_control: None,
+        });
+    }
+
+    Ok(MessageResponse {
+        id,
+        r#type: "message".to_string(),
+        role: "assistant".to_string(),
+        content,
+        model,
+        stop_reason: None, // NOTE(review): never derived from the payload — confirm callers cope
+        stop_sequence: None,
+        container: payload
+            .get("container")
+            .cloned()
+            .and_then(|v| serde_json::from_value(v).ok()), // an undecodable container becomes None
+        usage,
+    })
+}
diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index b0a04928..f7a4f861 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -1450,10 +1450,11 @@ impl Engine {
         if self.config.features.enabled(Feature::WebSearch) {
             builder = builder.with_web_tools();
         }
-        if self.config.features.enabled(Feature::ShellTool)
-            && self.session.allow_shell
-            && mode != AppMode::Plan
-        {
+        // Plan mode now keeps shell available — the existing approval flow
+        // and command-safety classifier gate destructive commands. Writes
+        // and patches stay blocked above; that's the only "destructive"
+        // boundary plan mode enforces by tool registration.
+        if self.config.features.enabled(Feature::ShellTool) && self.session.allow_shell {
             builder = builder.with_shell_tools();
         }
 
diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs
index 6b25a99d..f867cf5a 100644
--- a/crates/tui/src/prompts.rs
+++ b/crates/tui/src/prompts.rs
@@ -141,20 +141,6 @@ mod tests {
     use super::*;
     use tempfile::tempdir;
 
-    #[test]
-    fn plan_prompt_prefers_best_effort_plans_over_clarifying_loops() {
-        let prompt = match system_prompt_for_mode(AppMode::Plan) {
-            SystemPrompt::Text(text) => text,
-            SystemPrompt::Blocks(_) => panic!("expected text system prompt"),
-        };
-
-        assert!(prompt.contains("Default to publishing a best-effort plan immediately."));
-        assert!(prompt.contains("your first action should be update_plan."));
-        assert!(prompt.contains("do not browse the repo first"));
-        assert!(prompt.contains("Do not ask clarifying questions for straightforward requests"));
-        assert!(prompt.contains("If the user asks for \"a 3-step plan\""));
-    }
-
     /// Discriminator unique to the injected handoff block (not present in the
     /// agent prompt's own discussion of the convention).
     const HANDOFF_BLOCK_MARKER: &str = "left a handoff at `.deepseek/handoff.md`";
diff --git a/crates/tui/src/prompts/agent.txt b/crates/tui/src/prompts/agent.txt
index 4fd6c93d..b7834179 100644
--- a/crates/tui/src/prompts/agent.txt
+++ b/crates/tui/src/prompts/agent.txt
@@ -1,164 +1,8 @@
-You are DeepSeek TUI, an agentic coding assistant with full tool access.
+## Mode: agent
 
-IMPORTANT: You are ALREADY running inside the DeepSeek TUI. You have direct access to all tools below - do NOT try to run or launch the CLI binary. Your tools execute directly in the current session.
+Read-only tools (reads, searches, `rlm_query`, agent status queries, git inspection) run silently.
+Any write, patch, shell execution, sub-agent spawn, or CSV batch operation will ask for approval first.
 
-When given a task:
-1. Understand the goal, constraints, and acceptance criteria first.
-2. Break work into small, testable steps and track them with todo tools.
-3. Read and search first, then make targeted edits, then verify with tools.
-4. Report concise progress updates at meaningful checkpoints.
-5. Do not stop until the full task is done or you are clearly blocked.
-6. Avoid destructive actions (deletes, irreversible changes) unless the user explicitly requests them; warn before risky actions and suggest YOLO for high-risk changes.
-
-Tool selection guidance:
-- Prefer grep_files + list_dir to quickly locate relevant files and symbols.
-- Use read_file to confirm context; do not assume file contents.
-- Prefer apply_patch/edit_file for scoped changes instead of rewriting entire files.
-- Use exec_shell for objective verification: build, test, format, lint, and targeted checks.
-- Use web_search when local context is insufficient or time-sensitive, and cite sources as (ref_id).
-
-Web browsing and citations:
-- Use web_search when info might have changed or you are unsure.
-- Cite non-trivial factual claims using (ref_id) (the ref_id returned by web_search).
-- Place citations at the end of the sentence/paragraph they support; do not dump all citations at the end.
-- Quote limits: do not quote more than 25 words verbatim from a single non-lyrical source (10 words for lyrics).
-- Avoid reproducing full articles or large excerpts; prefer short quotes + paraphrase.
-
-Testing and stop conditions:
-- After any change, run the most relevant tests/checks before declaring success.
-- Start narrow (targeted tests) and expand to broader checks when appropriate.
-- If a check fails, report it concisely, fix it, and re-run.
-- Stop when acceptance criteria are met and tests/checks pass, or explain what could not be verified.
-
-Step budgeting:
-- Budget attempts. If 2-3 attempts do not produce progress, reassess and state the blocker or a new plan.
-
-Session handoff (`.deepseek/handoff.md`):
-- If a "Previous Session Handoff" block appears in this prompt, treat it as the first artifact to read for this turn — open blockers, in-flight changes, and recent decisions live there.
-- Before the user explicitly ends the session (or before `/compact` if state is meaningful), write or update `.deepseek/handoff.md` via `write_file`. Cover: active task, open blockers, recent decisions, files touched + why, known broken state, suggested next steps. Keep it short — it's a hand-off, not a transcript.
-
-Available tools:
-
-FILE OPERATIONS (prefer these over `exec_shell` equivalents — they return structured output):
-- read_file: Read a file. PDFs are auto-extracted via pdftotext; pass `pages: "1-5"` to slice.
-- list_dir: List directory contents (structured, gitignore-aware).
-- write_file: Create or overwrite a file.
-- edit_file: Search-and-replace inside a single file. Cheaper than rewriting.
-- apply_patch: Apply a unified diff patch — the right tool for multi-hunk edits.
-
-SEARCH:
-- grep_files: Regex search file contents within the workspace; returns matches + context lines.
-- file_search: Fuzzy-match filenames (NOT contents). Use to locate a file when you know roughly the name.
-- web_search: DuckDuckGo/Bing search; returns ranked snippets with ref_ids for citation.
-- fetch_url: Direct HTTP GET on a known URL (faster than web_search when the link is already known). HTML is stripped to text by default.
-
-USER:
-- request_user_input: Ask the user a short multiple-choice question.
-
-PARALLEL TOOL USE:
-- Issue independent tool calls in parallel by emitting multiple tool_calls in one assistant turn (the model API supports this natively). Do not wrap them in any meta-tool or pseudo-XML.
-
-
-
-
-
-- list_mcp_resources: List MCP resources (optionally filtered by server)
-- list_mcp_resource_templates: List MCP resource templates
-
-GIT AND DIAGNOSTICS:
-- git_status: Inspect repo status safely
-- git_diff: Inspect working tree or staged diffs
-- diagnostics: Report workspace, git, sandbox, and toolchain info
-
-TESTING:
-- run_tests: Run `cargo test` with optional args
-
-SHELL EXECUTION:
-- exec_shell: Run shell commands (supports background execution)
-  - command: The command to execute
-  - timeout_ms: Timeout in milliseconds (default: 120000, max: 600000)
-  - background: Set true to run in background, returns task_id
-  - stdin: Optional stdin data to send before waiting
-  - tty: Allocate a pseudo-terminal (implies background)
-- exec_shell_wait: Poll a background task for incremental output
-- exec_shell_interact: Send stdin to a background task and read incremental output
-
-TASK MANAGEMENT:
-- todo_write: Write or update the todo list
-- update_plan: Publish a structured checklist for complex work
-- note: Record important information
-
-SUB-AGENTS:
-- agent_spawn: Spawn a background sub-agent (type, prompt, allowed_tools).
-- spawn_agents_on_csv: Batch-process CSV rows with one worker sub-agent per row.
-- report_agent_job_result: Worker-only job row report tool for spawn_agents_on_csv.
-- agent_swarm: Spawn a dependency-aware swarm of sub-agents (tasks, shared_context).
-- swarm_status / swarm_result: Inspect a swarm by swarm_id (status; or full results, with optional block/timeout).
-- agent_result: Get result from a sub-agent (agent_id, block, timeout_ms).
-- send_input: Send input to a running sub-agent (agent_id, message/items, interrupt).
-- agent_assign: Update assignment objective/role and optionally push immediate guidance.
-- wait: Wait for one or more sub-agents to complete (ids optional, wait_mode:any|all, timeout_ms).
-- agent_cancel: Cancel a running sub-agent (agent_id).
-- resume_agent: Resume a previously closed/completed sub-agent.
-- agent_list: List all sub-agents and their status.
-Delegation protocol:
-- Delegate only bounded, parallelizable work with a clear input, expected output, and tool limits.
-- Prefer multiple sub-agents for independent steps to maximize parallelism.
-- When spawning/delegating, include explicit assignment metadata: objective + role (worker/explorer/awaiter/default) or agent_type.
-- Use agent_assign to retask active sub-agents instead of respawning when objective/role changes.
-- After spawning, immediately track completion with wait (for groups), swarm_result (for non-blocking swarms), or agent_result (block: true) per agent.
-- For full barriers, use wait with wait_mode="all" and a generous timeout (prefer >= 60000ms). Omit ids to wait on all currently running agents.
-- For spawn_agents_on_csv workers: call report_agent_job_result exactly once per row item; missing reports are treated as failures.
-- Workers may set stop=true in report_agent_job_result to cancel remaining unstarted CSV rows.
-- If sub-agents are still running, wait for their outputs before presenting final conclusions unless the user asked a direct question that needs an immediate reply.
-- Do not present final conclusions until required sub-agent results are collected and integrated.
-- If an agent stalls or fails, retry once with a tighter prompt; otherwise cancel it and continue with an explicit fallback.
-- Close idle agents with close_agent to free capacity; use resume_agent to continue paused/completed assignments when needed.
-- Verify critical sub-agent claims with primary tool output before applying changes.
-
-Planning and progress:
-- For complex or multi-file work, call update_plan to publish a checklist.
-- Keep exactly one plan step in_progress at a time.
-- Use todo tools for granular progress when helpful.
-- Prefer short progress notes over long narration.
-- For long-running tasks, emit checkpoint updates every few actions with: done, next, and blockers.
-- Re-baseline plan/todos at each checkpoint when scope shifts.
-
-Git hygiene:
-- Run git status early (to see the workspace state) and again before finishing.
-- Do not revert or overwrite unrelated user changes.
-- Avoid destructive git commands unless explicitly requested.
-- Do not commit unless the user asks.
-
-BACKGROUND EXECUTION:
-For long-running commands (build, test, server), use exec_shell with background: true.
-This returns a task_id immediately in the tool output.
-Use exec_shell_wait to poll for output, and exec_shell_interact to send stdin (or close stdin).
-Use tty: true for interactive programs that require a TTY.
-
-## Recursive Language Model (RLM) primitive — `rlm_query`
-
-When you need parallel analysis, recursive decomposition, or batched generation, call the `rlm_query` tool. It runs N prompts in parallel against the cheap fast model (`deepseek-v4-flash`) and returns the joined results — much faster and cheaper than doing the work inline.
-
-Two shapes:
-
-- **Single child:** `rlm_query({ "prompt": "Analyze X" })` → returns the response text.
-- **Parallel batch:** `rlm_query({ "prompts": ["Analyze X angle A", "Analyze X angle B", "Analyze X angle C"] })` → returns `[0] ...\n\n---\n\n[1] ...\n\n---\n\n[2] ...`.
-
-Optional fields: `model` (override the child model — set to `"deepseek-v4-pro"` if a child genuinely needs deep reasoning), `system` (shared system prompt for all children), `max_tokens` (per-child cap, default 4096). Hard cap: 16 prompts per call.
-
-### Worked example
-
-User: "Review these three modules for risk."
-
-You call `rlm_query` once with `prompts: ["Review src/foo.rs for risk: <contents>", "Review src/bar.rs for risk: <contents>", "Review src/baz.rs for risk: <contents>"]`. Three flash children run concurrently, the joined result comes back, you synthesise.
-
-For recursive drill-down: call `rlm_query` again with a single `prompt` on the strongest finding from the first call.
-
-Do NOT use RLM when the task requires file-system modification, interactive user input, or is trivial enough for a single sentence.
-
-| Primitive | Use when | Cost | Speed |
-|---|---|---|---|
-| Inline reasoning | Simple Q&A, one-step tasks | Low | Fast |
-| `rlm_query` | Parallel / batched / recursive read-only work | Very low (flash) | Fast |
-| `agent_swarm` | Multi-step autonomous work with tools | Higher | Slower (polling) |
+Before requesting approval for writes, lay out your work with `todo_write` so the user can see what
+you intend to do and approve with context. Complex changes should also get an `update_plan` first.
+Decomposition builds trust — a clear plan gets faster approvals.
\ No newline at end of file
diff --git a/crates/tui/src/prompts/base.txt b/crates/tui/src/prompts/base.txt
index 565e178d..939575dc 100644
--- a/crates/tui/src/prompts/base.txt
+++ b/crates/tui/src/prompts/base.txt
@@ -1,45 +1,34 @@
-You are DeepSeek TUI, an agentic coding assistant.
+You are DeepSeek TUI. You're already running inside it — don't try to launch a `deepseek` or `deepseek-tui` binary.
 
-When given a task:
-1. Understand the goal, constraints, and acceptance criteria first.
-2. Break the work into small, testable steps and track them.
-3. Choose tools deliberately; read before you write, then verify.
-4. Report short progress updates at meaningful checkpoints.
-5. Do not stop until the full task is done or you are clearly blocked.
+## Decomposition Philosophy
 
-Tool selection guidance:
-- Prefer fast search tools (grep/rg) to locate relevant files and symbols.
-- Use read tools to confirm context; avoid guessing about file contents.
-- Prefer targeted edits (apply_patch/edit) over full rewrites when possible.
-- Use shell tools for build/test/format/lint and other objective verification.
-- Use web_search for time-sensitive or uncertain facts; include citations as (ref_id).
-- Issue independent tool calls in parallel (emit multiple tool_calls in a single turn) instead of serializing them.
-- Use request_user_input to ask short multiple-choice questions when needed.
+You are a "managed genius" — you excel at individual tasks, but your superpower is decomposing complex work. **Always decompose before you act.** A few minutes spent planning saves many minutes of thrashing.
 
+Your default workflow for any non-trivial request:
+1. **`todo_write`** — break the work into concrete, verifiable tasks. Mark the first one `in_progress`. This populates the sidebar so the user can see what you're doing.
+2. **Execute** — work through each todo, updating status as you go.
+3. **For complex initiatives**, layer `update_plan` (high-level strategy) above `todo_write` (granular steps).
+4. **For parallel work**, spawn sub-agents (`agent_spawn` / `agent_swarm`) — each does one thing well. Link them to plan/todo items in your thinking.
+5. **For LM-only fan-out** (summarization, classification, analysis across many items), use `rlm_query` for fast parallel inference.
+6. **For persistent cross-session memory**, use `note` sparingly for important decisions, open blockers, and architectural context.
 
-Planning and progress:
-- For non-trivial tasks, publish a checklist with update_plan.
-- Keep exactly one plan step in_progress at a time.
-- Use todo tools for granular progress when helpful.
-- Budget your steps: if 2-3 attempts fail to make progress, pause, reassess, and state the blocker.
+**Key principle**: make your work visible. The sidebar shows Plan / Todos / Tasks / Agents. When these panels are empty, the user has no idea what you're doing. Keep them populated.
 
-Testing and stop conditions:
-- After any change, run the most relevant tests/checks before declaring success.
-- If tests fail, report the failure concisely, fix it, and re-run.
-- Stop when acceptance criteria are met and checks/tests pass (or explain why they could not run).
+## Context
+You have a 1M-token context window. When usage creeps above ~80%, suggest `/compact` to the user — it summarises earlier turns so you can keep working without losing the thread.
 
-Git hygiene:
-- Check git status early and again before finishing.
-- Do not revert or overwrite unrelated user changes.
-- Avoid destructive git commands unless explicitly requested.
-- Do not commit unless the user asks.
+Model notes: DeepSeek V4 models emit *thinking tokens* (`ContentBlock::Thinking`) before final answers. These are invisible to the user but count against context. Cost/token estimates are approximate; treat them as a rough guide.
 
-Approval etiquette:
-- In approval-gated modes, ask before writes or shell commands.
-- In autonomous modes, warn before risky or irreversible actions.
+## Toolbox (fast reference — tool descriptions are authoritative)
 
-Tone: competent, warm, and concise. Use light humor sparingly when it fits; a rare example is "You're absolutely right! ... maybe."
+- **Planning / tracking**: `update_plan` (high-level strategy), `todo_write` (granular task list — use this first), `todo_add` / `todo_update` / `todo_list` (legacy single-item ops), `note` (persistent memory).
+- **File I/O**: `read_file` (PDFs auto-extracted), `list_dir`, `write_file`, `edit_file`, `apply_patch`.
+- **Shell**: `exec_shell` (`background: true` for long jobs), `exec_shell_wait`, `exec_shell_interact`. When exploring code, `rg` / `find` / `git` / `awk` / `sed` pipes are often faster than the structured search tools below.
+- **Structured search**: `grep_files`, `file_search`, `web_search`, `fetch_url`, `web.run` (browse).
+- **Git / diag / tests**: `git_status`, `git_diff`, `git_show`, `git_log`, `git_blame`, `diagnostics`, `run_tests`, `review`.
+- **Sub-agents**: `agent_spawn` (`spawn_agent`, `delegate_to_agent`), `agent_swarm`, `agent_result`, `agent_cancel` (`close_agent`), `agent_list`, `agent_wait` (`wait`), `agent_send_input` (`send_input`), `agent_assign` (`assign_agent`), `resume_agent`.
+- **CSV batch**: `spawn_agents_on_csv`, `report_agent_job_result`.
+- **LM fan-out**: `rlm_query` — `prompts: [...]` runs up to 16 children on the fast cheap model concurrently. Read-only.
+- **Other**: `code_execution` (Python sandbox), `validate_data` (JSON/TOML), `request_user_input`, `finance` (market quotes), `tool_search_tool_regex`, `tool_search_tool_bm25` (deferred tool discovery).
 
-Context Management:
-- You have a finite context window. Keep responses concise and prefer targeted file reads or searches.
-- Long conversations may be compacted into summaries; ask for clarification if critical details are missing.
+Multiple `tool_calls` in one turn run in parallel. `web_search` returns `ref_id`s — cite as `(ref_id)`.
diff --git a/crates/tui/src/prompts/normal.txt b/crates/tui/src/prompts/normal.txt
index 4d66dd8b..922bf8ec 100644
--- a/crates/tui/src/prompts/normal.txt
+++ b/crates/tui/src/prompts/normal.txt
@@ -1,61 +1,6 @@
-You are DeepSeek TUI, a helpful coding assistant running in NORMAL mode.
+## Mode: normal
 
-IMPORTANT: You are ALREADY running inside the DeepSeek TUI. You have direct access to all tools below - do NOT try to run or launch the CLI binary.
+Reads and `rlm_query` run silently. Writes, patches, and shell commands ask for approval.
 
-You help users with coding questions, explanations, debugging, and general programming assistance.
-
-Available tools in this mode:
-- list_dir: Browse directories in the workspace
-- read_file: Read file contents
-- write_file: Create or overwrite a file (ask first)
-- edit_file: Search and replace text in a file (ask first)
-- apply_patch: Apply a unified diff patch (ask first)
-- grep_files: Search files by regex
-- web_search: Quick web search (fallback when citations are not needed)
-- request_user_input: Ask the user short multiple-choice questions
-
-PARALLEL TOOL USE:
-- Issue independent tool calls in parallel by emitting multiple tool_calls in one assistant turn (the model API supports this natively). Do not wrap them in any meta-tool or pseudo-XML.
-
-
-
-
-
-- list_mcp_resources: List MCP resources (optionally filtered by server)
-- list_mcp_resource_templates: List MCP resource templates
-- git_status: Inspect repository status safely
-- git_diff: Inspect diffs (working tree or staged)
-- diagnostics: Report workspace, git, sandbox, and toolchain info
-- run_tests: Run `cargo test` with optional args
-- exec_shell: Run shell commands (ask first, if enabled)
-- exec_shell_wait: Poll a background shell task for incremental output
-- exec_shell_interact: Send stdin to a background shell task (supports TTY sessions)
-- note: Record important information
-- todo_write: Write or update the todo list
-- update_plan: Publish a structured plan
-
-Guidelines:
-1. Understand the goal and constraints before proposing changes.
-2. Prefer tool-centric reasoning: search, read, then act.
-3. Answer clearly and concisely; provide code examples when helpful.
-4. You CAN read files and explore the codebase without approval.
-5. Ask for explicit approval before any file writes, patches, or shell commands.
-6. If the user wants fully autonomous changes, suggest pressing Tab to switch to Agent or YOLO mode.
-
-Tool selection guidance:
-- Use grep_files/list_dir to find relevant files quickly.
-- Use read_file to ground your answer in the actual code.
-- When approved to edit, prefer apply_patch/edit_file for targeted diffs.
-- When approved to run commands, use exec_shell for build/test/format/lint and other objective checks.
-- For long-running or interactive commands, use exec_shell with background: true, then exec_shell_wait/exec_shell_interact for output/input. Use tty: true when a program requires a TTY.
-- When you need up-to-date or uncertain info, use web_search and cite sources as (ref_id).
-
-Testing and stop conditions (after approval to edit/run commands):
-- After any change, run the most relevant tests/checks before declaring success.
-- If a check fails, report it concisely, fix it, and re-run.
-- Stop when acceptance criteria are met and checks pass, or explain what could not be verified.
-
-Step budgeting and progress:
-- For non-trivial tasks, propose a short plan and use update_plan/todo_write when helpful.
-- Provide brief progress updates at key checkpoints, not every small action.
-- If 2-3 attempts fail, pause and ask a focused clarifying question.
+Before requesting writes, use `todo_write` to outline your approach — visible plans build trust.
+For complex work, layer `update_plan` (strategy) above `todo_write` (tactics).
\ No newline at end of file
diff --git a/crates/tui/src/prompts/plan.txt b/crates/tui/src/prompts/plan.txt
index 2700d9eb..e1aa510f 100644
--- a/crates/tui/src/prompts/plan.txt
+++ b/crates/tui/src/prompts/plan.txt
@@ -1,64 +1,8 @@
-You are DeepSeek TUI in PLAN mode. Design before implementing.
+## Mode: plan
 
-This mode is read-only: you can analyze and plan, but you cannot edit files or run shell commands.
+Investigate first, act later. Use `update_plan` to lay out high-level strategy and `todo_write` for
+granular, verifiable steps. All writes and patches are blocked — you can read the world but you
+can't change it. Shell commands go through approval.
 
-In this mode, focus on:
-1. Understanding requirements, constraints, and acceptance criteria fully.
-2. Breaking complex tasks into clear, actionable, testable steps.
-3. Identifying potential issues, regressions, and edge cases upfront.
-4. Creating a detailed plan using update_plan before implementation.
-
-Interaction workflow:
-1. For straightforward planning requests such as "quick plan", "3-step plan", "give me a plan", or review/checklist asks, your first action should be update_plan.
-2. For those straightforward planning requests, do not browse the repo first and do not ask request_user_input unless the user explicitly asks for grounded investigation or you are blocked from producing a credible plan.
-3. Default to publishing a best-effort plan immediately.
-4. Ask clarifying questions with request_user_input only when you are blocked from producing a credible plan without the answer.
-5. Do not ask clarifying questions for straightforward requests such as "give me a plan", "3-step plan", "high-level plan", review/checklist requests, or when reasonable assumptions are acceptable. State those assumptions in the plan instead.
-6. If you do ask, use concise multiple-choice questions with numbered options and clear tradeoffs. Keep it to 1 question unless the first answer still leaves the task blocked.
-7. After emitting update_plan, stop and wait for explicit user approval before implementation.
-
-Available tools:
-
-PLANNING:
-- update_plan: Publish a structured plan with steps and status
-- todo_write: Write or update the todo list
-
-EXPLORATION:
-- list_dir: Browse directories in the workspace
-- read_file: Read file contents to understand context
-- grep_files: Search files by regex
-- rlm_query: Run 1–16 cheap parallel children on `deepseek-v4-flash` for fan-out analysis ("review these 4 angles in parallel"). Pass `prompt` for one call or `prompts: [...]` for batched. Useful when one Pro turn would have to enumerate sequentially.
-- web_search: Quick web search (fallback when citations are not needed)
-- request_user_input: Ask the user short multiple-choice questions
-
-PARALLEL TOOL USE:
-- Issue independent tool calls in parallel by emitting multiple tool_calls in one assistant turn (the model API supports this natively). Do not wrap them in any meta-tool or pseudo-XML.
-
-
-
-
-
-- list_mcp_resources: List MCP resources (optionally filtered by server)
-- list_mcp_resource_templates: List MCP resource templates
-- git_status: Inspect repository status safely
-- git_diff: Inspect diffs to understand current changes
-- diagnostics: Report workspace, git, sandbox, and toolchain info
-
-Guidelines:
-- Prefer tool-centric planning for complex or implementation-grounded requests: use grep_files/list_dir/read_file to ground the plan in the actual codebase when that grounding materially improves the plan.
-- Do not explore the repo just to produce a straightforward quick/high-level plan.
-- Use web_search for time-sensitive or uncertain facts, and cite sources as (ref_id).
-- Use update_plan to create structured plans with one step in_progress at a time.
-- Each step should be specific, actionable, and include expected outcomes.
-- Include explicit verification steps (tests/checks) after each planned change.
-- Include git hygiene in the plan: check git status early and before finishing; avoid reverting unrelated changes.
-- Identify dependencies, risks, edge cases, and rollback/mitigation ideas.
-- Prefer reasonable assumptions over questions when a solid plan is still possible.
-- Treat verification-scope ambiguity as non-blocking: include the assumption in the plan instead of stopping to clarify.
-- Ask clarifying questions only when missing facts would materially change the plan or make it unsafe.
-- Budget steps: if key facts are missing after 2-3 exploration attempts and no reasonable assumption would work, ask a focused clarifying question.
-- Provide concise progress notes, then wait for user direction once the plan is ready.
-
-Examples:
-- If the user asks for "a 3-step plan" or "a quick plan", call update_plan directly and avoid request_user_input.
-- If the user asks to verify UI work, assume code-review-first unless they explicitly ask for runtime/manual testing.
+Use this mode to build a thorough plan. Spawn read-only sub-agents for parallel investigation.
+When the plan is solid, the user will switch modes so you can execute.
\ No newline at end of file
diff --git a/crates/tui/src/prompts/yolo.txt b/crates/tui/src/prompts/yolo.txt
index 591ec4fb..6bd3549f 100644
--- a/crates/tui/src/prompts/yolo.txt
+++ b/crates/tui/src/prompts/yolo.txt
@@ -1,151 +1,8 @@
-You are DeepSeek TUI, an agentic coding assistant with full tool access running in YOLO mode.
+## Mode: yolo
 
-IMPORTANT: You are ALREADY running inside the DeepSeek TUI. You have direct access to all tools below - do NOT try to run or launch the CLI binary. Your tools execute directly in the current session.
+All actions auto-approved. Move fast, but think before you write. If you're about to delete files,
+overwrite user work, or run destructive commands, pause and double-check. The undo button is the user's Git history.
 
-When given a task:
-1. Understand the goal, constraints, and acceptance criteria first.
-2. Break work into small, testable steps and track them with todo tools.
-3. Read and search first, then make targeted edits, then verify with tools.
-4. Report concise progress updates at meaningful checkpoints.
-5. Do not stop until the full task is done or you are clearly blocked.
-6. YOLO mode is auto-approved: execute directly without approval prompts, but avoid unnecessary destructive or irreversible actions.
-
-Tool selection guidance:
-- Prefer grep_files + list_dir to quickly locate relevant files and symbols.
-- Use read_file to confirm context; do not assume file contents.
-- Prefer apply_patch/edit_file for scoped changes instead of rewriting entire files.
-- Use exec_shell for objective verification: build, test, format, lint, and targeted checks.
-- Use web_search when local context is insufficient or time-sensitive, and cite sources as (ref_id).
-
-Web browsing and citations:
-- Use web_search when info might have changed or you are unsure.
-- Cite non-trivial factual claims using (ref_id) (the ref_id returned by web_search).
-- Place citations at the end of the sentence/paragraph they support; do not dump all citations at the end.
-- Quote limits: do not quote more than 25 words verbatim from a single non-lyrical source (10 words for lyrics).
-- Avoid reproducing full articles or large excerpts; prefer short quotes + paraphrase.
-
-Testing and stop conditions:
-- After any change, run the most relevant tests/checks before declaring success.
-- Start narrow (targeted tests) and expand to broader checks when appropriate.
-- If a check fails, report it concisely, fix it, and re-run.
-- Stop when acceptance criteria are met and tests/checks pass, or explain what could not be verified.
-
-Step budgeting:
-- Budget attempts. If 2-3 attempts do not produce progress, reassess and state the blocker or a new plan.
-
-Available tools:
-
-FILE OPERATIONS:
-- list_dir: List directory contents
-- read_file: Read file contents
-- write_file: Create or overwrite a file
-- edit_file: Search and replace text in a file
-- apply_patch: Apply a unified diff patch to a file
-- grep_files: Search files by regex
-- web_search: Quick web search (fallback when citations are not needed)
-- request_user_input: Ask the user short multiple-choice questions
-
-PARALLEL TOOL USE:
-- Issue independent tool calls in parallel by emitting multiple tool_calls in one assistant turn (the model API supports this natively). Do not wrap them in any meta-tool or pseudo-XML.
-
-
-
-
-
-- list_mcp_resources: List MCP resources (optionally filtered by server)
-- list_mcp_resource_templates: List MCP resource templates
-
-GIT AND DIAGNOSTICS:
-- git_status: Inspect repo status safely
-- git_diff: Inspect working tree or staged diffs
-- diagnostics: Report workspace, git, sandbox, and toolchain info
-
-TESTING:
-- run_tests: Run `cargo test` with optional args
-
-SHELL EXECUTION:
-- exec_shell: Run shell commands (supports background execution)
-  - command: The command to execute
-  - timeout_ms: Timeout in milliseconds (default: 120000, max: 600000)
-  - background: Set true to run in background, returns task_id
-  - stdin: Optional stdin data to send before waiting
-  - tty: Allocate a pseudo-terminal (implies background)
-- exec_shell_wait: Poll a background task for incremental output
-- exec_shell_interact: Send stdin to a background task and read incremental output
-
-TASK MANAGEMENT:
-- todo_write: Write or update the todo list
-- update_plan: Publish a structured checklist for complex work
-- note: Record important information
-
-SUB-AGENTS:
-- spawn_agent: Spawn a background sub-agent (agent_type, message/items)
-- agent_spawn: Spawn a background sub-agent (type, prompt, allowed_tools)
-- spawn_agents_on_csv: Batch-process CSV rows with one worker sub-agent per row
-- report_agent_job_result: Worker-only job row report tool for spawn_agents_on_csv
-- agent_swarm: Spawn a dependency-aware swarm of sub-agents (tasks, shared_context)
-- swarm_status: Check status for a previously started swarm (swarm_id)
-- swarm_result: Get full results for a previously started swarm (swarm_id, optional block/timeout)
-- agent_result: Get result from a sub-agent (agent_id, block, timeout_ms)
-- send_input: Send input to a running sub-agent (agent_id, message/items, interrupt)
-- agent_assign / assign_agent: Update assignment objective/role and optionally push immediate guidance
-- wait: Wait for one or more sub-agents to complete (ids optional, wait_mode:any|all, timeout_ms)
-- agent_cancel: Cancel a running sub-agent (agent_id)
-- close_agent: Close a running sub-agent (alias for cancel)
-- resume_agent: Resume a previously closed/completed sub-agent
-- agent_list: List all sub-agents and their status
-Delegation protocol:
-- Delegate only bounded, parallelizable work with a clear input, expected output, and tool limits.
-- Prefer multiple sub-agents for independent steps to maximize parallelism.
-- When spawning/delegating, include explicit assignment metadata: objective + role (worker/explorer/awaiter/default) or agent_type.
-- Use agent_assign to retask active sub-agents instead of respawning when objective/role changes.
-- After spawning, immediately track completion with wait (for groups), swarm_result (for non-blocking swarms), or agent_result (block: true) per agent.
-- For full barriers, use wait with wait_mode="all" and a generous timeout (prefer >= 60000ms). Omit ids to wait on all currently running agents.
-- For spawn_agents_on_csv workers: call report_agent_job_result exactly once per row item; missing reports are treated as failures.
-- Workers may set stop=true in report_agent_job_result to cancel remaining unstarted CSV rows.
-- If sub-agents are still running, wait for their outputs before presenting final conclusions unless the user asked a direct question that needs an immediate reply.
-- Do not present final conclusions until required sub-agent results are collected and integrated.
-- If an agent stalls or fails, retry once with a tighter prompt; otherwise cancel it and continue with an explicit fallback.
-- Close idle agents with close_agent to free capacity; use resume_agent to continue paused/completed assignments when needed.
-- Verify critical sub-agent claims with primary tool output before applying changes.
-
-Planning and progress:
-- For complex or multi-file work, call update_plan to publish a checklist.
-- Keep exactly one plan step in_progress at a time.
-- Use todo tools for granular progress when helpful.
-- Prefer short progress notes over long narration.
-- For long-running tasks, emit checkpoint updates every few actions with: done, next, and blockers.
-- Re-baseline plan/todos at each checkpoint when scope shifts.
-
-Git hygiene:
-- Run git status early (to see the workspace state) and again before finishing.
-- Do not revert or overwrite unrelated user changes.
-- Avoid destructive git commands unless explicitly requested.
-- Do not commit unless the user asks.
-
-BACKGROUND EXECUTION:
-For long-running commands (build, test, server), use exec_shell with background: true.
-This returns a task_id immediately in the tool output.
-Use exec_shell_wait to poll for output, and exec_shell_interact to send stdin (or close stdin).
-Use tty: true for interactive programs that require a TTY.
-
-## Recursive Language Model (RLM) primitive — `rlm_query`
-
-When you need parallel analysis, recursive decomposition, or batched generation, call the `rlm_query` tool. It runs N prompts in parallel against the cheap fast model (`deepseek-v4-flash`) and returns the joined results — much faster and cheaper than doing the work inline.
-
-Two shapes:
-
-- **Single child:** `rlm_query({ "prompt": "Analyze X" })` → returns the response text.
-- **Parallel batch:** `rlm_query({ "prompts": ["Analyze X angle A", "Analyze X angle B", "Analyze X angle C"] })` → returns `[0] ...\n\n---\n\n[1] ...\n\n---\n\n[2] ...`.
-
-Optional fields: `model` (override the child model — set to `"deepseek-v4-pro"` if a child genuinely needs deep reasoning), `system` (shared system prompt for all children), `max_tokens` (per-child cap, default 4096). Hard cap: 16 prompts per call.
-
-For recursive drill-down: call `rlm_query` once for the breakdown, then call it again with a single `prompt` on the strongest finding.
-
-Do NOT use RLM when the task requires file-system modification, interactive user input, or is trivial enough for a single sentence.
-
-| Primitive | Use when | Cost | Speed |
-|---|---|---|---|
-| Inline reasoning | Simple Q&A, one-step tasks | Low | Fast |
-| `rlm_query` | Parallel / batched / recursive read-only work | Very low (flash) | Fast |
-| `agent_swarm` | Multi-step autonomous work with tools | Higher | Slower (polling) |
+Even with auto-approval, call `todo_write` first so your work is visible and trackable in the
+sidebar. Decomposition is not red tape — it's how you organize complex work and demonstrate thoroughness.
+For multi-step initiatives, use `update_plan` + `todo_write` together.
\ No newline at end of file
diff --git a/crates/tui/src/tools/rlm_query.rs b/crates/tui/src/tools/rlm_query.rs
index 9b20f61c..43acb9da 100644
--- a/crates/tui/src/tools/rlm_query.rs
+++ b/crates/tui/src/tools/rlm_query.rs
@@ -5,10 +5,13 @@
 //! the joined result.
 
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Instant;
 
 use async_trait::async_trait;
 use futures_util::future::join_all;
 use serde_json::{Value, json};
+use tracing::debug;
 
 use crate::client::DeepSeekClient;
 use crate::llm_client::LlmClient;
@@ -49,12 +52,11 @@ impl ToolSpec for RlmQueryTool {
     }
 
     fn description(&self) -> &'static str {
-        "Run one or more prompts in parallel against the fast cheap model (deepseek-v4-flash). \
-         Use for fan-out analysis, batched review, or cheap parallel decomposition: pass `prompts` \
-         as an array to run them concurrently, or `prompt` for a single call. Each child runs \
-         in isolation with its own (optional) system prompt; results come back as `[i] <text>` \
-         joined blocks (or just the text when there's one prompt). Cheaper than spawning sub-agents \
-         for read-only reasoning work."
+        "Run up to 16 prompts concurrently against the fast cheap model (deepseek-v4-flash) \
+         and return the joined results. Pass `prompts: [...]` for a parallel batch or \
+         `prompt` for a single child. Children run in isolation with an optional shared \
+         `system` prompt; results come back as `[i] <text>` blocks separated by `---` (or \
+         just the text for N=1). Read-only — no file or shell side-effects."
     }
 
     fn input_schema(&self) -> Value {
@@ -142,12 +144,33 @@ impl ToolSpec for RlmQueryTool {
         let client = Arc::new(client);
         let model = Arc::new(model);
         let system = Arc::new(system);
+        let total = prompts.len();
+        // Tracks the peak concurrent in-flight child count for this fan-out.
+        // Useful as evidence that join_all actually overlaps requests rather
+        // than walking through them serially. Surfaces as the `peak` field of
+        // the summary log under `RUST_LOG=deepseek_cli::tools=debug`.
+        let in_flight = Arc::new(AtomicUsize::new(0));
+        let peak = Arc::new(AtomicUsize::new(0));
+        let dispatch_started = Instant::now();
 
         let futures = prompts.into_iter().enumerate().map(|(idx, prompt)| {
             let client = Arc::clone(&client);
             let model = Arc::clone(&model);
             let system = Arc::clone(&system);
+            let in_flight = Arc::clone(&in_flight);
+            let peak = Arc::clone(&peak);
             async move {
+                let prior = in_flight.fetch_add(1, Ordering::Relaxed);
+                let now = prior + 1;
+                peak.fetch_max(now, Ordering::Relaxed);
+                debug!(
+                    target: "deepseek_cli::tools",
+                    tool = "rlm_query",
+                    idx,
+                    in_flight = now,
+                    "child request start"
+                );
+                let started = Instant::now();
                 let request = MessageRequest {
                     model: (*model).clone(),
                     messages: vec![Message {
@@ -168,11 +191,31 @@ impl ToolSpec for RlmQueryTool {
                     temperature: Some(0.4),
                     top_p: Some(0.9),
                 };
-                (idx, client.create_message(request).await)
+                let response = client.create_message(request).await;
+                let elapsed_ms = started.elapsed().as_millis() as u64;
+                in_flight.fetch_sub(1, Ordering::Relaxed);
+                debug!(
+                    target: "deepseek_cli::tools",
+                    tool = "rlm_query",
+                    idx,
+                    elapsed_ms,
+                    ok = response.is_ok(),
+                    "child request done"
+                );
+                (idx, response)
             }
         });
 
         let results = join_all(futures).await;
+        let dispatch_elapsed_ms = dispatch_started.elapsed().as_millis() as u64;
+        debug!(
+            target: "deepseek_cli::tools",
+            tool = "rlm_query",
+            total,
+            peak = peak.load(Ordering::Relaxed),
+            dispatch_elapsed_ms,
+            "fan-out complete"
+        );
 
         let mut ordered: Vec<(usize, String)> = results
             .into_iter()
diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs
index 0e3153c1..9e987b8c 100644
--- a/crates/tui/src/tools/subagent/mod.rs
+++ b/crates/tui/src/tools/subagent/mod.rs
@@ -1011,7 +1011,8 @@ impl ToolSpec for AgentSpawnTool {
     }
 
     fn description(&self) -> &'static str {
-        "Spawn a background sub-agent to handle a focused task. Returns an agent_id immediately; follow with agent_result to retrieve the result."
+        "Spawn a background sub-agent for a focused task. Returns an agent_id immediately; \
+         follow with agent_result to retrieve the final result."
     }
 
     fn input_schema(&self) -> Value {
@@ -1136,7 +1137,8 @@ impl ToolSpec for AgentResultTool {
     }
 
     fn description(&self) -> &'static str {
-        "Get the latest status or final result for a sub-agent."
+        "Get the latest status or final result for a sub-agent. Set `block: true` to wait until the \
+         agent reaches a terminal state (respects `timeout_ms`)."
     }
 
     fn input_schema(&self) -> Value {
@@ -1224,7 +1226,7 @@ impl ToolSpec for AgentCancelTool {
     }
 
     fn description(&self) -> &'static str {
-        "Cancel a running sub-agent."
+        "Cancel a running sub-agent. Returns the final snapshot with the cancelled status."
     }
 
     fn input_schema(&self) -> Value {
@@ -1411,7 +1413,8 @@ impl ToolSpec for AgentListTool {
     }
 
     fn description(&self) -> &'static str {
-        "List all active and completed sub-agents with their status."
+        "List all active and recently completed sub-agents with their status, type, assignment, \
+         steps taken, and duration."
     }
 
     fn input_schema(&self) -> Value {
@@ -1458,7 +1461,7 @@ impl ToolSpec for AgentSendInputTool {
     }
 
     fn description(&self) -> &'static str {
-        "Send input to a running sub-agent."
+        "Send input to a running sub-agent. Returns the agent's current snapshot after delivery."
     }
 
     fn input_schema(&self) -> Value {
@@ -1542,7 +1545,9 @@ impl ToolSpec for AgentAssignTool {
     }
 
     fn description(&self) -> &'static str {
-        "Update a sub-agent assignment and optionally send an immediate instruction."
+        "Update a sub-agent's assignment (objective, role) and optionally deliver an immediate \
+         coordinator note. The update is delivered as a high-priority message when `interrupt` is \
+         true (the default). Returns the agent's current snapshot."
     }
 
     fn input_schema(&self) -> Value {
@@ -1634,7 +1639,9 @@ impl ToolSpec for AgentWaitTool {
     }
 
     fn description(&self) -> &'static str {
-        "Wait for one or more sub-agents to reach a terminal status."
+        "Wait for one or more sub-agents to reach a terminal status. Use `wait_mode: \"all\"` to block \
+         until every listed agent finishes, or `wait_mode: \"any\"` (default) to return as soon as \
+         one finishes. When no ids are given, waits on all currently running sub-agents."
     }
 
     fn input_schema(&self) -> Value {
@@ -1777,7 +1784,8 @@ impl ToolSpec for DelegateToAgentTool {
     }
 
     fn description(&self) -> &'static str {
-        "Delegate a task to a specialized sub-agent. This is an alias for agent_spawn."
+        "Delegate a task to a specialized sub-agent. This is an alias for agent_spawn — same schema, \
+         same behavior. Use `type` (or `agent_name`, `agent_type`) to pick the agent flavor."
     }
 
     fn input_schema(&self) -> Value {
diff --git a/crates/tui/src/tools/swarm.rs b/crates/tui/src/tools/swarm.rs
index 71d98dc9..658e9f06 100644
--- a/crates/tui/src/tools/swarm.rs
+++ b/crates/tui/src/tools/swarm.rs
@@ -281,7 +281,9 @@ impl ToolSpec for AgentSwarmTool {
     }
 
     fn description(&self) -> &'static str {
-        "Spawn multiple sub-agents with optional dependencies and aggregate their results."
+        "Spawn multiple sub-agents in parallel, each with their own tools and optional task \
+         dependencies, and aggregate their results. Returns a swarm_id; results come back via \
+         swarm_result or wait."
     }
 
     fn input_schema(&self) -> Value {
@@ -463,7 +465,8 @@ impl ToolSpec for SwarmStatusTool {
     }
 
     fn description(&self) -> &'static str {
-        "Get the latest status for a previously spawned swarm."
+        "Get the latest status snapshot for a previously spawned swarm — status, task counts, \
+         and elapsed duration, without pulling full per-task results."
     }
 
     fn input_schema(&self) -> Value {
@@ -517,7 +520,8 @@ impl ToolSpec for SwarmResultTool {
     }
 
     fn description(&self) -> &'static str {
-        "Get full outcomes for a previously spawned swarm."
+        "Get full outcomes for a previously spawned swarm. Use `block: true` to wait for completion; \
+         returns task-level results, durations, errors, and aggregated counts."
     }
 
     fn input_schema(&self) -> Value {
diff --git a/crates/tui/src/tui/active_cell.rs b/crates/tui/src/tui/active_cell.rs
index cfe9239a..ffd3db57 100644
--- a/crates/tui/src/tui/active_cell.rs
+++ b/crates/tui/src/tui/active_cell.rs
@@ -353,6 +353,7 @@ mod tests {
             status: ToolStatus::Running,
             input_summary: None,
             output: None,
+            prompts: None,
         }))
     }
 
diff --git a/crates/tui/src/tui/history.rs b/crates/tui/src/tui/history.rs
index 54a4c8b1..f25e4681 100644
--- a/crates/tui/src/tui/history.rs
+++ b/crates/tui/src/tui/history.rs
@@ -898,6 +898,10 @@ pub struct GenericToolCell {
     pub status: ToolStatus,
     pub input_summary: Option<String>,
     pub output: Option<String>,
+    /// When the tool is `rlm_query` (or any future fan-out tool that exposes a
+    /// list of child prompts), each prompt is shown on its own indented row
+    /// instead of the inline `args:` summary. `None` for ordinary tools.
+    pub prompts: Option<Vec<String>>,
 }
 
 impl GenericToolCell {
@@ -917,15 +921,37 @@ impl GenericToolCell {
             tool_value_style(),
             width,
         ));
-        let show_args = matches!(self.status, ToolStatus::Running) || self.output.is_none();
-        if show_args && let Some(summary) = self.input_summary.as_ref() {
-            lines.extend(render_compact_kv(
-                "args",
-                summary,
-                tool_value_style(),
-                width,
-            ));
+
+        // Prefer per-prompt rows over the generic args summary when the tool
+        // exposes a list of child prompts (rlm_query). One row per child with
+        // a `[i]` index makes the fan-out legible without expanding JSON.
+        let show_prompts = matches!(self.status, ToolStatus::Running) || self.output.is_none();
+        if show_prompts
+            && let Some(prompts) = self.prompts.as_ref()
+            && !prompts.is_empty()
+        {
+            for (idx, prompt) in prompts.iter().enumerate() {
+                let label = if idx == 0 { "prompts" } else { "" };
+                let value = format!("[{idx}] {}", truncate_text(prompt.trim(), 200));
+                lines.extend(render_card_detail_line(
+                    if label.is_empty() { None } else { Some(label) },
+                    &value,
+                    tool_value_style(),
+                    width,
+                ));
+            }
+        } else {
+            let show_args = matches!(self.status, ToolStatus::Running) || self.output.is_none();
+            if show_args && let Some(summary) = self.input_summary.as_ref() {
+                lines.extend(render_compact_kv(
+                    "args",
+                    summary,
+                    tool_value_style(),
+                    width,
+                ));
+            }
         }
+
         if let Some(output) = self.output.as_ref() {
             lines.extend(render_compact_kv(
                 "result",
@@ -1786,8 +1812,8 @@ fn thinking_state_accent(state: ThinkingVisualState) -> Color {
 #[cfg(test)]
 mod tests {
     use super::{
-        ExecCell, ExecSource, HistoryCell, PlanStep, PlanUpdateCell, TOOL_RUNNING_SYMBOLS,
-        TOOL_STATUS_SYMBOL_MS, ToolCell, ToolStatus, TranscriptRenderOptions,
+        ExecCell, ExecSource, GenericToolCell, HistoryCell, PlanStep, PlanUpdateCell,
+        TOOL_RUNNING_SYMBOLS, TOOL_STATUS_SYMBOL_MS, ToolCell, ToolStatus, TranscriptRenderOptions,
         extract_reasoning_summary, render_thinking, running_status_label_with_elapsed,
     };
     use crate::deepseek_theme::Theme;
@@ -2182,4 +2208,48 @@ mod tests {
         let last = format!("output line {:02}", total_output_lines - 1);
         assert!(transcript_text.contains(&last));
     }
+
+    #[test]
+    fn generic_tool_cell_renders_rlm_prompts_as_indexed_rows() {
+        // While the cell is still running, populated prompts (rlm_query
+        // fan-out) render as one `[i]` row per child instead of the inline
+        // `args:` summary, so the user can read what each child was asked.
+        let cell = HistoryCell::Tool(ToolCell::Generic(GenericToolCell {
+            name: "rlm_query".to_string(),
+            status: ToolStatus::Running,
+            input_summary: Some("prompts: <3 items>".to_string()),
+            output: None,
+            prompts: Some(vec![
+                "Summarize the README".to_string(),
+                "List the public types in client.rs".to_string(),
+                "Diff this commit against main".to_string(),
+            ]),
+        }));
+        let text = lines_text(&cell.lines(80));
+
+        assert!(text.contains("[0] Summarize the README"));
+        assert!(text.contains("[1] List the public types in client.rs"));
+        assert!(text.contains("[2] Diff this commit against main"));
+        // The inline args summary must not also be emitted — the per-child
+        // rows replace it rather than sit next to it.
+        assert!(
+            !text.contains("args: prompts:"),
+            "inline `args:` summary must be suppressed when per-prompt rows render"
+        );
+    }
+
+    #[test]
+    fn generic_tool_cell_falls_back_to_args_when_prompts_none() {
+        // With `prompts: None` the renderer falls back to the inline `args:`
+        // summary, so non-fan-out tools keep their existing summary row.
+        let cell = HistoryCell::Tool(ToolCell::Generic(GenericToolCell {
+            name: "file_search".to_string(),
+            status: ToolStatus::Running,
+            input_summary: Some("query: foo".to_string()),
+            output: None,
+            prompts: None,
+        }));
+        let text = lines_text(&cell.lines(80));
+        assert!(text.contains("query: foo"));
+    }
 }
diff --git a/crates/tui/src/tui/markdown_render.rs b/crates/tui/src/tui/markdown_render.rs
index bc98da0d..4a427df5 100644
--- a/crates/tui/src/tui/markdown_render.rs
+++ b/crates/tui/src/tui/markdown_render.rs
@@ -24,9 +24,7 @@
 //! cell and re-runs only the render step on width changes. That makes resize a
 //! re-flow operation rather than a re-parse + re-flow operation.
 
-use std::sync::Arc;
-
-#[cfg(any(test, feature = "perf-counters"))]
+#[cfg(test)]
 use std::cell::Cell;
 
 use ratatui::style::{Modifier, Style};
@@ -37,21 +35,20 @@ use crate::palette;
 
 // Thread-local counter incremented every time `parse` runs. Used by tests to
 // prove that width-only changes hit the cached-AST path and skip parsing.
-// Available in test builds and behind the `perf-counters` feature flag so
-// release builds pay no cost. Thread-local (not global atomic) so concurrent
-// tests calling `parse()` can't pollute each other's counters.
-#[cfg(any(test, feature = "perf-counters"))]
+// Thread-local (not global atomic) so concurrent tests calling `parse()` can't
+// pollute each other's counters.
+#[cfg(test)]
 thread_local! {
     static PARSE_INVOCATIONS: Cell<u64> = const { Cell::new(0) };
 }
 
-#[cfg(any(test, feature = "perf-counters"))]
+#[cfg(test)]
 #[must_use]
 pub fn parse_invocation_count() -> u64 {
     PARSE_INVOCATIONS.with(|c| c.get())
 }
 
-#[cfg(any(test, feature = "perf-counters"))]
+#[cfg(test)]
 pub fn reset_parse_invocation_count() {
     PARSE_INVOCATIONS.with(|c| c.set(0));
 }
@@ -87,20 +84,6 @@ pub struct ParsedMarkdown {
     blocks: Vec<Block>,
 }
 
-impl ParsedMarkdown {
-    /// Borrow the parsed blocks (mostly useful for tests).
-    #[must_use]
-    pub fn blocks(&self) -> &[Block] {
-        &self.blocks
-    }
-
-    /// Whether the parse was empty (no source at all).
-    #[must_use]
-    pub fn is_empty(&self) -> bool {
-        self.blocks.is_empty()
-    }
-}
-
 /// Parse markdown source into a width-independent block AST.
 ///
 /// This is a small line-oriented parser tuned for the patterns we render:
@@ -110,7 +93,7 @@ impl ParsedMarkdown {
 /// classify as `Block::Paragraph`.
 #[must_use]
 pub fn parse(content: &str) -> ParsedMarkdown {
-    #[cfg(any(test, feature = "perf-counters"))]
+    #[cfg(test)]
     PARSE_INVOCATIONS.with(|c| c.set(c.get() + 1));
 
     let mut blocks = Vec::new();
@@ -236,16 +219,6 @@ pub fn render_markdown(content: &str, width: u16, base_style: Style) -> Vec<Line
     render_parsed(&parsed, width, base_style)
 }
 
-/// Cache-friendly parsed AST for [`HistoryCell`] rendering.
-///
-/// Wraps the `ParsedMarkdown` in `Arc` so the transcript cache can hand the
-/// same parse to many render passes (e.g. across spacers / overlays) without
-/// reallocation.
-#[must_use]
-pub fn parse_arc(content: &str) -> Arc<ParsedMarkdown> {
-    Arc::new(parse(content))
-}
-
 fn parse_heading(line: &str) -> Option<(usize, &str)> {
     let trimmed = line.trim_start();
     let hashes = trimmed.chars().take_while(|c| *c == '#').count();
@@ -514,7 +487,7 @@ mod tests {
     #[test]
     fn fenced_code_block_collected_in_parse() {
         let parsed = parse("text\n```\ncode line one\ncode line two\n```\nmore\n");
-        let blocks = parsed.blocks();
+        let blocks = &parsed.blocks;
         // text paragraph, two code lines, more paragraph (fences are dropped)
         let code_lines: Vec<_> = blocks
             .iter()
@@ -530,7 +503,7 @@ mod tests {
     fn ordered_and_unordered_list_items_parse() {
         let parsed = parse("- alpha\n* beta\n1. gamma\n");
         let items: Vec<_> = parsed
-            .blocks()
+            .blocks
             .iter()
             .filter_map(|b| match b {
                 Block::ListItem { bullet, text } => Some((bullet.as_str(), text.as_str())),
diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs
index 43e07916..5308b16c 100644
--- a/crates/tui/src/tui/ui.rs
+++ b/crates/tui/src/tui/ui.rs
@@ -95,7 +95,11 @@ const CONTEXT_WARNING_THRESHOLD_PERCENT: f64 = 85.0;
 const CONTEXT_CRITICAL_THRESHOLD_PERCENT: f64 = 95.0;
 const UI_IDLE_POLL_MS: u64 = 48;
 const UI_ACTIVE_POLL_MS: u64 = 24;
-const UI_STATUS_ANIMATION_MS: u64 = 360;
+// Forced repaint cadence while a turn is live (model loading, compacting,
+// sub-agents running). Drives the footer water-spout animation as well as
+// the per-tool spinner pulse — keep this fast enough that the spout reads as
+// motion (~12 fps) instead of teleport-frames.
+const UI_STATUS_ANIMATION_MS: u64 = 80;
 const WORKSPACE_CONTEXT_REFRESH_SECS: u64 = 15;
 const SIDEBAR_VISIBLE_MIN_WIDTH: u16 = 100;
 
@@ -3068,7 +3072,7 @@ fn render_footer(f: &mut Frame, area: Rect, app: &mut App) {
         Vec::new()
     };
 
-    let props = FooterProps::from_app(
+    let mut props = FooterProps::from_app(
         app,
         toast,
         state_label,
@@ -3079,11 +3083,35 @@ fn render_footer(f: &mut Frame, area: Rect, app: &mut App) {
         cache,
         cost,
     );
+
+    // Animate the spacer between the left status line and the right-hand
+    // chips whenever a turn is live: model loading/streaming, compacting, or
+    // sub-agents in flight. Honors the `low_motion` setting — calm terminals
+    // get the plain whitespace gap. Frame counter ticks every 80 ms; the
+    // renderer is deterministic given the frame, so tests can pin specific
+    // frames. Computed independently of `state_label` so removing the
+    // "thinking" text label doesn't kill the visual signal.
+    if !app.low_motion && footer_working_strip_active(app) {
+        let frame = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64 / 80)
+            .unwrap_or(0);
+        props.working_strip_frame = Some(frame);
+    }
+
     let widget = FooterWidget::new(props);
     let buf = f.buffer_mut();
     widget.render(area, buf);
 }
 
+/// Whether the footer should animate the water-spout strip: true while the
+/// model is loading, compacting, or any sub-agent runs. Reads the live-work
+/// flags, not a status-label string, so label edits can't disable it.
+fn footer_working_strip_active(app: &App) -> bool {
+    let turn_busy = app.is_loading || app.is_compacting;
+    turn_busy || running_agent_count(app) > 0
+}
+
 /// Test-only helper retained as a parity reference for `FooterWidget`'s
 /// auxiliary-span composition. Production rendering is performed by the
 /// widget itself; the existing footer parity tests still exercise this
@@ -3257,9 +3285,12 @@ fn footer_state_label(app: &App) -> (&'static str, ratatui::style::Color) {
     if app.is_compacting {
         return ("compacting \u{238B}", palette::STATUS_WARNING);
     }
-    if app.is_loading {
-        return ("thinking \u{238B}", palette::STATUS_WARNING);
-    }
+    // Note: we deliberately do NOT show a "thinking" label for `is_loading`.
+    // The animated water-spout strip in the footer's spacer is the visual
+    // signal that the model is live; "thinking" was misleading because it
+    // fired for every kind of in-flight work (tool calls, streaming, etc.),
+    // not strictly reasoning. Sub-agents still surface "working" because
+    // that's a distinct lifecycle the user can act on (open `/agents`).
     if running_agent_count(app) > 0 {
         return ("working", palette::DEEPSEEK_SKY);
     }
@@ -4276,6 +4307,7 @@ fn handle_tool_call_started(app: &mut App, id: &str, name: &str, input: &serde_j
     }
 
     let input_summary = summarize_tool_args(input);
+    let prompts = extract_fanout_prompts(name, input);
     push_active_tool_cell(
         app,
         &id,
@@ -4286,10 +4318,35 @@ fn handle_tool_call_started(app: &mut App, id: &str, name: &str, input: &serde_j
             status: ToolStatus::Running,
             input_summary,
             output: None,
+            prompts,
         })),
     );
 }
 
+/// Collect the child prompts of an `rlm_query` call so the renderer can show
+/// one row per child; every other tool yields `None`. A `prompts` array wins
+/// (non-string entries are skipped; `None` if no strings remain), otherwise a
+/// string `prompt` becomes a one-element list.
+fn extract_fanout_prompts(name: &str, input: &serde_json::Value) -> Option<Vec<String>> {
+    if name != "rlm_query" {
+        return None;
+    }
+    match input.get("prompts").and_then(serde_json::Value::as_array) {
+        Some(entries) => {
+            let collected: Vec<String> = entries
+                .iter()
+                .filter_map(serde_json::Value::as_str)
+                .map(str::to_owned)
+                .collect();
+            (!collected.is_empty()).then_some(collected)
+        }
+        None => input
+            .get("prompt")
+            .and_then(serde_json::Value::as_str)
+            .map(|single| vec![single.to_owned()]),
+    }
+}
+
 /// Push a tool cell as a new entry in `active_cell`, register the tool id,
 /// and write a stub detail record so the pager / Ctrl+O can find it.
 fn push_active_tool_cell(
@@ -4585,6 +4642,7 @@ fn push_orphan_tool_completion(
         status,
         input_summary: None,
         output,
+        prompts: None,
     })));
     let cell_index = app.history.len().saturating_sub(1);
     app.tool_details_by_cell.insert(
diff --git a/crates/tui/src/tui/ui/tests.rs b/crates/tui/src/tui/ui/tests.rs
index 053b0026..6b8175d1 100644
--- a/crates/tui/src/tui/ui/tests.rs
+++ b/crates/tui/src/tui/ui/tests.rs
@@ -349,19 +349,27 @@ fn format_context_budget_caps_overflow_display() {
 }
 
 #[test]
-fn footer_state_label_prefers_compacting_then_thinking() {
+fn footer_state_label_drops_thinking_and_prefers_compacting() {
+    // We deliberately do not surface a "thinking" label for `is_loading` —
+    // the animated water-spout strip in the footer's spacer is the visual
+    // signal. `is_loading` alone falls through to "ready"; `is_compacting`
+    // still wins because compacting is a less-common, distinct state.
     let mut app = create_test_app();
     assert_eq!(footer_state_label(&app).0, "ready");
 
     app.is_loading = true;
-    assert!(footer_state_label(&app).0.starts_with("thinking"));
+    assert_eq!(
+        footer_state_label(&app).0,
+        "ready",
+        "is_loading must NOT produce a `thinking` text label — the animation handles it"
+    );
 
     app.is_compacting = true;
     assert!(footer_state_label(&app).0.starts_with("compacting"));
 }
 
 #[test]
-fn footer_status_line_spans_show_mode_model_and_status() {
+fn footer_status_line_spans_show_mode_and_model_idle_and_active() {
     let mut app = create_test_app();
     app.model = "deepseek-v4-flash".to_string();
 
@@ -371,11 +379,17 @@ fn footer_status_line_spans_show_mode_model_and_status() {
     assert!(idle.contains("\u{00B7}"));
     assert!(!idle.contains("ready"));
 
+    // is_loading no longer adds a "thinking" text label — the live-work
+    // signal is the animated water-spout strip the renderer paints into
+    // the footer's spacer. The mode + model still render unchanged.
     app.is_loading = true;
     let active = spans_text(&footer_status_line_spans(&app, 60));
     assert!(active.contains("agent"));
     assert!(active.contains("deepseek-v4-flash"));
-    assert!(active.contains("thinking"));
+    assert!(
+        !active.contains("thinking"),
+        "footer must not show a `thinking` text label while loading"
+    );
 }
 
 #[test]
@@ -896,6 +910,7 @@ fn jump_to_adjacent_tool_cell_finds_next_and_previous() {
             status: ToolStatus::Success,
             input_summary: Some("query: foo".to_string()),
             output: Some("done".to_string()),
+            prompts: None,
         })),
         HistoryCell::Assistant {
             content: "ok".to_string(),
@@ -906,6 +921,7 @@ fn jump_to_adjacent_tool_cell_finds_next_and_previous() {
             status: ToolStatus::Success,
             input_summary: Some("ls".to_string()),
             output: Some("...".to_string()),
+            prompts: None,
         })),
     ];
     app.mark_history_updated();
diff --git a/crates/tui/src/tui/widgets/footer.rs b/crates/tui/src/tui/widgets/footer.rs
index fa59606d..cb0a4fa9 100644
--- a/crates/tui/src/tui/widgets/footer.rs
+++ b/crates/tui/src/tui/widgets/footer.rs
@@ -49,6 +49,54 @@ pub struct FooterProps {
     pub cost: Vec<Span<'static>>,
     /// Optional toast that, when present, replaces the left status line.
     pub toast: Option<FooterToast>,
+    /// When `Some(frame_idx)`, the gap between the left status line and the
+    /// right-hand chips is filled with an animated water-spout strip keyed
+    /// off `frame_idx` (deterministic given the frame). `None` keeps the gap
+    /// as plain whitespace, which is the idle/ready state.
+    pub working_strip_frame: Option<u64>,
+}
+
+/// One frame of the footer's water-spout animation. `col` is the cell index
+/// inside the strip, `width` the strip's total width, `frame` the discrete
+/// frame counter. Returns the glyph for that cell on that frame, or `' '`
+/// when `width` is 0.
+///
+/// Visual: a calm water line of `─` with one upward spout glyph, flanked by
+/// a splash stub on either side, that bounces between the strip's two ends.
+/// Purely deterministic so the test suite can pin a specific frame.
+#[must_use]
+pub fn footer_working_strip_glyph_at(col: usize, width: usize, frame: u64) -> char {
+    if width == 0 {
+        return ' ';
+    }
+    // Right-most cell index; `usize` -> `u64` is lossless on supported targets.
+    let last = (width - 1) as u64;
+    // Triangle wave over [0, last] with period `2 * last`: 0, 1, …, last,
+    // last - 1, …, 1, then repeat. No endpoint is visited twice in a row, so
+    // the spout moves on every frame whenever `width >= 2`. `.max(1)` keeps
+    // the modulus non-zero for a one-cell strip.
+    let period = last.saturating_mul(2).max(1);
+    let t = frame % period;
+    let pos = if t <= last { t } else { period - t };
+
+    match (col as u64).abs_diff(pos) {
+        0 => '\u{257F}', // ╿  — vertical bar with a stronger top half: a spout standing up out of the surface
+        1 => '\u{2576}', // ╶  — short stub on the spout's shoulder, like a splash
+        _ => '\u{2500}', // ─  — calm water surface
+    }
+}
+
+/// Render one complete frame of the water-spout strip as a `String`.
+///
+/// Produces exactly `width` chars — one [`footer_working_strip_glyph_at`]
+/// glyph per column, left to right — so a `width` of 0 gives an empty
+/// string. The caller places the result in a `Span` between the footer's
+/// left and right segments.
+#[must_use]
+pub fn footer_working_strip_string(width: usize, frame: u64) -> String {
+    (0..width)
+        .map(|column| footer_working_strip_glyph_at(column, width, frame))
+        .collect()
 }
 
 /// Build a "N agents" chip span list when there are sub-agents in flight.
@@ -112,6 +160,7 @@ impl FooterProps {
             cache,
             cost,
             toast,
+            working_strip_frame: None,
         }
     }
 }
@@ -253,8 +302,18 @@ impl Renderable for FooterWidget {
         let left_width = span_width(&left_spans);
         let spacer_width = available_width.saturating_sub(left_width + right_width);
 
+        // When a turn is in flight, fill the gap with a thin animated water-
+        // spout strip; otherwise the gap stays as plain whitespace.
+        let spacer_span = match self.props.working_strip_frame {
+            Some(frame) if spacer_width > 0 => Span::styled(
+                footer_working_strip_string(spacer_width, frame),
+                Style::default().fg(palette::DEEPSEEK_SKY),
+            ),
+            _ => Span::raw(" ".repeat(spacer_width)),
+        };
+
         let mut all_spans = left_spans;
-        all_spans.push(Span::raw(" ".repeat(spacer_width)));
+        all_spans.push(spacer_span);
         all_spans.extend(right_spans);
 
         let paragraph = Paragraph::new(Line::from(all_spans));
@@ -467,6 +526,73 @@ mod tests {
         assert!(!rendered.contains("ready"));
     }
 
+    #[test]
+    fn working_strip_string_width_matches_request() {
+        // The strip must produce exactly `width` characters per frame —
+        // otherwise the spacer math in `FooterWidget::render` would
+        // mis-align the right-hand chips. (Glyphs are all box-drawing
+        // characters, so one char is assumed to fill one terminal column.)
+        for width in [0usize, 1, 8, 60, 200] {
+            let s = super::footer_working_strip_string(width, 7);
+            assert_eq!(s.chars().count(), width, "width {width} mismatch");
+        }
+    }
+
+    #[test]
+    fn working_strip_glyph_is_deterministic_per_frame() {
+        // Same (width, frame) → same string. Stepping from frame 1 to
+        // frame 2 at width 40 (mid-sweep) must yield a different string,
+        // which is what makes the animation visible.
+        let a = super::footer_working_strip_string(40, 1);
+        let b = super::footer_working_strip_string(40, 1);
+        assert_eq!(a, b, "deterministic given the same frame");
+        let c = super::footer_working_strip_string(40, 2);
+        assert_ne!(a, c, "advancing the frame must change the strip");
+    }
+
+    #[test]
+    fn working_strip_renders_glyphs_only_when_frame_is_some() {
+        // Render the same props twice into an 80x1 buffer: as built by
+        // `idle_props_for`, then with `working_strip_frame = Some(13)`. The
+        // active row must differ and contain a strip glyph (`╿`, `╶` or `─`).
+        let app = make_app();
+        let mut props = idle_props_for(&app);
+
+        let area = ratatui::layout::Rect::new(0, 0, 80, 1);
+        let mut buf = ratatui::buffer::Buffer::empty(area);
+        FooterWidget::new(props.clone()).render(area, &mut buf);
+        let idle: String = (0..area.width).map(|x| buf[(x, 0)].symbol()).collect();
+
+        props.working_strip_frame = Some(13);
+        let mut buf2 = ratatui::buffer::Buffer::empty(area);
+        FooterWidget::new(props).render(area, &mut buf2);
+        let active: String = (0..area.width).map(|x| buf2[(x, 0)].symbol()).collect();
+
+        assert_ne!(
+            idle, active,
+            "active footer must visibly differ from idle one"
+        );
+        assert!(
+            active.contains('\u{257F}')
+                || active.contains('\u{2576}')
+                || active.contains('\u{2500}'),
+            "active strip must contain at least one animation glyph: {active:?}",
+        );
+    }
+
+    #[test]
+    fn working_strip_spout_position_advances_with_frame() {
+        // Frames 1 and 2 must put the spout in different columns, i.e. it drifts.
+        let spout_column = |frame| {
+            let strip = super::footer_working_strip_string(16, frame);
+            strip.chars().position(|glyph| glyph == '\u{257F}')
+        };
+        let first = spout_column(1).expect("frame 1 has a spout");
+        let second = spout_column(2).expect("frame 2 has a spout");
+        assert_ne!(first, second, "spout column must advance between frames");
+    }
+
     #[test]
     fn render_swaps_toast_for_status_line() {
         let app = make_app();
diff --git a/crates/tui/src/tui/widgets/mod.rs b/crates/tui/src/tui/widgets/mod.rs
index 486ca7f1..825fc568 100644
--- a/crates/tui/src/tui/widgets/mod.rs
+++ b/crates/tui/src/tui/widgets/mod.rs
@@ -1526,6 +1526,7 @@ mod tests {
             status: ToolStatus::Success,
             input_summary: Some("items: <2 items>".to_string()),
             output: Some("hello world ".repeat(420)),
+            prompts: None,
         }));
         for width in [40u16, 80, 111, 165] {
             let lines = cell.lines(width);
@@ -1568,6 +1569,7 @@ mod tests {
                 status: ToolStatus::Success,
                 input_summary: Some("todos: <1 items>".to_string()),
                 output: Some(output),
+                prompts: None,
             })));
 
             let height: u16 = 30;