From 16f62f7abf416946c5cea136edbd156e2f4836e1 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Fri, 24 Apr 2026 18:42:18 -0500
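Subject: [PATCH] Fix reasoning replay and context accounting for NIM

Replay `reasoning_content` only for tool calls made after the latest
user text message that have no later assistant text, and count thinking
blocks in the token estimate under the same rule. Prefer the local
estimate for context usage when the reported prompt size is at least 4x
the estimate and at least 4,096 tokens above it. Let the `nvidia-nim`
provider fall back to the `DEEPSEEK_API_KEY` environment key when no
NVIDIA key is set, and document the fallback.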
---
 README.md                      |   8 +--
 crates/config/src/lib.rs       |  22 +++++-
 crates/tui/src/client.rs       | 127 ++++++++++++++++++++++++++-------
 crates/tui/src/compaction.rs   | 125 ++++++++++++++++++++++++++++++--
 crates/tui/src/tui/ui.rs       |  15 ++++
 crates/tui/src/tui/ui/tests.rs |  27 ++++++-
 docs/CONFIGURATION.md          |   6 +-
 npm/deepseek-tui/README.md     |   6 +-
 8 files changed, 296 insertions(+), 40 deletions(-)
diff --git a/README.md b/README.md
index 30ff8cb8..490024a8 100644
--- a/README.md
+++ b/README.md
@@ -118,10 +118,10 @@ Controls: `F1` help, `Esc` backs out of the current action, `Ctrl+K` command pal
 Key environment overrides: `DEEPSEEK_API_KEY`, `DEEPSEEK_BASE_URL`,
 `DEEPSEEK_MODEL`, `DEEPSEEK_PROFILE`, `DEEPSEEK_PROVIDER`.
 For NVIDIA NIM, use `DEEPSEEK_PROVIDER=nvidia-nim` plus `NVIDIA_API_KEY`
-or `NVIDIA_NIM_API_KEY`; the default model is `deepseek-ai/deepseek-v4-pro`
-and the default base URL is `https://integrate.api.nvidia.com/v1`. With
-`--provider nvidia-nim`, `--model deepseek-v4-flash` maps to
-`deepseek-ai/deepseek-v4-flash`.
+or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback);
+the default model is `deepseek-ai/deepseek-v4-pro` and the default base URL is
+`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
+`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
 
 The client targets DeepSeek's documented OpenAI-compatible Chat Completions API
 (`/chat/completions`). DeepSeek context caching is automatic; when the API
diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs
index 985a2dd1..1e47c770 100644
--- a/crates/config/src/lib.rs
+++ b/crates/config/src/lib.rs
@@ -580,7 +580,10 @@ impl EnvRuntimeOverrides {
     fn api_key_for(&self, provider: ProviderKind) -> Option<String> {
         match provider {
             ProviderKind::Deepseek => self.deepseek_api_key.clone(),
-            ProviderKind::NvidiaNim => self.nvidia_api_key.clone(),
+            ProviderKind::NvidiaNim => self
+                .nvidia_api_key
+                .clone()
+                .or_else(|| self.deepseek_api_key.clone()),
             ProviderKind::Openai => self.openai_api_key.clone(),
         }
     }
@@ -780,6 +783,23 @@ mod tests {
         assert_eq!(resolved.model, DEFAULT_NVIDIA_NIM_MODEL);
     }
 
+    #[test]
+    fn nvidia_nim_provider_can_fallback_to_deepseek_api_key_env() {
+        let _lock = env_lock();
+        let _env = EnvGuard::without_deepseek_runtime_overrides();
+        // Safety: test-only environment mutation guarded by a module mutex.
+        unsafe {
+            env::set_var("DEEPSEEK_PROVIDER", "nvidia-nim");
+            env::set_var("DEEPSEEK_API_KEY", "deepseek-compat-key");
+        }
+        // This test sets only the DeepSeek key; it sets no NVIDIA key itself.
+        let config = ConfigToml::default();
+        let resolved = config.resolve_runtime_options(&CliRuntimeOverrides::default());
+        // The NIM provider must be selected and must report the DeepSeek key as its key.
+        assert_eq!(resolved.provider, ProviderKind::NvidiaNim);
+        assert_eq!(resolved.api_key.as_deref(), Some("deepseek-compat-key"));
+    }
+
     #[test]
     fn list_values_redacts_root_api_key() {
         let config = ConfigToml {
diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index bb98422f..d273aa06 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -1351,6 +1351,7 @@ fn build_chat_messages_with_reasoning(
 ) -> Vec<Value> {
     let mut out = Vec::new();
     let mut pending_tool_calls: HashSet<String> = HashSet::new();
+    let current_turn_start = messages.iter().rposition(is_text_user_message);
 
     if let Some(instructions) = system_to_instructions(system.cloned())
         && !instructions.trim().is_empty()
@@ -1361,7 +1362,7 @@ fn build_chat_messages_with_reasoning(
         }));
     }
 
-    for message in messages {
+    for (message_index, message) in messages.iter().enumerate() {
         let role = message.role.as_str();
         let mut text_parts = Vec::new();
         let mut thinking_parts = Vec::new();
@@ -1423,16 +1424,22 @@ fn build_chat_messages_with_reasoning(
             let reasoning_content = thinking_parts.join("\n");
             let has_text = !content.trim().is_empty();
             let mut has_tool_calls = !tool_calls.is_empty();
-            let include_reasoning_for_turn = include_reasoning && has_tool_calls;
+            let include_reasoning_for_turn = include_reasoning
+                && has_tool_calls
+                && current_turn_start.is_some_and(|start| message_index > start)
+                && !has_later_assistant_text(messages, message_index);
             let has_reasoning = include_reasoning_for_turn && !reasoning_content.trim().is_empty();
 
             // DeepSeek thinking-mode tool turns are stateful within the
             // stateless Chat Completions transcript: if an assistant performed
-            // a tool call, its `reasoning_content` must be replayed in every
-            // later request. Older checkpoints could lose that field because
-            // the UI display stream had no visible text block. Do not forward
-            // those malformed tool calls; dropping the stale tool round is
-            // better than guaranteeing a provider-side 400.
+            // a tool call in the current user turn, its `reasoning_content`
+            // must be replayed while continuing that tool round. Once a new
+            // user text turn starts, DeepSeek recommends clearing historical
+            // reasoning content so the context is not dominated by old CoT.
+            // Older checkpoints could lose the current-round field because the
+            // UI display stream had no visible text block. Do not forward those
+            // malformed current tool calls; dropping that round is better than
+            // guaranteeing a provider-side 400.
             if include_reasoning_for_turn && !has_reasoning {
                 logging::warn(
                     "Dropping DeepSeek tool_calls with missing reasoning_content from assistant message",
@@ -1611,6 +1618,33 @@ fn build_chat_messages_with_reasoning(
     out
 }
 
+/// Returns true for a `user` message holding at least one non-blank text block.
+///
+/// Messages whose blocks are all non-text (e.g. only `ToolResult`) yield false.
+fn is_text_user_message(message: &Message) -> bool {
+    let non_blank_text = |block: &ContentBlock| {
+        matches!(block, ContentBlock::Text { text, .. } if !text.trim().is_empty())
+    };
+    message.role == "user" && message.content.iter().any(non_blank_text)
+}
+
+/// Reports whether any message after `message_index` is an assistant text message.
+fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
+    // An out-of-range start produces `None`, which means "no later messages".
+    let later = messages.get(message_index.saturating_add(1)..);
+    later.is_some_and(|rest| rest.iter().any(is_text_assistant_message))
+}
+
+/// Returns true for an `assistant` message with at least one non-blank text block.
+fn is_text_assistant_message(message: &Message) -> bool {
+    if message.role != "assistant" {
+        return false;
+    }
+    message.content.iter().any(|block| {
+        matches!(block, ContentBlock::Text { text, .. } if !text.trim().is_empty())
+    })
+}
+
 fn tool_to_chat(tool: &Tool) -> Value {
     let mut value = json!({
         "type": "function",
@@ -2403,7 +2437,7 @@ mod tests {
     }
 
     #[test]
-    fn chat_messages_preserve_prior_tool_round_reasoning_after_new_user_turn() {
+    fn chat_messages_clear_prior_tool_round_reasoning_after_new_user_turn() {
         let messages = vec![
             Message {
                 role: "user".to_string(),
@@ -2455,14 +2489,62 @@ mod tests {
             .iter()
             .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
             .expect("assistant message");
-        assert_eq!(
-            assistant.get("reasoning_content").and_then(Value::as_str),
-            Some("Need to call a tool")
-        );
+        assert!(assistant.get("tool_calls").is_some());
+        assert!(assistant.get("reasoning_content").is_none());
     }
 
     #[test]
-    fn chat_messages_preserve_v4_tool_round_reasoning() {
+    fn chat_messages_clear_completed_tool_round_reasoning_after_final_answer() {
+        let messages = vec![
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Need the date".to_string(),
+                    cache_control: None,
+                }],
+            },
+            Message {
+                role: "assistant".to_string(),
+                content: vec![
+                    ContentBlock::Thinking {
+                        thinking: "Need to call a tool".to_string(),
+                    },
+                    ContentBlock::ToolUse {
+                        id: "tool-1".to_string(),
+                        name: "get_date".to_string(),
+                        input: json!({}),
+                        caller: None,
+                    },
+                ],
+            },
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::ToolResult {
+                    tool_use_id: "tool-1".to_string(),
+                    content: "2026-04-23".to_string(),
+                    is_error: None,
+                    content_blocks: None,
+                }],
+            },
+            Message {
+                role: "assistant".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "It is 2026-04-23.".to_string(),
+                    cache_control: None,
+                }],
+            },
+        ];
+        let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
+        let assistant = out
+            .iter()
+            .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant message");
+        assert!(assistant.get("tool_calls").is_some());
+        assert!(assistant.get("reasoning_content").is_none());
+    }
+
+    #[test]
+    fn chat_messages_clear_v4_tool_round_reasoning_after_new_user_turn() {
         let messages = vec![
             Message {
                 role: "user".to_string(),
@@ -2515,16 +2597,20 @@ mod tests {
             .iter()
             .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
             .expect("assistant message");
-        assert_eq!(
-            assistant.get("reasoning_content").and_then(Value::as_str),
-            Some("Need a tool for this")
-        );
         assert!(assistant.get("tool_calls").is_some());
+        assert!(assistant.get("reasoning_content").is_none());
     }
 
     #[test]
     fn chat_messages_drop_v4_tool_round_missing_reasoning() {
         let messages = vec![
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Use a tool".to_string(),
+                    cache_control: None,
+                }],
+            },
             Message {
                 role: "assistant".to_string(),
                 content: vec![ContentBlock::ToolUse {
@@ -2543,13 +2629,6 @@ mod tests {
                     content_blocks: None,
                 }],
             },
-            Message {
-                role: "user".to_string(),
-                content: vec![ContentBlock::Text {
-                    text: "continue".to_string(),
-                    cache_control: None,
-                }],
-            },
         ];
 
         let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs
index e05116af..1a5e45b0 100644
--- a/crates/tui/src/compaction.rs
+++ b/crates/tui/src/compaction.rs
@@ -501,14 +501,15 @@ fn enforce_tool_call_pairs(messages: &[Message], pinned_indices: &mut BTreeSet<u
     }
 }
 
-fn estimate_tokens_for_message(message: &Message) -> usize {
+fn estimate_tokens_for_message(message: &Message, include_thinking: bool) -> usize {
     message
         .content
         .iter()
         .map(|c| match c {
             ContentBlock::Text { text, .. } => text.len() / 4,
             // Historical reasoning blocks are UI/session metadata for DeepSeek.
-            // They are only sent back during an in-progress tool-call round.
+            // Only current-turn tool-call reasoning is sent back to the API.
+            ContentBlock::Thinking { thinking } if include_thinking => thinking.len() / 4,
             ContentBlock::Thinking { .. } => 0,
             ContentBlock::ToolUse { input, .. } => serde_json::to_string(input)
                 .map(|s| s.len() / 4)
@@ -523,7 +524,51 @@ fn estimate_tokens_for_message(message: &Message) -> usize {
 
 pub fn estimate_tokens(messages: &[Message]) -> usize {
     // Rough estimate: ~4 chars per token
-    messages.iter().map(estimate_tokens_for_message).sum()
+    let current_turn_start = messages.iter().rposition(is_text_user_message);
+    messages
+        .iter()
+        .enumerate()
+        .map(|(index, message)| {
+            let include_thinking = current_turn_start.is_some_and(|start| index > start)
+                && message_has_tool_use(message)
+                && !has_later_assistant_text(messages, index);
+            estimate_tokens_for_message(message, include_thinking)
+        })
+        .sum()
+}
+
+/// Returns true when `message` has role `user` and any text block that is not blank.
+fn is_text_user_message(message: &Message) -> bool {
+    // Whitespace-only text blocks are treated the same as no text at all.
+    let mut blocks = message.content.iter();
+    message.role == "user"
+        && blocks.any(|block| {
+            matches!(block, ContentBlock::Text { text, .. } if !text.trim().is_empty())
+        })
+}
+
+/// Reports whether `message` contains at least one `ToolUse` content block.
+fn message_has_tool_use(message: &Message) -> bool {
+    let is_tool_use = |block: &ContentBlock| matches!(block, ContentBlock::ToolUse { .. });
+    // `any` stops at the first match, so later blocks are not inspected.
+    message.content.iter().any(is_tool_use)
+}
+
+/// Returns true if an assistant message with non-blank text follows `message_index`.
+fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
+    let first_later = message_index.saturating_add(1);
+    let mut later_messages = messages.iter().skip(first_later);
+    later_messages.any(is_text_assistant_message)
+}
+
+/// Returns true when `message` is from the assistant and has non-blank text.
+fn is_text_assistant_message(message: &Message) -> bool {
+    let has_text = |block: &ContentBlock| {
+        matches!(block, ContentBlock::Text { text, .. } if !text.trim().is_empty())
+    };
+    // The role test runs first, so other roles never have their blocks scanned.
+    let from_assistant = message.role == "assistant";
+    from_assistant && message.content.iter().any(has_text)
 }
 
 fn estimate_text_tokens_conservative(text: &str) -> usize {
@@ -576,14 +621,14 @@ pub fn should_compact(
     let pinned_tokens: usize = plan
         .pinned_indices
         .iter()
-        .map(|&idx| estimate_tokens_for_message(&messages[idx]))
+        .map(|&idx| estimate_tokens_for_message(&messages[idx], false))
         .sum();
     let pinned_count = plan.pinned_indices.len();
 
     let token_estimate: usize = plan
         .summarize_indices
         .iter()
-        .map(|&idx| estimate_tokens_for_message(&messages[idx]))
+        .map(|&idx| estimate_tokens_for_message(&messages[idx], false))
         .sum();
     let message_count = plan.summarize_indices.len();
 
@@ -1112,6 +1157,76 @@ mod tests {
         assert!(tokens > 0 && tokens < 10);
     }
 
+    #[test]
+    fn estimate_tokens_counts_current_tool_round_thinking_only() {
+        // 8,000 bytes of reasoning text, far more than all other blocks combined.
+        let reasoning = "reasoning ".repeat(800);
+        // Tool round still open: the tool result is in, but no assistant answer yet.
+        let in_progress = vec![
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Use a tool".to_string(),
+                    cache_control: None,
+                }],
+            },
+            Message {
+                role: "assistant".to_string(),
+                content: vec![
+                    ContentBlock::Thinking {
+                        thinking: reasoning.clone(),
+                    },
+                    ContentBlock::ToolUse {
+                        id: "tool-1".to_string(),
+                        name: "read_file".to_string(),
+                        input: serde_json::json!({"path": "Cargo.toml"}),
+                        caller: None,
+                    },
+                ],
+            },
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::ToolResult {
+                    tool_use_id: "tool-1".to_string(),
+                    content: "manifest".to_string(),
+                    is_error: None,
+                    content_blocks: None,
+                }],
+            },
+        ];
+
+        // The same round once the assistant has given its final text answer.
+        let completed = {
+            let mut messages = in_progress.clone();
+            messages.push(Message {
+                role: "assistant".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Done.".to_string(),
+                    cache_control: None,
+                }],
+            });
+            messages
+        };
+        // The finished round followed by a new user text turn.
+        let historical = {
+            let mut messages = completed.clone();
+            messages.push(Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Next question.".to_string(),
+                    cache_control: None,
+                }],
+            });
+            messages
+        };
+
+        // Only the open round should pay for the reasoning text.
+        let reasoning_bytes = reasoning.len();
+        assert!(estimate_tokens(&in_progress) > reasoning_bytes / 5);
+        assert!(estimate_tokens(&completed) < reasoning_bytes / 8);
+        assert!(estimate_tokens(&historical) < reasoning_bytes / 8);
+    }
+
     #[test]
     fn should_compact_respects_enabled_flag() {
         let config = CompactionConfig {
diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs
index ce48579e..67055a56 100644
--- a/crates/tui/src/tui/ui.rs
+++ b/crates/tui/src/tui/ui.rs
@@ -3531,6 +3531,11 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
             {
                 estimated
             }
+            (Some(reported), Some(estimated))
+                if is_reported_context_inflated(reported, estimated) =>
+            {
+                estimated
+            }
             (Some(reported), _) => reported,
             (None, Some(estimated)) => estimated,
             (None, None) => return None,
@@ -3543,6 +3548,16 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
     Some((used, max, percent))
 }
 
+/// Returns true when `reported` is at least 4x `estimated` and at least 4,096 above it.
+fn is_reported_context_inflated(reported: i64, estimated: i64) -> bool {
+    const MIN_ABSOLUTE_GAP: i64 = 4_096;
+    // A non-positive estimate, or a report at or below it, never counts as inflated.
+    let comparable = estimated > 0 && reported > estimated;
+    let gap = reported.saturating_sub(estimated);
+
+    comparable && gap >= MIN_ABSOLUTE_GAP && reported >= estimated.saturating_mul(4)
+}
+
 fn maybe_warn_context_pressure(app: &mut App) {
     let Some((used, max, percent)) = context_usage_snapshot(app) else {
         return;
diff --git a/crates/tui/src/tui/ui/tests.rs b/crates/tui/src/tui/ui/tests.rs
index f2320a0d..a696acbb 100644
--- a/crates/tui/src/tui/ui/tests.rs
+++ b/crates/tui/src/tui/ui/tests.rs
@@ -372,6 +372,25 @@ fn context_usage_snapshot_prefers_estimate_when_reported_exceeds_window() {
     assert!(percent < 100.0);
 }
 
+#[test]
+fn context_usage_snapshot_prefers_estimate_when_reported_is_inflated_by_old_reasoning() {
+    let mut app = create_test_app();
+    // One short user message, paired with a reported prompt size near the window limit.
+    app.api_messages = vec![Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "small current context".to_string(),
+            cache_control: None,
+        }],
+    }];
+    app.last_prompt_tokens = Some(980_000);
+    let (used_tokens, window, usage_percent) =
+        context_usage_snapshot(&app).expect("context usage should be available");
+    assert_eq!(window, 1_000_000);
+    assert!(used_tokens < 10_000);
+    assert!(usage_percent < 2.0);
+}
+
 #[test]
 fn context_usage_snapshot_prefers_live_estimate_while_loading() {
     let mut app = create_test_app();
@@ -397,7 +416,13 @@ fn context_usage_snapshot_prefers_live_estimate_while_loading() {
 #[test]
 fn should_auto_compact_before_send_respects_threshold_and_setting() {
     let mut app = create_test_app();
-    app.last_prompt_tokens = Some(950_000);
+    app.api_messages = vec![Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "context ".repeat(400_000),
+            cache_control: None,
+        }],
+    }];
     app.auto_compact = true;
     assert!(should_auto_compact_before_send(&app));
 
diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md
index 39120754..ed94f6fd 100644
--- a/docs/CONFIGURATION.md
+++ b/docs/CONFIGURATION.md
@@ -25,7 +25,9 @@ For NVIDIA NIM-hosted DeepSeek V4 Pro, set `provider = "nvidia-nim"` or pass
 `[providers.nvidia_nim]` and forwards the resolved key, base URL, provider, and
 model to the TUI process. Use
 `deepseek auth set --provider nvidia-nim --api-key "YOUR_NVIDIA_API_KEY"` to
-save the NIM key through the facade.
+save the NIM key through the facade. `DEEPSEEK_API_KEY` remains a compatibility
+fallback when `DEEPSEEK_PROVIDER=nvidia-nim`, but `NVIDIA_API_KEY` or
+`NVIDIA_NIM_API_KEY` is preferred.
 
 To bootstrap MCP and skills directories at their resolved paths, run `deepseek-tui setup`.
 To only scaffold MCP, run `deepseek-tui mcp init`.
@@ -70,7 +72,7 @@ These override config values:
 - `DEEPSEEK_BASE_URL`
 - `DEEPSEEK_PROVIDER` (`deepseek|nvidia-nim`)
 - `DEEPSEEK_MODEL` or `DEEPSEEK_DEFAULT_TEXT_MODEL`
-- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (used when provider is `nvidia-nim`)
+- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (preferred when provider is `nvidia-nim`; falls back to `DEEPSEEK_API_KEY`)
 - `NVIDIA_BASE_URL` or `NVIDIA_NIM_BASE_URL`
 - `NVIDIA_NIM_MODEL`
 - `DEEPSEEK_SKILLS_DIR`
diff --git a/npm/deepseek-tui/README.md b/npm/deepseek-tui/README.md
index 13c71e96..d6ff56f3 100644
--- a/npm/deepseek-tui/README.md
+++ b/npm/deepseek-tui/README.md
@@ -45,9 +45,9 @@ deepseek --provider nvidia-nim
 ```
 
 For a single process, set `DEEPSEEK_PROVIDER=nvidia-nim` and `NVIDIA_API_KEY`
-or `NVIDIA_NIM_API_KEY`. The NIM default model is
-`deepseek-ai/deepseek-v4-pro` and the default base URL is
-`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
+or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback).
+The NIM default model is `deepseek-ai/deepseek-v4-pro` and the default base URL
+is `https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
 `--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
 
 ## Supported platforms