Fix reasoning replay and context accounting for NIM

2026-04-24 18:42:18 -05:00
parent d0dc26ce25
commit 16f62f7abf
8 changed files with 296 additions and 40 deletions
@@ -118,10 +118,10 @@ Controls: `F1` help, `Esc` backs out of the current action, `Ctrl+K` command pal
 Key environment overrides: `DEEPSEEK_API_KEY`, `DEEPSEEK_BASE_URL`,
 `DEEPSEEK_MODEL`, `DEEPSEEK_PROFILE`, `DEEPSEEK_PROVIDER`.
 For NVIDIA NIM, use `DEEPSEEK_PROVIDER=nvidia-nim` plus `NVIDIA_API_KEY`
-or `NVIDIA_NIM_API_KEY`; the default model is `deepseek-ai/deepseek-v4-pro`
-and the default base URL is `https://integrate.api.nvidia.com/v1`. With
-`--provider nvidia-nim`, `--model deepseek-v4-flash` maps to
-`deepseek-ai/deepseek-v4-flash`.
+or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback);
+the default model is `deepseek-ai/deepseek-v4-pro` and the default base URL is
+`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
+`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.

 The client targets DeepSeek's documented OpenAI-compatible Chat Completions API
 (`/chat/completions`). DeepSeek context caching is automatic; when the API
@@ -580,7 +580,10 @@ impl EnvRuntimeOverrides {
    fn api_key_for(&self, provider: ProviderKind) -> Option<String> {
        match provider {
            ProviderKind::Deepseek => self.deepseek_api_key.clone(),
-            ProviderKind::NvidiaNim => self.nvidia_api_key.clone(),
+            ProviderKind::NvidiaNim => {
+                // Prefer the NVIDIA-specific key; the DeepSeek key is only a
+                // compatibility fallback when no NVIDIA key was provided.
+                let nvidia_key = self.nvidia_api_key.clone();
+                nvidia_key.or_else(|| self.deepseek_api_key.clone())
+            }
            ProviderKind::Openai => self.openai_api_key.clone(),
        }
    }
@@ -780,6 +783,23 @@ mod tests {
        assert_eq!(resolved.model, DEFAULT_NVIDIA_NIM_MODEL);
    }

+    #[test]
+    fn nvidia_nim_provider_can_fallback_to_deepseek_api_key_env() {
+        // Only the provider and the DeepSeek key are set below; no NVIDIA key
+        // is set by this test, so resolution has to fall back to the DeepSeek key.
+        // NOTE(review): presumably the guard clears pre-existing NVIDIA_* keys
+        // from the ambient environment as well; confirm against EnvGuard.
+        let _lock = env_lock();
+        let _env = EnvGuard::without_deepseek_runtime_overrides();
+        let overrides = [
+            ("DEEPSEEK_PROVIDER", "nvidia-nim"),
+            ("DEEPSEEK_API_KEY", "deepseek-compat-key"),
+        ];
+        for (name, value) in overrides {
+            // SAFETY: test-only environment mutation guarded by a module mutex.
+            unsafe {
+                env::set_var(name, value);
+            }
+        }
+
+        let cli_overrides = CliRuntimeOverrides::default();
+        let config = ConfigToml::default();
+        let resolved = config.resolve_runtime_options(&cli_overrides);
+
+        assert_eq!(resolved.provider, ProviderKind::NvidiaNim);
+        assert_eq!(resolved.api_key.as_deref(), Some("deepseek-compat-key"));
+    }
+
    #[test]
    fn list_values_redacts_root_api_key() {
        let config = ConfigToml {
@@ -1351,6 +1351,7 @@ fn build_chat_messages_with_reasoning(
 ) -> Vec<Value> {
    let mut out = Vec::new();
    let mut pending_tool_calls: HashSet<String> = HashSet::new();
+    let current_turn_start = messages.iter().rposition(is_text_user_message);

    if let Some(instructions) = system_to_instructions(system.cloned())
        && !instructions.trim().is_empty()
@@ -1361,7 +1362,7 @@ fn build_chat_messages_with_reasoning(
        }));
    }

-    for message in messages {
+    for (message_index, message) in messages.iter().enumerate() {
        let role = message.role.as_str();
        let mut text_parts = Vec::new();
        let mut thinking_parts = Vec::new();
@@ -1423,16 +1424,22 @@ fn build_chat_messages_with_reasoning(
            let reasoning_content = thinking_parts.join("\n");
            let has_text = !content.trim().is_empty();
            let mut has_tool_calls = !tool_calls.is_empty();
-            let include_reasoning_for_turn = include_reasoning && has_tool_calls;
+            let include_reasoning_for_turn = include_reasoning
+                && has_tool_calls
+                && current_turn_start.is_some_and(|start| message_index > start)
+                && !has_later_assistant_text(messages, message_index);
            let has_reasoning = include_reasoning_for_turn && !reasoning_content.trim().is_empty();

            // DeepSeek thinking-mode tool turns are stateful within the
            // stateless Chat Completions transcript: if an assistant performed
-            // a tool call, its `reasoning_content` must be replayed in every
-            // later request. Older checkpoints could lose that field because
-            // the UI display stream had no visible text block. Do not forward
-            // those malformed tool calls; dropping the stale tool round is
-            // better than guaranteeing a provider-side 400.
+            // a tool call in the current user turn, its `reasoning_content`
+            // must be replayed while continuing that tool round. Once a new
+            // user text turn starts, DeepSeek recommends clearing historical
+            // reasoning content so the context is not dominated by old CoT.
+            // Older checkpoints could lose the current-round field because the
+            // UI display stream had no visible text block. Do not forward those
+            // malformed current tool calls; dropping that round is better than
+            // guaranteeing a provider-side 400.
            if include_reasoning_for_turn && !has_reasoning {
                logging::warn(
                    "Dropping DeepSeek tool_calls with missing reasoning_content from assistant message",
@@ -1611,6 +1618,33 @@ fn build_chat_messages_with_reasoning(
    out
 }

+/// Returns `true` when `message` carries at least one `Text` block whose
+/// text is non-empty after trimming whitespace.
+///
+/// Shared by the role-specific predicates below so both apply exactly the
+/// same definition of "has visible text".
+fn message_has_non_blank_text(message: &Message) -> bool {
+    message.content.iter().any(|block| {
+        matches!(
+            block,
+            ContentBlock::Text { text, .. } if !text.trim().is_empty()
+        )
+    })
+}
+
+/// Returns `true` for a `user` message that contains non-blank text.
+///
+/// A user message made up only of other block kinds (for example tool
+/// results) does not match.
+fn is_text_user_message(message: &Message) -> bool {
+    message.role == "user" && message_has_non_blank_text(message)
+}
+
+/// Returns `true` when any message strictly after `message_index` is an
+/// assistant message with non-blank text.
+///
+/// An index at or past the end of `messages` yields `false`.
+fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
+    messages
+        .iter()
+        .skip(message_index.saturating_add(1))
+        .any(is_text_assistant_message)
+}
+
+/// Returns `true` for an `assistant` message that contains non-blank text.
+fn is_text_assistant_message(message: &Message) -> bool {
+    message.role == "assistant" && message_has_non_blank_text(message)
+}
+
 fn tool_to_chat(tool: &Tool) -> Value {
    let mut value = json!({
        "type": "function",
@@ -2403,7 +2437,7 @@ mod tests {
    }

    #[test]
-    fn chat_messages_preserve_prior_tool_round_reasoning_after_new_user_turn() {
+    fn chat_messages_clear_prior_tool_round_reasoning_after_new_user_turn() {
        let messages = vec![
            Message {
                role: "user".to_string(),
@@ -2455,14 +2489,62 @@ mod tests {
            .iter()
            .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
            .expect("assistant message");
-        assert_eq!(
-            assistant.get("reasoning_content").and_then(Value::as_str),
-            Some("Need to call a tool")
-        );
+        assert!(assistant.get("tool_calls").is_some());
+        assert!(assistant.get("reasoning_content").is_none());
    }

    #[test]
-    fn chat_messages_preserve_v4_tool_round_reasoning() {
+    fn chat_messages_clear_completed_tool_round_reasoning_after_final_answer() {
+        // One finished tool round: user question, assistant tool call with
+        // reasoning, the tool result, then the assistant's final text answer.
+        let question = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "Need the date".to_string(),
+                cache_control: None,
+            }],
+        };
+        let tool_round = Message {
+            role: "assistant".to_string(),
+            content: vec![
+                ContentBlock::Thinking {
+                    thinking: "Need to call a tool".to_string(),
+                },
+                ContentBlock::ToolUse {
+                    id: "tool-1".to_string(),
+                    name: "get_date".to_string(),
+                    input: json!({}),
+                    caller: None,
+                },
+            ],
+        };
+        let tool_result = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::ToolResult {
+                tool_use_id: "tool-1".to_string(),
+                content: "2026-04-23".to_string(),
+                is_error: None,
+                content_blocks: None,
+            }],
+        };
+        let final_answer = Message {
+            role: "assistant".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "It is 2026-04-23.".to_string(),
+                cache_control: None,
+            }],
+        };
+        let messages = vec![question, tool_round, tool_result, final_answer];
+
+        let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
+
+        // The first assistant entry is the tool-call message: the call itself
+        // must survive, but its reasoning must not be replayed.
+        let first_assistant = out
+            .iter()
+            .find(|entry| entry.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant message");
+        assert!(first_assistant.get("tool_calls").is_some());
+        assert!(first_assistant.get("reasoning_content").is_none());
+    }
+
+    #[test]
+    fn chat_messages_clear_v4_tool_round_reasoning_after_new_user_turn() {
        let messages = vec![
            Message {
                role: "user".to_string(),
@@ -2515,16 +2597,20 @@ mod tests {
            .iter()
            .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
            .expect("assistant message");
-        assert_eq!(
-            assistant.get("reasoning_content").and_then(Value::as_str),
-            Some("Need a tool for this")
-        );
        assert!(assistant.get("tool_calls").is_some());
+        assert!(assistant.get("reasoning_content").is_none());
    }

    #[test]
    fn chat_messages_drop_v4_tool_round_missing_reasoning() {
        let messages = vec![
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::Text {
+                    text: "Use a tool".to_string(),
+                    cache_control: None,
+                }],
+            },
            Message {
                role: "assistant".to_string(),
                content: vec![ContentBlock::ToolUse {
@@ -2543,13 +2629,6 @@ mod tests {
                    content_blocks: None,
                }],
            },
-            Message {
-                role: "user".to_string(),
-                content: vec![ContentBlock::Text {
-                    text: "continue".to_string(),
-                    cache_control: None,
-                }],
-            },
        ];

        let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
@@ -501,14 +501,15 @@ fn enforce_tool_call_pairs(messages: &[Message], pinned_indices: &mut BTreeSet<u
    }
 }

-fn estimate_tokens_for_message(message: &Message) -> usize {
+fn estimate_tokens_for_message(message: &Message, include_thinking: bool) -> usize {
    message
        .content
        .iter()
        .map(|c| match c {
            ContentBlock::Text { text, .. } => text.len() / 4,
            // Historical reasoning blocks are UI/session metadata for DeepSeek.
-            // They are only sent back during an in-progress tool-call round.
+            // Only current-turn tool-call reasoning is sent back to the API.
+            ContentBlock::Thinking { thinking } if include_thinking => thinking.len() / 4,
            ContentBlock::Thinking { .. } => 0,
            ContentBlock::ToolUse { input, .. } => serde_json::to_string(input)
                .map(|s| s.len() / 4)
@@ -523,7 +524,51 @@ fn estimate_tokens_for_message(message: &Message) -> usize {

 pub fn estimate_tokens(messages: &[Message]) -> usize {
    // Rough estimate: ~4 chars per token
-    messages.iter().map(estimate_tokens_for_message).sum()
+    let current_turn_start = messages.iter().rposition(is_text_user_message);
+    messages
+        .iter()
+        .enumerate()
+        .map(|(index, message)| {
+            let include_thinking = current_turn_start.is_some_and(|start| index > start)
+                && message_has_tool_use(message)
+                && !has_later_assistant_text(messages, index);
+            estimate_tokens_for_message(message, include_thinking)
+        })
+        .sum()
+}
+
+/// Returns `true` for a `user` message holding at least one `Text` block
+/// whose text is non-empty after trimming; other block kinds never match.
+fn is_text_user_message(message: &Message) -> bool {
+    if message.role != "user" {
+        return false;
+    }
+    message.content.iter().any(|block| match block {
+        ContentBlock::Text { text, .. } => !text.trim().is_empty(),
+        _ => false,
+    })
+}
+
+/// Returns `true` when any content block of `message` is a `ToolUse`.
+fn message_has_tool_use(message: &Message) -> bool {
+    for block in &message.content {
+        if let ContentBlock::ToolUse { .. } = block {
+            return true;
+        }
+    }
+    false
+}
+
+/// Returns `true` when some message strictly after `message_index` is an
+/// assistant message with non-blank text. Indices at or beyond the end of
+/// `messages` yield `false`.
+fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
+    let first_later = message_index.saturating_add(1);
+    messages
+        .get(first_later..)
+        .is_some_and(|later| later.iter().any(is_text_assistant_message))
+}
+
+/// Returns `true` for an `assistant` message holding at least one `Text`
+/// block whose text is non-empty after trimming.
+fn is_text_assistant_message(message: &Message) -> bool {
+    let is_assistant = message.role == "assistant";
+    is_assistant
+        && message.content.iter().any(|block| match block {
+            ContentBlock::Text { text, .. } => !text.trim().is_empty(),
+            _ => false,
+        })
 }

 fn estimate_text_tokens_conservative(text: &str) -> usize {
@@ -576,14 +621,14 @@ pub fn should_compact(
    let pinned_tokens: usize = plan
        .pinned_indices
        .iter()
-        .map(|&idx| estimate_tokens_for_message(&messages[idx]))
+        .map(|&idx| estimate_tokens_for_message(&messages[idx], false))
        .sum();
    let pinned_count = plan.pinned_indices.len();

    let token_estimate: usize = plan
        .summarize_indices
        .iter()
-        .map(|&idx| estimate_tokens_for_message(&messages[idx]))
+        .map(|&idx| estimate_tokens_for_message(&messages[idx], false))
        .sum();
    let message_count = plan.summarize_indices.len();

@@ -1112,6 +1157,76 @@ mod tests {
        assert!(tokens > 0 && tokens < 10);
    }

+    #[test]
+    fn estimate_tokens_counts_current_tool_round_thinking_only() {
+        let thinking = "reasoning ".repeat(800);
+        let thinking_len = thinking.len();
+        // Builds a message with a single text block for the given role.
+        let text_message = |role: &str, text: &str| Message {
+            role: role.to_string(),
+            content: vec![ContentBlock::Text {
+                text: text.to_string(),
+                cache_control: None,
+            }],
+        };
+
+        // Tool round still in progress: the assistant's reasoning precedes a
+        // tool call and only the tool result has come back so far.
+        let current_messages = vec![
+            text_message("user", "Use a tool"),
+            Message {
+                role: "assistant".to_string(),
+                content: vec![
+                    ContentBlock::Thinking { thinking },
+                    ContentBlock::ToolUse {
+                        id: "tool-1".to_string(),
+                        name: "read_file".to_string(),
+                        input: serde_json::json!({"path": "Cargo.toml"}),
+                        caller: None,
+                    },
+                ],
+            },
+            Message {
+                role: "user".to_string(),
+                content: vec![ContentBlock::ToolResult {
+                    tool_use_id: "tool-1".to_string(),
+                    content: "manifest".to_string(),
+                    is_error: None,
+                    content_blocks: None,
+                }],
+            },
+        ];
+
+        // Same round after the assistant has produced its final text answer.
+        let mut completed_messages = current_messages.clone();
+        completed_messages.push(text_message("assistant", "Done."));
+
+        // Same round once a fresh user text turn has started after the answer.
+        let mut historical_messages = completed_messages.clone();
+        historical_messages.push(text_message("user", "Next question."));
+
+        // Reasoning is counted only while the tool round is still open.
+        assert!(estimate_tokens(&current_messages) > thinking_len / 5);
+        assert!(estimate_tokens(&completed_messages) < thinking_len / 8);
+        assert!(estimate_tokens(&historical_messages) < thinking_len / 8);
+    }
+
    #[test]
    fn should_compact_respects_enabled_flag() {
        let config = CompactionConfig {
@@ -3531,6 +3531,11 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
            {
                estimated
            }
+            (Some(reported), Some(estimated))
+                if is_reported_context_inflated(reported, estimated) =>
+            {
+                estimated
+            }
            (Some(reported), _) => reported,
            (None, Some(estimated)) => estimated,
            (None, None) => return None,
@@ -3543,6 +3548,16 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
    Some((used, max, percent))
 }

+/// Decides whether a provider-reported token count is implausibly larger
+/// than the locally estimated one.
+///
+/// Returns `true` only when `estimated` is positive, `reported` exceeds it by
+/// at least `MIN_ABSOLUTE_GAP` tokens, and `reported` is at least
+/// `MIN_RATIO` times `estimated`. Both checks use saturating arithmetic, so
+/// extreme inputs cannot overflow.
+fn is_reported_context_inflated(reported: i64, estimated: i64) -> bool {
+    const MIN_ABSOLUTE_GAP: i64 = 4_096;
+    const MIN_RATIO: i64 = 4;
+
+    // Without a positive estimate there is nothing to compare against, and a
+    // report at or below the estimate cannot be inflated.
+    let estimate_is_usable = estimated > 0;
+    let report_exceeds_estimate = reported > estimated;
+    if !(estimate_is_usable && report_exceeds_estimate) {
+        return false;
+    }
+
+    let gap = reported.saturating_sub(estimated);
+    let ratio_floor = estimated.saturating_mul(MIN_RATIO);
+    gap >= MIN_ABSOLUTE_GAP && reported >= ratio_floor
+}
+
 fn maybe_warn_context_pressure(app: &mut App) {
    let Some((used, max, percent)) = context_usage_snapshot(app) else {
        return;
@@ -372,6 +372,25 @@ fn context_usage_snapshot_prefers_estimate_when_reported_exceeds_window() {
    assert!(percent < 100.0);
 }

+#[test]
+fn context_usage_snapshot_prefers_estimate_when_reported_is_inflated_by_old_reasoning() {
+    // The reported prompt size is close to the full window while the only
+    // message in the transcript is a short user text.
+    let only_message = Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "small current context".to_string(),
+            cache_control: None,
+        }],
+    };
+    let mut app = create_test_app();
+    app.api_messages = vec![only_message];
+    app.last_prompt_tokens = Some(980_000);
+
+    let snapshot = context_usage_snapshot(&app);
+    let (used, max, percent) = snapshot.expect("context usage should be available");
+
+    // The snapshot should reflect the small estimate, not the reported count.
+    assert_eq!(max, 1_000_000);
+    assert!(used < 10_000);
+    assert!(percent < 2.0);
+}
+
 #[test]
 fn context_usage_snapshot_prefers_live_estimate_while_loading() {
    let mut app = create_test_app();
@@ -397,7 +416,13 @@ fn context_usage_snapshot_prefers_live_estimate_while_loading() {
 #[test]
 fn should_auto_compact_before_send_respects_threshold_and_setting() {
    let mut app = create_test_app();
-    app.last_prompt_tokens = Some(950_000);
+    app.api_messages = vec![Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "context ".repeat(400_000),
+            cache_control: None,
+        }],
+    }];
    app.auto_compact = true;
    assert!(should_auto_compact_before_send(&app));

@@ -25,7 +25,9 @@ For NVIDIA NIM-hosted DeepSeek V4 Pro, set `provider = "nvidia-nim"` or pass
 `[providers.nvidia_nim]` and forwards the resolved key, base URL, provider, and
 model to the TUI process. Use
 `deepseek auth set --provider nvidia-nim --api-key "YOUR_NVIDIA_API_KEY"` to
-save the NIM key through the facade.
+save the NIM key through the facade. `DEEPSEEK_API_KEY` remains a compatibility
+fallback when `DEEPSEEK_PROVIDER=nvidia-nim`, but `NVIDIA_API_KEY` or
+`NVIDIA_NIM_API_KEY` is preferred.

 To bootstrap MCP and skills directories at their resolved paths, run `deepseek-tui setup`.
 To only scaffold MCP, run `deepseek-tui mcp init`.
@@ -70,7 +72,7 @@ These override config values:
 - `DEEPSEEK_BASE_URL`
 - `DEEPSEEK_PROVIDER` (`deepseek|nvidia-nim`)
 - `DEEPSEEK_MODEL` or `DEEPSEEK_DEFAULT_TEXT_MODEL`
-- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (used when provider is `nvidia-nim`)
+- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (preferred when provider is `nvidia-nim`; falls back to `DEEPSEEK_API_KEY`)
 - `NVIDIA_BASE_URL` or `NVIDIA_NIM_BASE_URL`
 - `NVIDIA_NIM_MODEL`
 - `DEEPSEEK_SKILLS_DIR`
@@ -45,9 +45,9 @@ deepseek --provider nvidia-nim
 ```

 For a single process, set `DEEPSEEK_PROVIDER=nvidia-nim` and `NVIDIA_API_KEY`
-or `NVIDIA_NIM_API_KEY`. The NIM default model is
-`deepseek-ai/deepseek-v4-pro` and the default base URL is
-`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
+or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback).
+The NIM default model is `deepseek-ai/deepseek-v4-pro` and the default base URL
+is `https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
 `--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.

 ## Supported platforms