From 16f62f7abf416946c5cea136edbd156e2f4836e1 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Fri, 24 Apr 2026 18:42:18 -0500 Subject: [PATCH] Fix reasoning replay and context accounting for NIM --- README.md | 8 +-- crates/config/src/lib.rs | 22 +++++- crates/tui/src/client.rs | 127 ++++++++++++++++++++++++++------- crates/tui/src/compaction.rs | 125 ++++++++++++++++++++++++++++++-- crates/tui/src/tui/ui.rs | 15 ++++ crates/tui/src/tui/ui/tests.rs | 27 ++++++- docs/CONFIGURATION.md | 6 +- npm/deepseek-tui/README.md | 6 +- 8 files changed, 296 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 30ff8cb8..490024a8 100644 --- a/README.md +++ b/README.md @@ -118,10 +118,10 @@ Controls: `F1` help, `Esc` backs out of the current action, `Ctrl+K` command pal Key environment overrides: `DEEPSEEK_API_KEY`, `DEEPSEEK_BASE_URL`, `DEEPSEEK_MODEL`, `DEEPSEEK_PROFILE`, `DEEPSEEK_PROVIDER`. For NVIDIA NIM, use `DEEPSEEK_PROVIDER=nvidia-nim` plus `NVIDIA_API_KEY` -or `NVIDIA_NIM_API_KEY`; the default model is `deepseek-ai/deepseek-v4-pro` -and the default base URL is `https://integrate.api.nvidia.com/v1`. With -`--provider nvidia-nim`, `--model deepseek-v4-flash` maps to -`deepseek-ai/deepseek-v4-flash`. +or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback); +the default model is `deepseek-ai/deepseek-v4-pro` and the default base URL is +`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`, +`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`. The client targets DeepSeek's documented OpenAI-compatible Chat Completions API (`/chat/completions`). 
DeepSeek context caching is automatic; when the API diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs index 985a2dd1..1e47c770 100644 --- a/crates/config/src/lib.rs +++ b/crates/config/src/lib.rs @@ -580,7 +580,10 @@ impl EnvRuntimeOverrides { fn api_key_for(&self, provider: ProviderKind) -> Option<String> { match provider { ProviderKind::Deepseek => self.deepseek_api_key.clone(), - ProviderKind::NvidiaNim => self.nvidia_api_key.clone(), + ProviderKind::NvidiaNim => self + .nvidia_api_key + .clone() + .or_else(|| self.deepseek_api_key.clone()), ProviderKind::Openai => self.openai_api_key.clone(), } } @@ -780,6 +783,23 @@ mod tests { assert_eq!(resolved.model, DEFAULT_NVIDIA_NIM_MODEL); } + #[test] + fn nvidia_nim_provider_can_fallback_to_deepseek_api_key_env() { + let _lock = env_lock(); + let _env = EnvGuard::without_deepseek_runtime_overrides(); + // Safety: test-only environment mutation guarded by a module mutex. + unsafe { + env::set_var("DEEPSEEK_PROVIDER", "nvidia-nim"); + env::set_var("DEEPSEEK_API_KEY", "deepseek-compat-key"); + } + + let config = ConfigToml::default(); + let resolved = config.resolve_runtime_options(&CliRuntimeOverrides::default()); + + assert_eq!(resolved.provider, ProviderKind::NvidiaNim); + assert_eq!(resolved.api_key.as_deref(), Some("deepseek-compat-key")); + } + #[test] fn list_values_redacts_root_api_key() { let config = ConfigToml { diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index bb98422f..d273aa06 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -1351,6 +1351,7 @@ fn build_chat_messages_with_reasoning( ) -> Vec<Value> { let mut out = Vec::new(); let mut pending_tool_calls: HashSet<String> = HashSet::new(); + let current_turn_start = messages.iter().rposition(is_text_user_message); if let Some(instructions) = system_to_instructions(system.cloned()) && !instructions.trim().is_empty() @@ -1361,7 +1362,7 @@ fn build_chat_messages_with_reasoning( })); } - for message in messages { + for 
(message_index, message) in messages.iter().enumerate() { let role = message.role.as_str(); let mut text_parts = Vec::new(); let mut thinking_parts = Vec::new(); @@ -1423,16 +1424,22 @@ fn build_chat_messages_with_reasoning( let reasoning_content = thinking_parts.join("\n"); let has_text = !content.trim().is_empty(); let mut has_tool_calls = !tool_calls.is_empty(); - let include_reasoning_for_turn = include_reasoning && has_tool_calls; + let include_reasoning_for_turn = include_reasoning + && has_tool_calls + && current_turn_start.is_some_and(|start| message_index > start) + && !has_later_assistant_text(messages, message_index); let has_reasoning = include_reasoning_for_turn && !reasoning_content.trim().is_empty(); // DeepSeek thinking-mode tool turns are stateful within the // stateless Chat Completions transcript: if an assistant performed - // a tool call, its `reasoning_content` must be replayed in every - // later request. Older checkpoints could lose that field because - // the UI display stream had no visible text block. Do not forward - // those malformed tool calls; dropping the stale tool round is - // better than guaranteeing a provider-side 400. + // a tool call in the current user turn, its `reasoning_content` + // must be replayed while continuing that tool round. Once a new + // user text turn starts, DeepSeek recommends clearing historical + // reasoning content so the context is not dominated by old CoT. + // Older checkpoints could lose the current-round field because the + // UI display stream had no visible text block. Do not forward those + // malformed current tool calls; dropping that round is better than + // guaranteeing a provider-side 400. 
if include_reasoning_for_turn && !has_reasoning { logging::warn( "Dropping DeepSeek tool_calls with missing reasoning_content from assistant message", @@ -1611,6 +1618,33 @@ fn build_chat_messages_with_reasoning( out } +fn is_text_user_message(message: &Message) -> bool { + message.role == "user" + && message.content.iter().any(|block| { + matches!( + block, + ContentBlock::Text { text, .. } if !text.trim().is_empty() + ) + }) +} + +fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool { + messages + .iter() + .skip(message_index.saturating_add(1)) + .any(is_text_assistant_message) +} + +fn is_text_assistant_message(message: &Message) -> bool { + message.role == "assistant" + && message.content.iter().any(|block| { + matches!( + block, + ContentBlock::Text { text, .. } if !text.trim().is_empty() + ) + }) +} + fn tool_to_chat(tool: &Tool) -> Value { let mut value = json!({ "type": "function", @@ -2403,7 +2437,7 @@ mod tests { } #[test] - fn chat_messages_preserve_prior_tool_round_reasoning_after_new_user_turn() { + fn chat_messages_clear_prior_tool_round_reasoning_after_new_user_turn() { let messages = vec![ Message { role: "user".to_string(), @@ -2455,14 +2489,62 @@ mod tests { .iter() .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant")) .expect("assistant message"); - assert_eq!( - assistant.get("reasoning_content").and_then(Value::as_str), - Some("Need to call a tool") - ); + assert!(assistant.get("tool_calls").is_some()); + assert!(assistant.get("reasoning_content").is_none()); } #[test] - fn chat_messages_preserve_v4_tool_round_reasoning() { + fn chat_messages_clear_completed_tool_round_reasoning_after_final_answer() { + let messages = vec![ + Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Need the date".to_string(), + cache_control: None, + }], + }, + Message { + role: "assistant".to_string(), + content: vec![ + ContentBlock::Thinking { + thinking: "Need to call a 
tool".to_string(), + }, + ContentBlock::ToolUse { + id: "tool-1".to_string(), + name: "get_date".to_string(), + input: json!({}), + caller: None, + }, + ], + }, + Message { + role: "user".to_string(), + content: vec![ContentBlock::ToolResult { + tool_use_id: "tool-1".to_string(), + content: "2026-04-23".to_string(), + is_error: None, + content_blocks: None, + }], + }, + Message { + role: "assistant".to_string(), + content: vec![ContentBlock::Text { + text: "It is 2026-04-23.".to_string(), + cache_control: None, + }], + }, + ]; + let out = build_chat_messages(None, &messages, "deepseek-v4-pro"); + let assistant = out + .iter() + .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant")) + .expect("assistant message"); + assert!(assistant.get("tool_calls").is_some()); + assert!(assistant.get("reasoning_content").is_none()); + } + + #[test] + fn chat_messages_clear_v4_tool_round_reasoning_after_new_user_turn() { let messages = vec![ Message { role: "user".to_string(), @@ -2515,16 +2597,20 @@ mod tests { .iter() .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant")) .expect("assistant message"); - assert_eq!( - assistant.get("reasoning_content").and_then(Value::as_str), - Some("Need a tool for this") - ); assert!(assistant.get("tool_calls").is_some()); + assert!(assistant.get("reasoning_content").is_none()); } #[test] fn chat_messages_drop_v4_tool_round_missing_reasoning() { let messages = vec![ + Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Use a tool".to_string(), + cache_control: None, + }], + }, Message { role: "assistant".to_string(), content: vec![ContentBlock::ToolUse { @@ -2543,13 +2629,6 @@ mod tests { content_blocks: None, }], }, - Message { - role: "user".to_string(), - content: vec![ContentBlock::Text { - text: "continue".to_string(), - cache_control: None, - }], - }, ]; let out = build_chat_messages(None, &messages, "deepseek-v4-pro"); diff --git a/crates/tui/src/compaction.rs 
b/crates/tui/src/compaction.rs index e05116af..1a5e45b0 100644 --- a/crates/tui/src/compaction.rs +++ b/crates/tui/src/compaction.rs @@ -501,14 +501,15 @@ fn enforce_tool_call_pairs(messages: &[Message], pinned_indices: &mut BTreeSet usize { +fn estimate_tokens_for_message(message: &Message, include_thinking: bool) -> usize { message .content .iter() .map(|c| match c { ContentBlock::Text { text, .. } => text.len() / 4, // Historical reasoning blocks are UI/session metadata for DeepSeek. - // They are only sent back during an in-progress tool-call round. + // Only current-turn tool-call reasoning is sent back to the API. + ContentBlock::Thinking { thinking } if include_thinking => thinking.len() / 4, ContentBlock::Thinking { .. } => 0, ContentBlock::ToolUse { input, .. } => serde_json::to_string(input) .map(|s| s.len() / 4) @@ -523,7 +524,51 @@ fn estimate_tokens_for_message(message: &Message) -> usize { pub fn estimate_tokens(messages: &[Message]) -> usize { // Rough estimate: ~4 chars per token - messages.iter().map(estimate_tokens_for_message).sum() + let current_turn_start = messages.iter().rposition(is_text_user_message); + messages + .iter() + .enumerate() + .map(|(index, message)| { + let include_thinking = current_turn_start.is_some_and(|start| index > start) + && message_has_tool_use(message) + && !has_later_assistant_text(messages, index); + estimate_tokens_for_message(message, include_thinking) + }) + .sum() +} + +fn is_text_user_message(message: &Message) -> bool { + message.role == "user" + && message.content.iter().any(|block| { + matches!( + block, + ContentBlock::Text { text, .. } if !text.trim().is_empty() + ) + }) +} + +fn message_has_tool_use(message: &Message) -> bool { + message + .content + .iter() + .any(|block| matches!(block, ContentBlock::ToolUse { .. 
})) +} + +fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool { + messages + .iter() + .skip(message_index.saturating_add(1)) + .any(is_text_assistant_message) +} + +fn is_text_assistant_message(message: &Message) -> bool { + message.role == "assistant" + && message.content.iter().any(|block| { + matches!( + block, + ContentBlock::Text { text, .. } if !text.trim().is_empty() + ) + }) } fn estimate_text_tokens_conservative(text: &str) -> usize { @@ -576,14 +621,14 @@ pub fn should_compact( let pinned_tokens: usize = plan .pinned_indices .iter() - .map(|&idx| estimate_tokens_for_message(&messages[idx])) + .map(|&idx| estimate_tokens_for_message(&messages[idx], false)) .sum(); let pinned_count = plan.pinned_indices.len(); let token_estimate: usize = plan .summarize_indices .iter() - .map(|&idx| estimate_tokens_for_message(&messages[idx])) + .map(|&idx| estimate_tokens_for_message(&messages[idx], false)) .sum(); let message_count = plan.summarize_indices.len(); @@ -1112,6 +1157,76 @@ mod tests { assert!(tokens > 0 && tokens < 10); } + #[test] + fn estimate_tokens_counts_current_tool_round_thinking_only() { + let thinking = "reasoning ".repeat(800); + let current_messages = vec![ + Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Use a tool".to_string(), + cache_control: None, + }], + }, + Message { + role: "assistant".to_string(), + content: vec![ + ContentBlock::Thinking { + thinking: thinking.clone(), + }, + ContentBlock::ToolUse { + id: "tool-1".to_string(), + name: "read_file".to_string(), + input: serde_json::json!({"path": "Cargo.toml"}), + caller: None, + }, + ], + }, + Message { + role: "user".to_string(), + content: vec![ContentBlock::ToolResult { + tool_use_id: "tool-1".to_string(), + content: "manifest".to_string(), + is_error: None, + content_blocks: None, + }], + }, + ]; + let historical_messages = { + let mut messages = current_messages.clone(); + messages.push(Message { + role: 
"assistant".to_string(), + content: vec![ContentBlock::Text { + text: "Done.".to_string(), + cache_control: None, + }], + }); + messages.push(Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Next question.".to_string(), + cache_control: None, + }], + }); + messages + }; + let completed_messages = { + let mut messages = current_messages.clone(); + messages.push(Message { + role: "assistant".to_string(), + content: vec![ContentBlock::Text { + text: "Done.".to_string(), + cache_control: None, + }], + }); + messages + }; + + assert!(estimate_tokens(&current_messages) > thinking.len() / 5); + assert!(estimate_tokens(&completed_messages) < thinking.len() / 8); + assert!(estimate_tokens(&historical_messages) < thinking.len() / 8); + } + #[test] fn should_compact_respects_enabled_flag() { let config = CompactionConfig { diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index ce48579e..67055a56 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -3531,6 +3531,11 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> { { estimated } + (Some(reported), Some(estimated)) + if is_reported_context_inflated(reported, estimated) => + { + estimated + } (Some(reported), _) => reported, (None, Some(estimated)) => estimated, (None, None) => return None, @@ -3543,6 +3548,16 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> { Some((used, max, percent)) } +fn is_reported_context_inflated(reported: i64, estimated: i64) -> bool { + const MIN_ABSOLUTE_GAP: i64 = 4_096; + if estimated <= 0 || reported <= estimated { + return false; + } + + reported.saturating_sub(estimated) >= MIN_ABSOLUTE_GAP + && reported >= estimated.saturating_mul(4) +} + fn maybe_warn_context_pressure(app: &mut App) { let Some((used, max, percent)) = context_usage_snapshot(app) else { return; diff --git a/crates/tui/src/tui/ui/tests.rs b/crates/tui/src/tui/ui/tests.rs index f2320a0d..a696acbb 100644 --- 
a/crates/tui/src/tui/ui/tests.rs +++ b/crates/tui/src/tui/ui/tests.rs @@ -372,6 +372,25 @@ fn context_usage_snapshot_prefers_estimate_when_reported_exceeds_window() { assert!(percent < 100.0); } +#[test] +fn context_usage_snapshot_prefers_estimate_when_reported_is_inflated_by_old_reasoning() { + let mut app = create_test_app(); + app.last_prompt_tokens = Some(980_000); + app.api_messages = vec![Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "small current context".to_string(), + cache_control: None, + }], + }]; + + let (used, max, percent) = + context_usage_snapshot(&app).expect("context usage should be available"); + assert_eq!(max, 1_000_000); + assert!(used < 10_000); + assert!(percent < 2.0); +} + #[test] fn context_usage_snapshot_prefers_live_estimate_while_loading() { let mut app = create_test_app(); @@ -397,7 +416,13 @@ fn context_usage_snapshot_prefers_live_estimate_while_loading() { #[test] fn should_auto_compact_before_send_respects_threshold_and_setting() { let mut app = create_test_app(); - app.last_prompt_tokens = Some(950_000); + app.api_messages = vec![Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "context ".repeat(400_000), + cache_control: None, + }], + }]; app.auto_compact = true; assert!(should_auto_compact_before_send(&app)); diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 39120754..ed94f6fd 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -25,7 +25,9 @@ For NVIDIA NIM-hosted DeepSeek V4 Pro, set `provider = "nvidia-nim"` or pass `[providers.nvidia_nim]` and forwards the resolved key, base URL, provider, and model to the TUI process. Use `deepseek auth set --provider nvidia-nim --api-key "YOUR_NVIDIA_API_KEY"` to -save the NIM key through the facade. +save the NIM key through the facade. `DEEPSEEK_API_KEY` remains a compatibility +fallback when `DEEPSEEK_PROVIDER=nvidia-nim`, but `NVIDIA_API_KEY` or +`NVIDIA_NIM_API_KEY` is preferred. 
To bootstrap MCP and skills directories at their resolved paths, run `deepseek-tui setup`. To only scaffold MCP, run `deepseek-tui mcp init`. @@ -70,7 +72,7 @@ These override config values: - `DEEPSEEK_BASE_URL` - `DEEPSEEK_PROVIDER` (`deepseek|nvidia-nim`) - `DEEPSEEK_MODEL` or `DEEPSEEK_DEFAULT_TEXT_MODEL` -- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (used when provider is `nvidia-nim`) +- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (preferred when provider is `nvidia-nim`; falls back to `DEEPSEEK_API_KEY`) - `NVIDIA_BASE_URL` or `NVIDIA_NIM_BASE_URL` - `NVIDIA_NIM_MODEL` - `DEEPSEEK_SKILLS_DIR` diff --git a/npm/deepseek-tui/README.md b/npm/deepseek-tui/README.md index 13c71e96..d6ff56f3 100644 --- a/npm/deepseek-tui/README.md +++ b/npm/deepseek-tui/README.md @@ -45,9 +45,9 @@ deepseek --provider nvidia-nim ``` For a single process, set `DEEPSEEK_PROVIDER=nvidia-nim` and `NVIDIA_API_KEY` -or `NVIDIA_NIM_API_KEY`. The NIM default model is -`deepseek-ai/deepseek-v4-pro` and the default base URL is -`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`, +or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback). +The NIM default model is `deepseek-ai/deepseek-v4-pro` and the default base URL +is `https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`, `--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`. ## Supported platforms