Fix reasoning replay and context accounting for NIM

This commit is contained in:
Hunter Bown
2026-04-24 18:42:18 -05:00
parent d0dc26ce25
commit 16f62f7abf
8 changed files with 296 additions and 40 deletions
+4 -4
View File
@@ -118,10 +118,10 @@ Controls: `F1` help, `Esc` backs out of the current action, `Ctrl+K` command pal
Key environment overrides: `DEEPSEEK_API_KEY`, `DEEPSEEK_BASE_URL`,
`DEEPSEEK_MODEL`, `DEEPSEEK_PROFILE`, `DEEPSEEK_PROVIDER`.
For NVIDIA NIM, use `DEEPSEEK_PROVIDER=nvidia-nim` plus `NVIDIA_API_KEY`
or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback);
the default model is `deepseek-ai/deepseek-v4-pro` and the default base URL is
`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
The client targets DeepSeek's documented OpenAI-compatible Chat Completions API
(`/chat/completions`). DeepSeek context caching is automatic; when the API
+21 -1
View File
@@ -580,7 +580,10 @@ impl EnvRuntimeOverrides {
/// Returns the API key these overrides hold for `provider`, if any.
///
/// For `ProviderKind::NvidiaNim` the NVIDIA key is preferred; when it is
/// absent the DeepSeek key is used as a compatibility fallback. The other
/// providers only ever use their own key.
fn api_key_for(&self, provider: ProviderKind) -> Option<String> {
    match provider {
        ProviderKind::Deepseek => self.deepseek_api_key.clone(),
        // A single NvidiaNim arm: a second, earlier arm without the fallback
        // would shadow this one and make the fallback unreachable.
        ProviderKind::NvidiaNim => self
            .nvidia_api_key
            .clone()
            .or_else(|| self.deepseek_api_key.clone()),
        ProviderKind::Openai => self.openai_api_key.clone(),
    }
}
@@ -780,6 +783,23 @@ mod tests {
assert_eq!(resolved.model, DEFAULT_NVIDIA_NIM_MODEL);
}
/// Checks that choosing the `nvidia-nim` provider through `DEEPSEEK_PROVIDER`
/// resolves the API key from `DEEPSEEK_API_KEY` when that is the key set here.
#[test]
fn nvidia_nim_provider_can_fallback_to_deepseek_api_key_env() {
// Hold the module-wide environment lock for the whole test so other
// environment-mutating tests cannot interleave with this one.
let _lock = env_lock();
// NOTE(review): presumably removes DeepSeek/NVIDIA runtime override variables
// for the guard's lifetime and restores them on drop — confirm against EnvGuard.
let _env = EnvGuard::without_deepseek_runtime_overrides();
// Safety: test-only environment mutation guarded by a module mutex.
unsafe {
env::set_var("DEEPSEEK_PROVIDER", "nvidia-nim");
env::set_var("DEEPSEEK_API_KEY", "deepseek-compat-key");
}
// Resolve with an empty config and no CLI overrides so only the environment
// variables set above can influence the result.
let config = ConfigToml::default();
let resolved = config.resolve_runtime_options(&CliRuntimeOverrides::default());
assert_eq!(resolved.provider, ProviderKind::NvidiaNim);
// The DeepSeek key must be picked up even though the provider is NVIDIA NIM.
assert_eq!(resolved.api_key.as_deref(), Some("deepseek-compat-key"));
}
#[test]
fn list_values_redacts_root_api_key() {
let config = ConfigToml {
+103 -24
View File
@@ -1351,6 +1351,7 @@ fn build_chat_messages_with_reasoning(
) -> Vec<Value> {
let mut out = Vec::new();
let mut pending_tool_calls: HashSet<String> = HashSet::new();
let current_turn_start = messages.iter().rposition(is_text_user_message);
if let Some(instructions) = system_to_instructions(system.cloned())
&& !instructions.trim().is_empty()
@@ -1361,7 +1362,7 @@ fn build_chat_messages_with_reasoning(
}));
}
for message in messages {
for (message_index, message) in messages.iter().enumerate() {
let role = message.role.as_str();
let mut text_parts = Vec::new();
let mut thinking_parts = Vec::new();
@@ -1423,16 +1424,22 @@ fn build_chat_messages_with_reasoning(
let reasoning_content = thinking_parts.join("\n");
let has_text = !content.trim().is_empty();
let mut has_tool_calls = !tool_calls.is_empty();
let include_reasoning_for_turn = include_reasoning && has_tool_calls;
let include_reasoning_for_turn = include_reasoning
&& has_tool_calls
&& current_turn_start.is_some_and(|start| message_index > start)
&& !has_later_assistant_text(messages, message_index);
let has_reasoning = include_reasoning_for_turn && !reasoning_content.trim().is_empty();
// DeepSeek thinking-mode tool turns are stateful within the
// stateless Chat Completions transcript: if an assistant performed
// a tool call, its `reasoning_content` must be replayed in every
// later request. Older checkpoints could lose that field because
// the UI display stream had no visible text block. Do not forward
// those malformed tool calls; dropping the stale tool round is
// better than guaranteeing a provider-side 400.
// a tool call in the current user turn, its `reasoning_content`
// must be replayed while continuing that tool round. Once a new
// user text turn starts, DeepSeek recommends clearing historical
// reasoning content so the context is not dominated by old CoT.
// Older checkpoints could lose the current-round field because the
// UI display stream had no visible text block. Do not forward those
// malformed current tool calls; dropping that round is better than
// guaranteeing a provider-side 400.
if include_reasoning_for_turn && !has_reasoning {
logging::warn(
"Dropping DeepSeek tool_calls with missing reasoning_content from assistant message",
@@ -1611,6 +1618,33 @@ fn build_chat_messages_with_reasoning(
out
}
/// Reports whether `message` is a user message that carries at least one
/// text block with non-whitespace content.
///
/// User messages made up only of other block kinds (for example tool results)
/// or of blank text do not count.
fn is_text_user_message(message: &Message) -> bool {
    if message.role != "user" {
        return false;
    }
    message.content.iter().any(|block| match block {
        ContentBlock::Text { text, .. } => !text.trim().is_empty(),
        _ => false,
    })
}
/// Reports whether any message positioned after `message_index` is an
/// assistant message with non-blank text.
///
/// An index at or beyond the end of `messages` simply yields `false`.
fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
    let first_later = message_index.saturating_add(1);
    // An out-of-range start yields `None`, which is treated as an empty tail.
    let later_messages = messages.get(first_later..).unwrap_or_default();
    later_messages.iter().any(is_text_assistant_message)
}
/// Reports whether `message` is an assistant message that carries at least
/// one text block with non-whitespace content.
fn is_text_assistant_message(message: &Message) -> bool {
    // Project the content down to the text payloads only; other block kinds
    // never make a message count as "text".
    let mut texts = message.content.iter().filter_map(|block| match block {
        ContentBlock::Text { text, .. } => Some(text.as_str()),
        _ => None,
    });
    message.role == "assistant" && texts.any(|text| !text.trim().is_empty())
}
fn tool_to_chat(tool: &Tool) -> Value {
let mut value = json!({
"type": "function",
@@ -2403,7 +2437,7 @@ mod tests {
}
#[test]
fn chat_messages_preserve_prior_tool_round_reasoning_after_new_user_turn() {
fn chat_messages_clear_prior_tool_round_reasoning_after_new_user_turn() {
let messages = vec![
Message {
role: "user".to_string(),
@@ -2455,14 +2489,62 @@ mod tests {
.iter()
.find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
.expect("assistant message");
assert_eq!(
assistant.get("reasoning_content").and_then(Value::as_str),
Some("Need to call a tool")
);
assert!(assistant.get("tool_calls").is_some());
assert!(assistant.get("reasoning_content").is_none());
}
#[test]
fn chat_messages_preserve_v4_tool_round_reasoning() {
fn chat_messages_clear_completed_tool_round_reasoning_after_final_answer() {
    // A full tool round: user prompt, assistant tool call with thinking,
    // tool result, and finally the assistant's visible answer.
    let user_prompt = Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: "Need the date".to_string(),
            cache_control: None,
        }],
    };
    let tool_call_turn = Message {
        role: "assistant".to_string(),
        content: vec![
            ContentBlock::Thinking {
                thinking: "Need to call a tool".to_string(),
            },
            ContentBlock::ToolUse {
                id: "tool-1".to_string(),
                name: "get_date".to_string(),
                input: json!({}),
                caller: None,
            },
        ],
    };
    let tool_result_turn = Message {
        role: "user".to_string(),
        content: vec![ContentBlock::ToolResult {
            tool_use_id: "tool-1".to_string(),
            content: "2026-04-23".to_string(),
            is_error: None,
            content_blocks: None,
        }],
    };
    let final_answer = Message {
        role: "assistant".to_string(),
        content: vec![ContentBlock::Text {
            text: "It is 2026-04-23.".to_string(),
            cache_control: None,
        }],
    };
    let messages = vec![user_prompt, tool_call_turn, tool_result_turn, final_answer];

    let out = build_chat_messages(None, &messages, "deepseek-v4-pro");

    // The first assistant entry is the tool-call turn: its tool calls must
    // survive, but its reasoning must not be replayed once the round ended
    // with a final assistant answer.
    let assistant = out
        .iter()
        .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
        .expect("assistant message");
    assert!(assistant.get("tool_calls").is_some());
    assert!(assistant.get("reasoning_content").is_none());
}
#[test]
fn chat_messages_clear_v4_tool_round_reasoning_after_new_user_turn() {
let messages = vec![
Message {
role: "user".to_string(),
@@ -2515,16 +2597,20 @@ mod tests {
.iter()
.find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
.expect("assistant message");
assert_eq!(
assistant.get("reasoning_content").and_then(Value::as_str),
Some("Need a tool for this")
);
assert!(assistant.get("tool_calls").is_some());
assert!(assistant.get("reasoning_content").is_none());
}
#[test]
fn chat_messages_drop_v4_tool_round_missing_reasoning() {
let messages = vec![
Message {
role: "user".to_string(),
content: vec![ContentBlock::Text {
text: "Use a tool".to_string(),
cache_control: None,
}],
},
Message {
role: "assistant".to_string(),
content: vec![ContentBlock::ToolUse {
@@ -2543,13 +2629,6 @@ mod tests {
content_blocks: None,
}],
},
Message {
role: "user".to_string(),
content: vec![ContentBlock::Text {
text: "continue".to_string(),
cache_control: None,
}],
},
];
let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
+120 -5
View File
@@ -501,14 +501,15 @@ fn enforce_tool_call_pairs(messages: &[Message], pinned_indices: &mut BTreeSet<u
}
}
fn estimate_tokens_for_message(message: &Message) -> usize {
fn estimate_tokens_for_message(message: &Message, include_thinking: bool) -> usize {
message
.content
.iter()
.map(|c| match c {
ContentBlock::Text { text, .. } => text.len() / 4,
// Historical reasoning blocks are UI/session metadata for DeepSeek.
// They are only sent back during an in-progress tool-call round.
// Only current-turn tool-call reasoning is sent back to the API.
ContentBlock::Thinking { thinking } if include_thinking => thinking.len() / 4,
ContentBlock::Thinking { .. } => 0,
ContentBlock::ToolUse { input, .. } => serde_json::to_string(input)
.map(|s| s.len() / 4)
@@ -523,7 +524,51 @@ fn estimate_tokens_for_message(message: &Message) -> usize {
/// Roughly estimates the token count of `messages` (~4 characters per token).
///
/// Thinking blocks are counted only for messages that
/// - come after the most recent user message containing non-blank text,
/// - contain a tool-use block, and
/// - are not followed by any assistant message with non-blank text.
///
/// Every other thinking block contributes zero to the estimate.
pub fn estimate_tokens(messages: &[Message]) -> usize {
    // Rough estimate: ~4 chars per token
    let current_turn_start = messages.iter().rposition(is_text_user_message);
    messages
        .iter()
        .enumerate()
        .map(|(index, message)| {
            // Cheap checks first; the forward scan for later assistant text
            // only runs for tool-call messages in the current turn.
            let include_thinking = current_turn_start.is_some_and(|start| index > start)
                && message_has_tool_use(message)
                && !has_later_assistant_text(messages, index);
            estimate_tokens_for_message(message, include_thinking)
        })
        .sum()
}
fn is_text_user_message(message: &Message) -> bool {
message.role == "user"
&& message.content.iter().any(|block| {
matches!(
block,
ContentBlock::Text { text, .. } if !text.trim().is_empty()
)
})
}
/// Reports whether `message` contains at least one tool-use block.
fn message_has_tool_use(message: &Message) -> bool {
    for block in message.content.iter() {
        if matches!(block, ContentBlock::ToolUse { .. }) {
            return true;
        }
    }
    false
}
/// Reports whether an assistant message with non-blank text appears anywhere
/// after position `message_index` in `messages`.
fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
    let first_later = message_index.saturating_add(1);
    match messages.get(first_later..) {
        Some(later_messages) => later_messages.iter().any(is_text_assistant_message),
        // Start is past the end of the slice: there is nothing later.
        None => false,
    }
}
/// Reports whether `message` has the `assistant` role and holds at least one
/// text block whose content is not blank.
fn is_text_assistant_message(message: &Message) -> bool {
    if message.role != "assistant" {
        return false;
    }
    message.content.iter().any(|block| {
        matches!(
            block,
            ContentBlock::Text { text, .. } if !text.trim().is_empty()
        )
    })
}
fn estimate_text_tokens_conservative(text: &str) -> usize {
@@ -576,14 +621,14 @@ pub fn should_compact(
let pinned_tokens: usize = plan
.pinned_indices
.iter()
.map(|&idx| estimate_tokens_for_message(&messages[idx]))
.map(|&idx| estimate_tokens_for_message(&messages[idx], false))
.sum();
let pinned_count = plan.pinned_indices.len();
let token_estimate: usize = plan
.summarize_indices
.iter()
.map(|&idx| estimate_tokens_for_message(&messages[idx]))
.map(|&idx| estimate_tokens_for_message(&messages[idx], false))
.sum();
let message_count = plan.summarize_indices.len();
@@ -1112,6 +1157,76 @@ mod tests {
assert!(tokens > 0 && tokens < 10);
}
/// Thinking text must count toward the estimate only while its tool round is
/// still in progress, not once the round has a final answer or a new user
/// turn has started.
#[test]
fn estimate_tokens_counts_current_tool_round_thinking_only() {
    let thinking = "reasoning ".repeat(800);

    // Small factory for single-text-block messages used several times below.
    let text_message = |role: &str, text: &str| Message {
        role: role.to_string(),
        content: vec![ContentBlock::Text {
            text: text.to_string(),
            cache_control: None,
        }],
    };

    // In-progress tool round: prompt, tool call with thinking, tool result.
    let current_messages = vec![
        text_message("user", "Use a tool"),
        Message {
            role: "assistant".to_string(),
            content: vec![
                ContentBlock::Thinking {
                    thinking: thinking.clone(),
                },
                ContentBlock::ToolUse {
                    id: "tool-1".to_string(),
                    name: "read_file".to_string(),
                    input: serde_json::json!({"path": "Cargo.toml"}),
                    caller: None,
                },
            ],
        },
        Message {
            role: "user".to_string(),
            content: vec![ContentBlock::ToolResult {
                tool_use_id: "tool-1".to_string(),
                content: "manifest".to_string(),
                is_error: None,
                content_blocks: None,
            }],
        },
    ];

    // Same round, now closed by a visible assistant answer.
    let mut completed_messages = current_messages.clone();
    completed_messages.push(text_message("assistant", "Done."));

    // Closed round followed by a fresh user turn.
    let mut historical_messages = completed_messages.clone();
    historical_messages.push(text_message("user", "Next question."));

    let thinking_len = thinking.len();
    assert!(estimate_tokens(&current_messages) > thinking_len / 5);
    assert!(estimate_tokens(&completed_messages) < thinking_len / 8);
    assert!(estimate_tokens(&historical_messages) < thinking_len / 8);
}
#[test]
fn should_compact_respects_enabled_flag() {
let config = CompactionConfig {
+15
View File
@@ -3531,6 +3531,11 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
{
estimated
}
(Some(reported), Some(estimated))
if is_reported_context_inflated(reported, estimated) =>
{
estimated
}
(Some(reported), _) => reported,
(None, Some(estimated)) => estimated,
(None, None) => return None,
@@ -3543,6 +3548,16 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
Some((used, max, percent))
}
/// Decides whether a provider-reported token count looks implausibly large
/// next to the locally estimated one.
///
/// Returns `true` only when `estimated` is positive, `reported` exceeds it by
/// at least 4,096 tokens, and `reported` is at least four times `estimated`.
fn is_reported_context_inflated(reported: i64, estimated: i64) -> bool {
    const MIN_ABSOLUTE_GAP: i64 = 4_096;

    // Without a usable estimate, or when the report is not larger than the
    // estimate, there is nothing to call inflated.
    let comparable = estimated > 0 && reported > estimated;
    if !comparable {
        return false;
    }

    let gap = reported.saturating_sub(estimated);
    let fourfold_estimate = estimated.saturating_mul(4);
    gap >= MIN_ABSOLUTE_GAP && reported >= fourfold_estimate
}
fn maybe_warn_context_pressure(app: &mut App) {
let Some((used, max, percent)) = context_usage_snapshot(app) else {
return;
+26 -1
View File
@@ -372,6 +372,25 @@ fn context_usage_snapshot_prefers_estimate_when_reported_exceeds_window() {
assert!(percent < 100.0);
}
/// A huge reported prompt-token count paired with a tiny live transcript
/// must make the snapshot use the local estimate instead of the report.
#[test]
fn context_usage_snapshot_prefers_estimate_when_reported_is_inflated_by_old_reasoning() {
    let small_prompt = Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: "small current context".to_string(),
            cache_control: None,
        }],
    };

    let mut app = create_test_app();
    app.api_messages = vec![small_prompt];
    // Reported usage far beyond what the single short message could need.
    app.last_prompt_tokens = Some(980_000);

    let snapshot = context_usage_snapshot(&app);
    let (used, max, percent) = snapshot.expect("context usage should be available");

    assert_eq!(max, 1_000_000);
    assert!(used < 10_000);
    assert!(percent < 2.0);
}
#[test]
fn context_usage_snapshot_prefers_live_estimate_while_loading() {
let mut app = create_test_app();
@@ -397,7 +416,13 @@ fn context_usage_snapshot_prefers_live_estimate_while_loading() {
#[test]
fn should_auto_compact_before_send_respects_threshold_and_setting() {
let mut app = create_test_app();
app.last_prompt_tokens = Some(950_000);
app.api_messages = vec![Message {
role: "user".to_string(),
content: vec![ContentBlock::Text {
text: "context ".repeat(400_000),
cache_control: None,
}],
}];
app.auto_compact = true;
assert!(should_auto_compact_before_send(&app));
+4 -2
View File
@@ -25,7 +25,9 @@ For NVIDIA NIM-hosted DeepSeek V4 Pro, set `provider = "nvidia-nim"` or pass
`[providers.nvidia_nim]` and forwards the resolved key, base URL, provider, and
model to the TUI process. Use
`deepseek auth set --provider nvidia-nim --api-key "YOUR_NVIDIA_API_KEY"` to
save the NIM key through the facade. `DEEPSEEK_API_KEY` remains a compatibility
fallback when `DEEPSEEK_PROVIDER=nvidia-nim`, but `NVIDIA_API_KEY` or
`NVIDIA_NIM_API_KEY` is preferred.
To bootstrap MCP and skills directories at their resolved paths, run `deepseek-tui setup`.
To only scaffold MCP, run `deepseek-tui mcp init`.
@@ -70,7 +72,7 @@ These override config values:
- `DEEPSEEK_BASE_URL`
- `DEEPSEEK_PROVIDER` (`deepseek|nvidia-nim`)
- `DEEPSEEK_MODEL` or `DEEPSEEK_DEFAULT_TEXT_MODEL`
- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (preferred when provider is `nvidia-nim`; falls back to `DEEPSEEK_API_KEY`)
- `NVIDIA_BASE_URL` or `NVIDIA_NIM_BASE_URL`
- `NVIDIA_NIM_MODEL`
- `DEEPSEEK_SKILLS_DIR`
+3 -3
View File
@@ -45,9 +45,9 @@ deepseek --provider nvidia-nim
```
For a single process, set `DEEPSEEK_PROVIDER=nvidia-nim` and `NVIDIA_API_KEY`
or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback).
The NIM default model is `deepseek-ai/deepseek-v4-pro` and the default base URL
is `https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
## Supported platforms