Fix reasoning replay and context accounting for NIM
This commit is contained in:
@@ -118,10 +118,10 @@ Controls: `F1` help, `Esc` backs out of the current action, `Ctrl+K` command pal
|
||||
Key environment overrides: `DEEPSEEK_API_KEY`, `DEEPSEEK_BASE_URL`,
|
||||
`DEEPSEEK_MODEL`, `DEEPSEEK_PROFILE`, `DEEPSEEK_PROVIDER`.
|
||||
For NVIDIA NIM, use `DEEPSEEK_PROVIDER=nvidia-nim` plus `NVIDIA_API_KEY`
|
||||
or `NVIDIA_NIM_API_KEY`; the default model is `deepseek-ai/deepseek-v4-pro`
|
||||
and the default base URL is `https://integrate.api.nvidia.com/v1`. With
|
||||
`--provider nvidia-nim`, `--model deepseek-v4-flash` maps to
|
||||
`deepseek-ai/deepseek-v4-flash`.
|
||||
or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback);
|
||||
the default model is `deepseek-ai/deepseek-v4-pro` and the default base URL is
|
||||
`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
|
||||
`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
|
||||
|
||||
The client targets DeepSeek's documented OpenAI-compatible Chat Completions API
|
||||
(`/chat/completions`). DeepSeek context caching is automatic; when the API
|
||||
|
||||
@@ -580,7 +580,10 @@ impl EnvRuntimeOverrides {
|
||||
fn api_key_for(&self, provider: ProviderKind) -> Option<String> {
|
||||
match provider {
|
||||
ProviderKind::Deepseek => self.deepseek_api_key.clone(),
|
||||
ProviderKind::NvidiaNim => self.nvidia_api_key.clone(),
|
||||
ProviderKind::NvidiaNim => self
|
||||
.nvidia_api_key
|
||||
.clone()
|
||||
.or_else(|| self.deepseek_api_key.clone()),
|
||||
ProviderKind::Openai => self.openai_api_key.clone(),
|
||||
}
|
||||
}
|
||||
@@ -780,6 +783,23 @@ mod tests {
|
||||
assert_eq!(resolved.model, DEFAULT_NVIDIA_NIM_MODEL);
|
||||
}
|
||||
|
||||
/// With the NIM provider selected and only `DEEPSEEK_API_KEY` exported, the
/// resolved runtime options must carry the DeepSeek key as the NIM key.
#[test]
fn nvidia_nim_provider_can_fallback_to_deepseek_api_key_env() {
    let _lock = env_lock();
    let _env = EnvGuard::without_deepseek_runtime_overrides();
    // Safety: test-only environment mutation guarded by a module mutex.
    unsafe {
        for (name, value) in [
            ("DEEPSEEK_PROVIDER", "nvidia-nim"),
            ("DEEPSEEK_API_KEY", "deepseek-compat-key"),
        ] {
            env::set_var(name, value);
        }
    }

    let config = ConfigToml::default();
    let cli_overrides = CliRuntimeOverrides::default();
    let runtime = config.resolve_runtime_options(&cli_overrides);

    assert_eq!(runtime.provider, ProviderKind::NvidiaNim);
    assert_eq!(runtime.api_key.as_deref(), Some("deepseek-compat-key"));
}
|
||||
|
||||
#[test]
|
||||
fn list_values_redacts_root_api_key() {
|
||||
let config = ConfigToml {
|
||||
|
||||
+103
-24
@@ -1351,6 +1351,7 @@ fn build_chat_messages_with_reasoning(
|
||||
) -> Vec<Value> {
|
||||
let mut out = Vec::new();
|
||||
let mut pending_tool_calls: HashSet<String> = HashSet::new();
|
||||
let current_turn_start = messages.iter().rposition(is_text_user_message);
|
||||
|
||||
if let Some(instructions) = system_to_instructions(system.cloned())
|
||||
&& !instructions.trim().is_empty()
|
||||
@@ -1361,7 +1362,7 @@ fn build_chat_messages_with_reasoning(
|
||||
}));
|
||||
}
|
||||
|
||||
for message in messages {
|
||||
for (message_index, message) in messages.iter().enumerate() {
|
||||
let role = message.role.as_str();
|
||||
let mut text_parts = Vec::new();
|
||||
let mut thinking_parts = Vec::new();
|
||||
@@ -1423,16 +1424,22 @@ fn build_chat_messages_with_reasoning(
|
||||
let reasoning_content = thinking_parts.join("\n");
|
||||
let has_text = !content.trim().is_empty();
|
||||
let mut has_tool_calls = !tool_calls.is_empty();
|
||||
let include_reasoning_for_turn = include_reasoning && has_tool_calls;
|
||||
let include_reasoning_for_turn = include_reasoning
|
||||
&& has_tool_calls
|
||||
&& current_turn_start.is_some_and(|start| message_index > start)
|
||||
&& !has_later_assistant_text(messages, message_index);
|
||||
let has_reasoning = include_reasoning_for_turn && !reasoning_content.trim().is_empty();
|
||||
|
||||
// DeepSeek thinking-mode tool turns are stateful within the
|
||||
// stateless Chat Completions transcript: if an assistant performed
|
||||
// a tool call, its `reasoning_content` must be replayed in every
|
||||
// later request. Older checkpoints could lose that field because
|
||||
// the UI display stream had no visible text block. Do not forward
|
||||
// those malformed tool calls; dropping the stale tool round is
|
||||
// better than guaranteeing a provider-side 400.
|
||||
// a tool call in the current user turn, its `reasoning_content`
|
||||
// must be replayed while continuing that tool round. Once a new
|
||||
// user text turn starts, DeepSeek recommends clearing historical
|
||||
// reasoning content so the context is not dominated by old CoT.
|
||||
// Older checkpoints could lose the current-round field because the
|
||||
// UI display stream had no visible text block. Do not forward those
|
||||
// malformed current tool calls; dropping that round is better than
|
||||
// guaranteeing a provider-side 400.
|
||||
if include_reasoning_for_turn && !has_reasoning {
|
||||
logging::warn(
|
||||
"Dropping DeepSeek tool_calls with missing reasoning_content from assistant message",
|
||||
@@ -1611,6 +1618,33 @@ fn build_chat_messages_with_reasoning(
|
||||
out
|
||||
}
|
||||
|
||||
fn is_text_user_message(message: &Message) -> bool {
|
||||
message.role == "user"
|
||||
&& message.content.iter().any(|block| {
|
||||
matches!(
|
||||
block,
|
||||
ContentBlock::Text { text, .. } if !text.trim().is_empty()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
|
||||
messages
|
||||
.iter()
|
||||
.skip(message_index.saturating_add(1))
|
||||
.any(is_text_assistant_message)
|
||||
}
|
||||
|
||||
fn is_text_assistant_message(message: &Message) -> bool {
|
||||
message.role == "assistant"
|
||||
&& message.content.iter().any(|block| {
|
||||
matches!(
|
||||
block,
|
||||
ContentBlock::Text { text, .. } if !text.trim().is_empty()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn tool_to_chat(tool: &Tool) -> Value {
|
||||
let mut value = json!({
|
||||
"type": "function",
|
||||
@@ -2403,7 +2437,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_messages_preserve_prior_tool_round_reasoning_after_new_user_turn() {
|
||||
fn chat_messages_clear_prior_tool_round_reasoning_after_new_user_turn() {
|
||||
let messages = vec![
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
@@ -2455,14 +2489,62 @@ mod tests {
|
||||
.iter()
|
||||
.find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
|
||||
.expect("assistant message");
|
||||
assert_eq!(
|
||||
assistant.get("reasoning_content").and_then(Value::as_str),
|
||||
Some("Need to call a tool")
|
||||
);
|
||||
assert!(assistant.get("tool_calls").is_some());
|
||||
assert!(assistant.get("reasoning_content").is_none());
|
||||
}
|
||||
|
||||
/// Once the assistant has produced its final text answer for a tool round,
/// the earlier tool-call message must still be forwarded with its
/// `tool_calls`, but without `reasoning_content`.
#[test]
fn chat_messages_clear_completed_tool_round_reasoning_after_final_answer() {
    let messages = vec![
        Message {
            role: "user".to_string(),
            content: vec![ContentBlock::Text {
                text: "Need the date".to_string(),
                cache_control: None,
            }],
        },
        Message {
            role: "assistant".to_string(),
            content: vec![
                ContentBlock::Thinking {
                    thinking: "Need to call a tool".to_string(),
                },
                ContentBlock::ToolUse {
                    id: "tool-1".to_string(),
                    name: "get_date".to_string(),
                    input: json!({}),
                    caller: None,
                },
            ],
        },
        Message {
            role: "user".to_string(),
            content: vec![ContentBlock::ToolResult {
                tool_use_id: "tool-1".to_string(),
                content: "2026-04-23".to_string(),
                is_error: None,
                content_blocks: None,
            }],
        },
        // The final answer closes the tool round, so its reasoning is stale.
        Message {
            role: "assistant".to_string(),
            content: vec![ContentBlock::Text {
                text: "It is 2026-04-23.".to_string(),
                cache_control: None,
            }],
        },
    ];
    let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
    // The first assistant entry is the tool-call message.
    let assistant = out
        .iter()
        .find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
        .expect("assistant message");
    assert!(assistant.get("tool_calls").is_some());
    assert!(assistant.get("reasoning_content").is_none());
}
|
||||
|
||||
#[test]
|
||||
fn chat_messages_clear_v4_tool_round_reasoning_after_new_user_turn() {
|
||||
let messages = vec![
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
@@ -2515,16 +2597,20 @@ mod tests {
|
||||
.iter()
|
||||
.find(|value| value.get("role").and_then(Value::as_str) == Some("assistant"))
|
||||
.expect("assistant message");
|
||||
assert_eq!(
|
||||
assistant.get("reasoning_content").and_then(Value::as_str),
|
||||
Some("Need a tool for this")
|
||||
);
|
||||
assert!(assistant.get("tool_calls").is_some());
|
||||
assert!(assistant.get("reasoning_content").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_messages_drop_v4_tool_round_missing_reasoning() {
|
||||
let messages = vec![
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: "Use a tool".to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
},
|
||||
Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ContentBlock::ToolUse {
|
||||
@@ -2543,13 +2629,6 @@ mod tests {
|
||||
content_blocks: None,
|
||||
}],
|
||||
},
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: "continue".to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
},
|
||||
];
|
||||
|
||||
let out = build_chat_messages(None, &messages, "deepseek-v4-pro");
|
||||
|
||||
@@ -501,14 +501,15 @@ fn enforce_tool_call_pairs(messages: &[Message], pinned_indices: &mut BTreeSet<u
|
||||
}
|
||||
}
|
||||
|
||||
fn estimate_tokens_for_message(message: &Message) -> usize {
|
||||
fn estimate_tokens_for_message(message: &Message, include_thinking: bool) -> usize {
|
||||
message
|
||||
.content
|
||||
.iter()
|
||||
.map(|c| match c {
|
||||
ContentBlock::Text { text, .. } => text.len() / 4,
|
||||
// Historical reasoning blocks are UI/session metadata for DeepSeek.
|
||||
// They are only sent back during an in-progress tool-call round.
|
||||
// Only current-turn tool-call reasoning is sent back to the API.
|
||||
ContentBlock::Thinking { thinking } if include_thinking => thinking.len() / 4,
|
||||
ContentBlock::Thinking { .. } => 0,
|
||||
ContentBlock::ToolUse { input, .. } => serde_json::to_string(input)
|
||||
.map(|s| s.len() / 4)
|
||||
@@ -523,7 +524,51 @@ fn estimate_tokens_for_message(message: &Message) -> usize {
|
||||
|
||||
pub fn estimate_tokens(messages: &[Message]) -> usize {
|
||||
// Rough estimate: ~4 chars per token
|
||||
messages.iter().map(estimate_tokens_for_message).sum()
|
||||
let current_turn_start = messages.iter().rposition(is_text_user_message);
|
||||
messages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(index, message)| {
|
||||
let include_thinking = current_turn_start.is_some_and(|start| index > start)
|
||||
&& message_has_tool_use(message)
|
||||
&& !has_later_assistant_text(messages, index);
|
||||
estimate_tokens_for_message(message, include_thinking)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
fn is_text_user_message(message: &Message) -> bool {
|
||||
message.role == "user"
|
||||
&& message.content.iter().any(|block| {
|
||||
matches!(
|
||||
block,
|
||||
ContentBlock::Text { text, .. } if !text.trim().is_empty()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn message_has_tool_use(message: &Message) -> bool {
|
||||
message
|
||||
.content
|
||||
.iter()
|
||||
.any(|block| matches!(block, ContentBlock::ToolUse { .. }))
|
||||
}
|
||||
|
||||
fn has_later_assistant_text(messages: &[Message], message_index: usize) -> bool {
|
||||
messages
|
||||
.iter()
|
||||
.skip(message_index.saturating_add(1))
|
||||
.any(is_text_assistant_message)
|
||||
}
|
||||
|
||||
fn is_text_assistant_message(message: &Message) -> bool {
|
||||
message.role == "assistant"
|
||||
&& message.content.iter().any(|block| {
|
||||
matches!(
|
||||
block,
|
||||
ContentBlock::Text { text, .. } if !text.trim().is_empty()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn estimate_text_tokens_conservative(text: &str) -> usize {
|
||||
@@ -576,14 +621,14 @@ pub fn should_compact(
|
||||
let pinned_tokens: usize = plan
|
||||
.pinned_indices
|
||||
.iter()
|
||||
.map(|&idx| estimate_tokens_for_message(&messages[idx]))
|
||||
.map(|&idx| estimate_tokens_for_message(&messages[idx], false))
|
||||
.sum();
|
||||
let pinned_count = plan.pinned_indices.len();
|
||||
|
||||
let token_estimate: usize = plan
|
||||
.summarize_indices
|
||||
.iter()
|
||||
.map(|&idx| estimate_tokens_for_message(&messages[idx]))
|
||||
.map(|&idx| estimate_tokens_for_message(&messages[idx], false))
|
||||
.sum();
|
||||
let message_count = plan.summarize_indices.len();
|
||||
|
||||
@@ -1112,6 +1157,76 @@ mod tests {
|
||||
assert!(tokens > 0 && tokens < 10);
|
||||
}
|
||||
|
||||
/// Thinking text must be counted while its tool round is still in progress
/// and ignored once the round has a final assistant answer or a newer user
/// turn has started.
#[test]
fn estimate_tokens_counts_current_tool_round_thinking_only() {
    let thinking = "reasoning ".repeat(800);
    // In-progress round: user prompt, assistant thinking + tool call, tool result.
    let current_messages = vec![
        Message {
            role: "user".to_string(),
            content: vec![ContentBlock::Text {
                text: "Use a tool".to_string(),
                cache_control: None,
            }],
        },
        Message {
            role: "assistant".to_string(),
            content: vec![
                ContentBlock::Thinking {
                    thinking: thinking.clone(),
                },
                ContentBlock::ToolUse {
                    id: "tool-1".to_string(),
                    name: "read_file".to_string(),
                    input: serde_json::json!({"path": "Cargo.toml"}),
                    caller: None,
                },
            ],
        },
        Message {
            role: "user".to_string(),
            content: vec![ContentBlock::ToolResult {
                tool_use_id: "tool-1".to_string(),
                content: "manifest".to_string(),
                is_error: None,
                content_blocks: None,
            }],
        },
    ];
    // Same round, then a final answer and a fresh user question.
    let historical_messages = {
        let mut messages = current_messages.clone();
        messages.push(Message {
            role: "assistant".to_string(),
            content: vec![ContentBlock::Text {
                text: "Done.".to_string(),
                cache_control: None,
            }],
        });
        messages.push(Message {
            role: "user".to_string(),
            content: vec![ContentBlock::Text {
                text: "Next question.".to_string(),
                cache_control: None,
            }],
        });
        messages
    };
    // Same round, closed by a final assistant answer only.
    let completed_messages = {
        let mut messages = current_messages.clone();
        messages.push(Message {
            role: "assistant".to_string(),
            content: vec![ContentBlock::Text {
                text: "Done.".to_string(),
                cache_control: None,
            }],
        });
        messages
    };

    // Counted thinking contributes `len / 4`; the thresholds leave slack on
    // both sides of that value.
    assert!(estimate_tokens(&current_messages) > thinking.len() / 5);
    assert!(estimate_tokens(&completed_messages) < thinking.len() / 8);
    assert!(estimate_tokens(&historical_messages) < thinking.len() / 8);
}
|
||||
|
||||
#[test]
|
||||
fn should_compact_respects_enabled_flag() {
|
||||
let config = CompactionConfig {
|
||||
|
||||
@@ -3531,6 +3531,11 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
|
||||
{
|
||||
estimated
|
||||
}
|
||||
(Some(reported), Some(estimated))
|
||||
if is_reported_context_inflated(reported, estimated) =>
|
||||
{
|
||||
estimated
|
||||
}
|
||||
(Some(reported), _) => reported,
|
||||
(None, Some(estimated)) => estimated,
|
||||
(None, None) => return None,
|
||||
@@ -3543,6 +3548,16 @@ fn context_usage_snapshot(app: &App) -> Option<(i64, u32, f64)> {
|
||||
Some((used, max, percent))
|
||||
}
|
||||
|
||||
/// Decides whether a reported prompt-token count is implausibly large
/// compared with the locally estimated one.
///
/// Returns `true` only when all of the following hold:
/// - `estimated` is positive and `reported` is strictly greater than it;
/// - the difference is at least `MIN_ABSOLUTE_GAP` tokens;
/// - `reported` is at least `MIN_RATIO` times `estimated`.
///
/// Saturating arithmetic keeps extreme inputs from overflowing.
fn is_reported_context_inflated(reported: i64, estimated: i64) -> bool {
    // Small absolute differences are treated as noise, never as inflation.
    const MIN_ABSOLUTE_GAP: i64 = 4_096;
    // The reported value must also dwarf the estimate by this factor.
    const MIN_RATIO: i64 = 4;

    if estimated <= 0 || reported <= estimated {
        return false;
    }

    reported.saturating_sub(estimated) >= MIN_ABSOLUTE_GAP
        && reported >= estimated.saturating_mul(MIN_RATIO)
}
|
||||
|
||||
fn maybe_warn_context_pressure(app: &mut App) {
|
||||
let Some((used, max, percent)) = context_usage_snapshot(app) else {
|
||||
return;
|
||||
|
||||
@@ -372,6 +372,25 @@ fn context_usage_snapshot_prefers_estimate_when_reported_exceeds_window() {
|
||||
assert!(percent < 100.0);
|
||||
}
|
||||
|
||||
/// A reported prompt size far above what the tiny transcript could need must
/// be replaced by the local estimate in the usage snapshot.
#[test]
fn context_usage_snapshot_prefers_estimate_when_reported_is_inflated_by_old_reasoning() {
    let small_prompt = Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: "small current context".to_string(),
            cache_control: None,
        }],
    };

    let mut app = create_test_app();
    app.last_prompt_tokens = Some(980_000);
    app.api_messages = vec![small_prompt];

    let snapshot = context_usage_snapshot(&app);
    let (used, max, percent) = snapshot.expect("context usage should be available");
    assert_eq!(max, 1_000_000);
    assert!(used < 10_000);
    assert!(percent < 2.0);
}
|
||||
|
||||
#[test]
|
||||
fn context_usage_snapshot_prefers_live_estimate_while_loading() {
|
||||
let mut app = create_test_app();
|
||||
@@ -397,7 +416,13 @@ fn context_usage_snapshot_prefers_live_estimate_while_loading() {
|
||||
#[test]
|
||||
fn should_auto_compact_before_send_respects_threshold_and_setting() {
|
||||
let mut app = create_test_app();
|
||||
app.last_prompt_tokens = Some(950_000);
|
||||
app.api_messages = vec![Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: "context ".repeat(400_000),
|
||||
cache_control: None,
|
||||
}],
|
||||
}];
|
||||
app.auto_compact = true;
|
||||
assert!(should_auto_compact_before_send(&app));
|
||||
|
||||
|
||||
@@ -25,7 +25,9 @@ For NVIDIA NIM-hosted DeepSeek V4 Pro, set `provider = "nvidia-nim"` or pass
|
||||
`[providers.nvidia_nim]` and forwards the resolved key, base URL, provider, and
|
||||
model to the TUI process. Use
|
||||
`deepseek auth set --provider nvidia-nim --api-key "YOUR_NVIDIA_API_KEY"` to
|
||||
save the NIM key through the facade.
|
||||
save the NIM key through the facade. `DEEPSEEK_API_KEY` remains a compatibility
|
||||
fallback when `DEEPSEEK_PROVIDER=nvidia-nim`, but `NVIDIA_API_KEY` or
|
||||
`NVIDIA_NIM_API_KEY` is preferred.
|
||||
|
||||
To bootstrap MCP and skills directories at their resolved paths, run `deepseek-tui setup`.
|
||||
To only scaffold MCP, run `deepseek-tui mcp init`.
|
||||
@@ -70,7 +72,7 @@ These override config values:
|
||||
- `DEEPSEEK_BASE_URL`
|
||||
- `DEEPSEEK_PROVIDER` (`deepseek|nvidia-nim`)
|
||||
- `DEEPSEEK_MODEL` or `DEEPSEEK_DEFAULT_TEXT_MODEL`
|
||||
- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (used when provider is `nvidia-nim`)
|
||||
- `NVIDIA_API_KEY` or `NVIDIA_NIM_API_KEY` (preferred when provider is `nvidia-nim`; falls back to `DEEPSEEK_API_KEY`)
|
||||
- `NVIDIA_BASE_URL` or `NVIDIA_NIM_BASE_URL`
|
||||
- `NVIDIA_NIM_MODEL`
|
||||
- `DEEPSEEK_SKILLS_DIR`
|
||||
|
||||
@@ -45,9 +45,9 @@ deepseek --provider nvidia-nim
|
||||
```
|
||||
|
||||
For a single process, set `DEEPSEEK_PROVIDER=nvidia-nim` and `NVIDIA_API_KEY`
|
||||
or `NVIDIA_NIM_API_KEY`. The NIM default model is
|
||||
`deepseek-ai/deepseek-v4-pro` and the default base URL is
|
||||
`https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
|
||||
or `NVIDIA_NIM_API_KEY` (with `DEEPSEEK_API_KEY` as a compatibility fallback).
|
||||
The NIM default model is `deepseek-ai/deepseek-v4-pro` and the default base URL
|
||||
is `https://integrate.api.nvidia.com/v1`. With `--provider nvidia-nim`,
|
||||
`--model deepseek-v4-flash` maps to `deepseek-ai/deepseek-v4-flash`.
|
||||
|
||||
## Supported platforms
|
||||
|
||||
Reference in New Issue
Block a user