From a4dee56fcc9908dafc14e28102782f1728e850fe Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:06:07 -0500 Subject: [PATCH 01/11] fix(compaction): 500K hard floor plus V4 default --- crates/tui/src/compaction.rs | 125 +++++++++++++++++++++++++++++++++-- 1 file changed, 119 insertions(+), 6 deletions(-) diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs index 59024426..0f3acb09 100644 --- a/crates/tui/src/compaction.rs +++ b/crates/tui/src/compaction.rs @@ -25,25 +25,58 @@ pub struct CompactionConfig { pub message_threshold: usize, pub model: String, pub cache_summary: bool, + /// Hard floor — `should_compact` returns `false` when total session + /// tokens fall below this number, regardless of `enabled`, + /// `token_threshold`, or `message_threshold`. Defaults to + /// [`MINIMUM_AUTO_COMPACTION_TOKENS`] (500K) for v0.8.11+. Tests that + /// want to exercise the older threshold/message-count logic at small + /// fixture sizes can set this to `0` to disable the floor. + pub auto_floor_tokens: usize, } impl Default for CompactionConfig { fn default() -> Self { Self { - // ON BY DEFAULT since v0.8.6 (#402 P0 survivability). - // Long-running sessions need automatic compaction to stay - // within the model's context budget. Users who prefer the - // previous behaviour can opt out via `auto_compact = false` - // in settings or `compaction.enabled = false` in config. + // ON BY DEFAULT since v0.8.6 (#402 P0 survivability) — but the + // engine-level `auto_compact` setting was flipped OFF in v0.8.11 + // (#665) so this default is mostly a fallback for code paths + // that build a `CompactionConfig` without going through + // `compaction_threshold_for_model_and_effort`. Real per-model + // values are still derived through that helper. enabled: true, - token_threshold: 50000, + // v0.8.11: 50K was a 128K-era leftover that biased every + // unconfigured caller toward "compact almost immediately on V4." 
+ // Bumped to 800K (80% of V4's 1M window) so the dead-code + // default no longer lies. Real call sites override this via + // `compaction_threshold_for_model_and_effort`. + token_threshold: 800_000, message_threshold: 50, model: DEFAULT_TEXT_MODEL.to_string(), cache_summary: true, + auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, } } } +/// Hard floor for automatic compaction in v0.8.11+. +/// +/// Below this token count, `should_compact` returns `false` regardless of +/// `enabled`, `token_threshold`, or `message_threshold`. The point of the +/// floor is V4 prefix-cache economics: compaction rewrites the stable +/// prefix, which destroys the KV cache. At low token counts the prefix +/// cache is healthy and compaction's cost (full re-prefill at miss prices) +/// dwarfs its benefit (a tiny budget reclaim). Above the floor compaction +/// can still be net-positive — cache is already pressured, the prefix has +/// drifted, and freeing budget matters. +/// +/// Manual `/compact` slash command and the model-callable `compact_now` +/// tool both bypass this floor with a deliberate refusal message — they +/// represent explicit agency rather than implicit policy. +/// +/// Constant rather than configurable for v0.8.11. If anyone needs to dial +/// it (smaller models, opinionated workflows), we can add a setting later. +pub const MINIMUM_AUTO_COMPACTION_TOKENS: usize = 500_000; + pub const KEEP_RECENT_MESSAGES: usize = 4; const RECENT_WORKING_SET_WINDOW: usize = 12; const MAX_WORKING_SET_PATHS: usize = 24; @@ -585,6 +618,21 @@ pub fn should_compact( return false; } + // v0.8.11: hard floor enforcement. Below the floor (default 500K tokens + // — see `MINIMUM_AUTO_COMPACTION_TOKENS`), automatic compaction is + // refused because rewriting the prefix kills V4's prefix cache for + // little budget recovery. Manual `/compact` and the `compact_now` tool + // bypass this floor by going through different code paths. 
+ if config.auto_floor_tokens > 0 { + let total_session_tokens: usize = messages + .iter() + .map(|m| estimate_tokens_for_message(m, false)) + .sum(); + if total_session_tokens < config.auto_floor_tokens { + return false; + } + } + let plan = plan_compaction( messages, workspace, @@ -1445,6 +1493,9 @@ mod tests { enabled: true, token_threshold: 1_000_000, // Very high message_threshold: 5, + // Disable the v0.8.11 500K floor so this test exercises the + // pure message-count threshold logic at small fixture sizes. + auto_floor_tokens: 0, ..Default::default() }; @@ -1585,6 +1636,7 @@ mod tests { enabled: true, token_threshold: 1_000_000, message_threshold: 5, + auto_floor_tokens: 0, ..Default::default() }; @@ -1603,6 +1655,7 @@ mod tests { enabled: true, token_threshold: 50, message_threshold: 50, + auto_floor_tokens: 0, ..Default::default() }; @@ -1874,6 +1927,7 @@ mod tests { enabled: true, token_threshold: 100, // Low threshold for testing message_threshold: 1000, // High message threshold + auto_floor_tokens: 0, ..Default::default() }; @@ -1901,6 +1955,65 @@ mod tests { assert!(!should_compact(&messages, &config, None, None, None)); } + /// v0.8.11: the 500K hard floor blocks auto-compaction even when the + /// token-percentage threshold would otherwise fire. This is the V4 + /// prefix-cache protection — below 500K total tokens, rewriting the + /// prefix loses cache for tiny budget gains. + #[test] + fn auto_compaction_floor_blocks_below_500k_even_when_threshold_says_yes() { + let config = CompactionConfig { + enabled: true, + token_threshold: 100, // would normally fire instantly + message_threshold: 1000, // not the trigger + // Use the production default explicitly so this test pins the + // floor's contract rather than relying on `Default`. 
+ auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, + ..Default::default() + }; + + let messages: Vec = (0..10).map(|_| msg("user", &"x".repeat(50))).collect(); + // Total tokens way under 500K, so floor blocks compaction. + assert!(!should_compact(&messages, &config, None, None, None)); + } + + /// v0.8.11: when total tokens cross the 500K floor, the existing + /// threshold/message-count logic takes over again. + #[test] + fn auto_compaction_floor_yields_to_threshold_logic_above_500k() { + let config = CompactionConfig { + enabled: true, + token_threshold: 2_000_000, + message_threshold: 2_000, + auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, + ..Default::default() + }; + + // Each message ~500 tokens; 1100 messages → ~550K total tokens. + // That's above the floor (500K) AND below the deliberately high + // token_threshold, so auto-compaction stays off — by threshold, + // not floor. + let messages: Vec = (0..1100).map(|_| msg("user", &"x".repeat(2000))).collect(); + assert!(!should_compact(&messages, &config, None, None, None)); + + // Crank threshold below total → compaction fires now that we're + // past the floor. + let config_lower = CompactionConfig { + token_threshold: 100_000, + ..config + }; + assert!(should_compact(&messages, &config_lower, None, None, None)); + } + + /// `CompactionConfig::default()` ships with the 500K floor on by + /// default — production callers via `..Default::default()` get the + /// safety guarantee automatically. 
+ #[test] + fn compaction_config_default_carries_500k_floor() { + let config = CompactionConfig::default(); + assert_eq!(config.auto_floor_tokens, MINIMUM_AUTO_COMPACTION_TOKENS); + assert_eq!(config.auto_floor_tokens, 500_000); + } + #[test] fn test_plan_compaction_pins_error_messages() { let messages = vec![ From a14227edf8169195501160dba1cdf3c400de9c0b Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:06:16 -0500 Subject: [PATCH 02/11] refactor(models): rename legacy DeepSeek context window --- crates/tui/src/config.rs | 4 +- crates/tui/src/core/engine/capacity_flow.rs | 5 ++- crates/tui/src/models.rs | 45 +++++++++++---------- crates/tui/src/tui/context_inspector.rs | 13 +++--- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 149fb35f..49fd56f3 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -225,7 +225,7 @@ pub fn provider_capability(provider: ApiProvider, resolved_model: &str) -> Provi crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS } else { crate::models::context_window_for_model(resolved_model) - .unwrap_or(crate::models::DEFAULT_CONTEXT_WINDOW_TOKENS) + .unwrap_or(crate::models::LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS) }; // Max output tokens: DeepSeek V4 models allow 262K; others get 4096. 
@@ -4070,7 +4070,7 @@ model = "deepseek-v4-pro" let cap = provider_capability(ApiProvider::Deepseek, "deepseek-coder"); assert_eq!( cap.context_window, - crate::models::DEFAULT_CONTEXT_WINDOW_TOKENS + crate::models::LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS ); assert_eq!(cap.max_output, 4096); assert!(!cap.thinking_supported); diff --git a/crates/tui/src/core/engine/capacity_flow.rs b/crates/tui/src/core/engine/capacity_flow.rs index f280e644..cee5fb76 100644 --- a/crates/tui/src/core/engine/capacity_flow.rs +++ b/crates/tui/src/core/engine/capacity_flow.rs @@ -160,9 +160,10 @@ impl Engine { let unique_reference_ids_recent_window = self.recent_unique_reference_count(message_window, turn); let context_window = usize::try_from( - context_window_for_model(&self.session.model).unwrap_or(DEFAULT_CONTEXT_WINDOW_TOKENS), + context_window_for_model(&self.session.model) + .unwrap_or(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS), ) - .unwrap_or(usize::try_from(DEFAULT_CONTEXT_WINDOW_TOKENS).unwrap_or(128_000)) + .unwrap_or(usize::try_from(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS).unwrap_or(128_000)) .max(1); let context_used_ratio = (self.estimated_input_tokens() as f64) / (context_window as f64); diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs index 8b1413ee..320d8305 100644 --- a/crates/tui/src/models.rs +++ b/crates/tui/src/models.rs @@ -2,15 +2,17 @@ use serde::{Deserialize, Serialize}; -pub const DEFAULT_CONTEXT_WINDOW_TOKENS: u32 = 128_000; +/// Context window used only for legacy DeepSeek model IDs that do not name a +/// newer V4 alias and do not carry an explicit `*k` suffix. +pub const LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS: u32 = 128_000; pub const DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS: u32 = 1_000_000; /// Last-resort compaction trigger when [`context_window_for_model`] returns /// `None` (an unrecognised model id). 
v0.8.11 raised this from `50_000` to -/// `102_400` (80% of [`DEFAULT_CONTEXT_WINDOW_TOKENS`]) so unknown models -/// inherit the same late-trigger discipline as V4 instead of paying the -/// prefix-cache hit at 5% of the V4 window. Known DeepSeek / Claude models -/// resolve to their own scaled value via [`compaction_threshold_for_model`] -/// (#664). +/// `102_400` (80% of [`LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS`]) so unknown +/// models inherit the same late-trigger discipline as V4 instead of paying +/// the prefix-cache hit at 5% of the V4 window. Known DeepSeek / Claude +/// models resolve to their own scaled value via +/// [`compaction_threshold_for_model`] (#664). pub const DEFAULT_COMPACTION_TOKEN_THRESHOLD: usize = 102_400; pub const DEFAULT_COMPACTION_MESSAGE_THRESHOLD: usize = 50; const COMPACTION_THRESHOLD_PERCENT: u32 = 80; @@ -212,8 +214,9 @@ pub struct Usage { #[must_use] pub fn context_window_for_model(model: &str) -> Option { let lower = model.to_lowercase(); - // Unknown DeepSeek model IDs default to 128k unless an explicit *k suffix is present. - // DeepSeek-V4 family and current legacy aliases ship with a 1M context window. + // Unknown legacy DeepSeek model IDs default to 128K unless an explicit + // *k suffix is present. DeepSeek-V4 family and current compatibility + // aliases ship with a 1M context window. 
if lower.contains("deepseek") { if let Some(explicit_window) = deepseek_context_window_hint(&lower) { return Some(explicit_window); @@ -221,7 +224,7 @@ pub fn context_window_for_model(model: &str) -> Option { if lower.contains("v4") || is_current_deepseek_v4_alias(&lower) { return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS); } - return Some(DEFAULT_CONTEXT_WINDOW_TOKENS); + return Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS); } if lower.contains("claude") { return Some(200_000); @@ -411,14 +414,14 @@ mod tests { } #[test] - fn unknown_deepseek_models_map_to_128k_context_window() { + fn unknown_legacy_deepseek_models_map_to_128k_context_window() { assert_eq!( context_window_for_model("deepseek-coder"), - Some(DEFAULT_CONTEXT_WINDOW_TOKENS) + Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS) ); assert_eq!( context_window_for_model("deepseek-v3.2-0324"), - Some(DEFAULT_CONTEXT_WINDOW_TOKENS) + Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS) ); } @@ -447,7 +450,7 @@ mod tests { ); assert_eq!( context_window_for_model("deepseek-v3.2-2k-preview"), - Some(DEFAULT_CONTEXT_WINDOW_TOKENS) + Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS) ); } @@ -458,11 +461,11 @@ mod tests { 102_400 ); // v0.8.11 (#664): unknown-model fallback also resolves to 80% of - // `DEFAULT_CONTEXT_WINDOW_TOKENS` (128k) — same late-trigger - // discipline as the V4 path. Was `50_000` pre-v0.8.11; that - // hardcoded value compacted at ~5% of a 1M window when the model - // detection silently fell through, which is exactly the - // prefix-cache-burning behaviour we're getting away from. + // `LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS` (128K legacy DeepSeek + // fallback) — same late-trigger discipline as the V4 path. Was + // `50_000` pre-v0.8.11; that hardcoded value compacted at ~5% of a + // 1M window when model detection silently fell through, which is + // exactly the prefix-cache-burning behaviour we're getting away from. 
assert_eq!(compaction_threshold_for_model("unknown-model"), 102_400); } @@ -509,9 +512,9 @@ mod tests { 102_400 ); // v0.8.11 (#664): unknown-model fallback also lands on the - // 80%-of-128K floor instead of the legacy hardcoded 50K, so - // model-detection-fall-through doesn't quietly burn V4 prefix - // cache at 5%-of-window. + // 80%-of-128K legacy DeepSeek fallback instead of the legacy + // hardcoded 50K, so model-detection-fall-through doesn't quietly + // burn V4 prefix cache at 5%-of-window. assert_eq!( compaction_threshold_for_model_and_effort("unknown-model", Some("max")), 102_400 diff --git a/crates/tui/src/tui/context_inspector.rs b/crates/tui/src/tui/context_inspector.rs index 7f0e139b..12a6cca4 100644 --- a/crates/tui/src/tui/context_inspector.rs +++ b/crates/tui/src/tui/context_inspector.rs @@ -4,16 +4,17 @@ use std::collections::HashSet; use std::fmt::Write; use crate::compaction::estimate_input_tokens_conservative; -use crate::models::{DEFAULT_CONTEXT_WINDOW_TOKENS, SystemPrompt, context_window_for_model}; +use crate::models::{ + LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS, SystemPrompt, context_window_for_model, +}; use crate::session_manager::SessionContextReference; use crate::tui::app::{App, ToolDetailRecord}; use crate::tui::file_mention::ContextReferenceSource; use crate::utils::estimate_message_chars; -/// Marker used by the engine's `append_working_set_summary` to tag the -/// volatile tail block in the system prompt. Replicated here so the -/// context inspector can distinguish stable prefix blocks from the -/// ephemeral working-set block without importing engine internals. +/// Marker used by per-turn working-set metadata. Replicated here so the +/// context inspector can distinguish stable prompt blocks from volatile +/// working-set context without importing engine internals. 
const WORKING_SET_MARKER: &str = "## Repo Working Set"; const CONTEXT_WARNING_THRESHOLD_PERCENT: f64 = 85.0; @@ -68,7 +69,7 @@ pub fn build_context_inspector_text(app: &App) -> String { } fn context_usage(app: &App) -> (usize, u32, f64) { - let max = context_window_for_model(&app.model).unwrap_or(DEFAULT_CONTEXT_WINDOW_TOKENS); + let max = context_window_for_model(&app.model).unwrap_or(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS); let estimated = estimate_input_tokens_conservative(&app.api_messages, app.system_prompt.as_ref()); let total_chars = estimate_message_chars(&app.api_messages); From b48b68f078c61cf97d07354d32468c7346f3d950 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:06:55 -0500 Subject: [PATCH 03/11] perf(engine): stabilize system prompt and move working set metadata --- crates/tui/src/core/engine.rs | 68 ++++++++++------ crates/tui/src/core/engine/context.rs | 52 +----------- crates/tui/src/core/engine/tests.rs | 100 ++++++++++++++++++------ crates/tui/src/core/engine/turn_loop.rs | 33 +++++++- crates/tui/src/core/session.rs | 4 + crates/tui/src/prompts.rs | 69 +++++++--------- 6 files changed, 185 insertions(+), 141 deletions(-) diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index ff053451..aeeb6fb4 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -8,6 +8,8 @@ //! 
- Tool execution orchestration use std::collections::HashMap; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; use std::path::PathBuf; use std::sync::{Arc, Mutex as StdMutex}; use std::time::{Duration, Instant}; @@ -35,8 +37,8 @@ use crate::mcp::McpPool; #[cfg(test)] use crate::models::ToolCaller; use crate::models::{ - ContentBlock, ContentBlockStart, DEFAULT_CONTEXT_WINDOW_TOKENS, Delta, Message, MessageRequest, - StreamEvent, SystemPrompt, Tool, Usage, + ContentBlock, ContentBlockStart, Delta, LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS, Message, + MessageRequest, StreamEvent, SystemPrompt, Tool, Usage, }; use crate::prompts; use crate::seam_manager::{SeamConfig, SeamManager}; @@ -353,8 +355,9 @@ impl Engine { config.mcp_config_path.clone(), ); - // Set up system prompt with project context (default to agent mode) - let working_set_summary = session.working_set.summary_block(&config.workspace); + // Set up stable system prompt with project context (default to agent mode). + // Per-turn working-set metadata is injected into the latest user + // message at request time so file churn does not rewrite this prefix. let user_memory_block = crate::memory::compose_block(config.memory_enabled, &config.memory_path); let system_prompt = prompts::system_prompt_for_mode_with_context_skills_and_session( @@ -368,8 +371,9 @@ impl Engine { goal_objective: config.goal_objective.as_deref(), }, ); - session.system_prompt = - append_working_set_summary(Some(system_prompt), working_set_summary.as_deref()); + let stable_prompt = Some(system_prompt); + session.last_system_prompt_hash = Some(system_prompt_hash(stable_prompt.as_ref())); + session.system_prompt = stable_prompt; let subagent_manager = new_shared_subagent_manager(config.workspace.clone(), config.max_subagents); @@ -1645,10 +1649,6 @@ impl Engine { /// Refresh the system prompt based on current mode and context. 
fn refresh_system_prompt(&mut self, mode: AppMode) { - let working_set_summary = self - .session - .working_set - .summary_block(&self.config.workspace); let user_memory_block = crate::memory::compose_block(self.config.memory_enabled, &self.config.memory_path); let base = prompts::system_prompt_for_mode_with_context_skills_and_session( @@ -1664,8 +1664,11 @@ impl Engine { ); let stable_prompt = merge_system_prompts(Some(&base), self.session.compaction_summary_prompt.clone()); - self.session.system_prompt = - append_working_set_summary(stable_prompt, working_set_summary.as_deref()); + let stable_hash = system_prompt_hash(stable_prompt.as_ref()); + if self.session.last_system_prompt_hash != Some(stable_hash) { + self.session.system_prompt = stable_prompt; + self.session.last_system_prompt_hash = Some(stable_hash); + } } fn merge_compaction_summary(&mut self, summary_prompt: Option) { @@ -1676,18 +1679,36 @@ impl Engine { self.session.compaction_summary_prompt.as_ref(), summary_prompt.clone(), ); - let current_without_working_set = - remove_working_set_summary(self.session.system_prompt.as_ref()); - let merged = merge_system_prompts(current_without_working_set.as_ref(), summary_prompt); - let working_set_summary = self - .session - .working_set - .summary_block(&self.config.workspace); - self.session.system_prompt = - append_working_set_summary(merged, working_set_summary.as_deref()); + let merged = merge_system_prompts(self.session.system_prompt.as_ref(), summary_prompt); + self.session.last_system_prompt_hash = Some(system_prompt_hash(merged.as_ref())); + self.session.system_prompt = merged; } } +fn system_prompt_hash(prompt: Option<&SystemPrompt>) -> u64 { + let mut hasher = DefaultHasher::new(); + match prompt { + Some(SystemPrompt::Text(text)) => { + 0u8.hash(&mut hasher); + text.hash(&mut hasher); + } + Some(SystemPrompt::Blocks(blocks)) => { + 1u8.hash(&mut hasher); + for block in blocks { + block.block_type.hash(&mut hasher); + block.text.hash(&mut hasher); + 
if let Some(cache_control) = &block.cache_control { + cache_control.cache_type.hash(&mut hasher); + } + } + } + None => { + 2u8.hash(&mut hasher); + } + } + hasher.finish() +} + /// Spawn the engine in a background task pub fn spawn_engine(config: EngineConfig, api_config: &Config) -> EngineHandle { let (engine, handle) = Engine::new(config, api_config); @@ -1775,9 +1796,8 @@ mod context; pub(crate) use context::compact_tool_result_for_context; use context::{ COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP, - TURN_MAX_OUTPUT_TOKENS, append_working_set_summary, context_input_budget, - estimate_input_tokens_conservative, extract_compaction_summary_prompt, - is_context_length_error_message, remove_working_set_summary, summarize_text, + TURN_MAX_OUTPUT_TOKENS, context_input_budget, estimate_input_tokens_conservative, + extract_compaction_summary_prompt, is_context_length_error_message, summarize_text, turn_response_headroom_tokens, }; mod dispatch; diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs index 7e80008b..81f053e0 100644 --- a/crates/tui/src/core/engine/context.rs +++ b/crates/tui/src/core/engine/context.rs @@ -6,7 +6,7 @@ use crate::compaction::estimate_tokens; use crate::error_taxonomy::ErrorCategory; -use crate::models::{Message, SystemBlock, SystemPrompt, context_window_for_model}; +use crate::models::{Message, SystemPrompt, context_window_for_model}; use crate::tools::spec::ToolResult; /// Max output tokens requested for normal agent turns. 
Generous on purpose: @@ -288,56 +288,6 @@ pub(super) fn extract_compaction_summary_prompt( } } -pub(super) fn remove_working_set_summary(prompt: Option<&SystemPrompt>) -> Option { - match prompt { - Some(SystemPrompt::Blocks(blocks)) => { - let filtered: Vec = blocks - .iter() - .filter(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER)) - .cloned() - .collect(); - if filtered.is_empty() { - None - } else { - Some(SystemPrompt::Blocks(filtered)) - } - } - Some(SystemPrompt::Text(text)) => Some(SystemPrompt::Text(text.clone())), - None => None, - } -} - -pub(super) fn append_working_set_summary( - prompt: Option, - working_set_summary: Option<&str>, -) -> Option { - let Some(summary) = working_set_summary.map(str::trim).filter(|s| !s.is_empty()) else { - return prompt; - }; - let working_set_block = SystemBlock { - block_type: "text".to_string(), - text: summary.to_string(), - cache_control: None, - }; - - match prompt { - Some(SystemPrompt::Text(text)) => Some(SystemPrompt::Blocks(vec![ - SystemBlock { - block_type: "text".to_string(), - text, - cache_control: None, - }, - working_set_block, - ])), - Some(SystemPrompt::Blocks(mut blocks)) => { - blocks.retain(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER)); - blocks.push(working_set_block); - Some(SystemPrompt::Blocks(blocks)) - } - None => Some(SystemPrompt::Blocks(vec![working_set_block])), - } -} - fn estimate_text_tokens_conservative(text: &str) -> usize { text.chars().count().div_ceil(3) } diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index 1e5e0349..8a9751e7 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -501,7 +501,7 @@ fn subagent_results_are_summarized_before_parent_context_insertion() { } #[test] -fn refresh_system_prompt_places_working_set_after_stable_prefix() { +fn refresh_system_prompt_leaves_working_set_out_of_system_prompt() { let tmp = tempdir().expect("tempdir"); 
fs::create_dir_all(tmp.path().join("src")).expect("mkdir"); fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write"); @@ -518,20 +518,74 @@ fn refresh_system_prompt_places_working_set_after_stable_prefix() { engine.refresh_system_prompt(AppMode::Agent); - let Some(SystemPrompt::Blocks(blocks)) = &engine.session.system_prompt else { - panic!("expected structured prompt blocks"); - }; - let last = blocks.last().expect("working-set block"); - assert!(last.text.contains(WORKING_SET_SUMMARY_MARKER)); - assert!( - blocks[..blocks.len() - 1] + let prompt = match &engine.session.system_prompt { + Some(SystemPrompt::Text(text)) => text.clone(), + Some(SystemPrompt::Blocks(blocks)) => blocks .iter() - .all(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER)) - ); + .map(|block| block.text.as_str()) + .collect::>() + .join("\n"), + None => panic!("expected system prompt"), + }; + assert!(!prompt.contains(WORKING_SET_SUMMARY_MARKER)); } #[test] -fn compaction_summary_stays_before_volatile_working_set() { +fn working_set_reaches_model_as_turn_metadata() { + let tmp = tempdir().expect("tempdir"); + fs::create_dir_all(tmp.path().join("src")).expect("mkdir"); + fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write"); + + let config = EngineConfig { + workspace: tmp.path().to_path_buf(), + ..Default::default() + }; + let (mut engine, _handle) = Engine::new(config, &Config::default()); + engine + .session + .working_set + .observe_user_message("please inspect src/lib.rs", tmp.path()); + engine.session.add_message(Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "please inspect src/lib.rs".to_string(), + cache_control: None, + }], + }); + + let messages = engine.messages_with_turn_metadata(); + let first_block = messages + .last() + .and_then(|message| message.content.first()) + .expect("turn metadata block"); + let ContentBlock::Text { text, .. 
} = first_block else { + panic!("expected text metadata block"); + }; + assert!(text.starts_with("\n")); + assert!(text.contains(WORKING_SET_SUMMARY_MARKER)); + assert!(text.contains("src/lib.rs")); +} + +#[test] +fn refresh_system_prompt_is_noop_when_unchanged() { + let tmp = tempdir().expect("tempdir"); + let config = EngineConfig { + workspace: tmp.path().to_path_buf(), + ..Default::default() + }; + let (mut engine, _handle) = Engine::new(config, &Config::default()); + + engine.refresh_system_prompt(AppMode::Agent); + let first_hash = engine.session.last_system_prompt_hash; + let first_prompt = engine.session.system_prompt.clone(); + engine.refresh_system_prompt(AppMode::Agent); + + assert_eq!(engine.session.last_system_prompt_hash, first_hash); + assert_eq!(engine.session.system_prompt, first_prompt); +} + +#[test] +fn compaction_summary_stays_in_stable_system_prompt() { let tmp = tempdir().expect("tempdir"); fs::create_dir_all(tmp.path().join("src")).expect("mkdir"); fs::write(tmp.path().join("src/main.rs"), "fn main() {}").expect("write"); @@ -552,20 +606,18 @@ fn compaction_summary_stays_before_volatile_working_set() { cache_control: None, }]))); - let Some(SystemPrompt::Blocks(blocks)) = &engine.session.system_prompt else { - panic!("expected structured prompt blocks"); + let prompt = match &engine.session.system_prompt { + Some(SystemPrompt::Text(text)) => text.clone(), + Some(SystemPrompt::Blocks(blocks)) => blocks + .iter() + .map(|block| block.text.as_str()) + .collect::>() + .join("\n"), + None => panic!("expected system prompt"), }; - let summary_index = blocks - .iter() - .position(|block| block.text.contains(COMPACTION_SUMMARY_MARKER)) - .expect("summary block"); - let working_set_index = blocks - .iter() - .position(|block| block.text.contains(WORKING_SET_SUMMARY_MARKER)) - .expect("working-set block"); - assert!(summary_index < working_set_index); - assert_eq!(working_set_index, blocks.len() - 1); + 
assert!(prompt.contains(COMPACTION_SUMMARY_MARKER)); + assert!(!prompt.contains(WORKING_SET_SUMMARY_MARKER)); } #[tokio::test] @@ -635,7 +687,7 @@ async fn pre_request_refresh_invoked_when_medium_risk() { engine.config.model = "deepseek-v3.2-128k".to_string(); let long = "x".repeat(5_000); - for _ in 0..200 { + for _ in 0..900 { engine.session.messages.push(Message { role: "user".to_string(), content: vec![ContentBlock::Text { diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs index 603309d0..2220dc9a 100644 --- a/crates/tui/src/core/engine/turn_loop.rs +++ b/crates/tui/src/core/engine/turn_loop.rs @@ -230,7 +230,7 @@ impl Engine { }; let request = MessageRequest { model: self.session.model.clone(), - messages: self.session.messages.clone(), + messages: self.messages_with_turn_metadata(), max_tokens: TURN_MAX_OUTPUT_TOKENS, system: self.session.system_prompt.clone(), tools: active_tools.clone(), @@ -1594,4 +1594,35 @@ impl Engine { } (TurnOutcomeStatus::Completed, None) } + + pub(super) fn messages_with_turn_metadata(&self) -> Vec { + let Some(summary) = self + .session + .working_set + .summary_block(&self.config.workspace) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + else { + return self.session.messages.clone(); + }; + + let mut messages = self.session.messages.clone(); + let Some(last_user) = messages + .iter_mut() + .rev() + .find(|message| message.role == "user") + else { + return messages; + }; + + let turn_meta = format!("\n{summary}\n"); + last_user.content.insert( + 0, + ContentBlock::Text { + text: turn_meta, + cache_control: None, + }, + ); + messages + } } diff --git a/crates/tui/src/core/session.rs b/crates/tui/src/core/session.rs index 94791c40..347b1fa5 100644 --- a/crates/tui/src/core/session.rs +++ b/crates/tui/src/core/session.rs @@ -25,6 +25,9 @@ pub struct Session { /// System prompt (optional) pub system_prompt: Option, + /// Hash of the last assembled stable system prompt. 
Used to avoid + /// replacing `system_prompt` when unchanged. + pub last_system_prompt_hash: Option, /// Persisted summary blocks generated by context compaction. pub compaction_summary_prompt: Option, @@ -131,6 +134,7 @@ impl Session { } else { None }, + last_system_prompt_hash: None, working_set: WorkingSet::default(), cycle_count: 0, current_cycle_started: Utc::now(), diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs index 9b2b2ecc..eec2646f 100644 --- a/crates/tui/src/prompts.rs +++ b/crates/tui/src/prompts.rs @@ -254,11 +254,11 @@ pub fn system_prompt_for_mode_with_context( /// 4. `## Context Management` (compile-time constant, Agent/Yolo only) /// 5. compaction handoff template (compile-time constant) /// 6. handoff block — file-backed; rewritten by `/compact` and on exit -/// 7. working-set summary — drifts when a new path is observed /// /// Anything appended after a volatile block forfeits the cache for the rest -/// of the request. New blocks belong above the handoff/working-set boundary -/// unless they themselves are turn-volatile. +/// of the request. New blocks belong above the handoff boundary unless they +/// themselves are turn-volatile. Working-set metadata is now injected into the +/// latest user message as per-turn metadata instead of this system prompt. 
pub fn system_prompt_for_mode_with_context_and_skills( mode: AppMode, workspace: &Path, @@ -283,7 +283,7 @@ pub fn system_prompt_for_mode_with_context_and_skills( pub fn system_prompt_for_mode_with_context_skills_and_session( mode: AppMode, workspace: &Path, - working_set_summary: Option<&str>, + _working_set_summary: Option<&str>, skills_dir: Option<&Path>, instructions: Option<&[PathBuf]>, session_context: PromptSessionContext<'_>, @@ -360,6 +360,7 @@ pub fn system_prompt_for_mode_with_context_skills_and_session( If you notice context is getting long (>80%), proactively suggest using `/compact` to the user.\n\n\ ### Prompt-cache awareness\n\n\ DeepSeek caches the longest *byte-stable prefix* of every request and charges roughly 100× less for cache-hit tokens than miss tokens. The system prompt above is layered most-static-first specifically so the prefix stays stable turn-over-turn. To keep cache hits high:\n\ + - **Working set location:** the current repo working set is injected into the latest user message inside a `` block. Treat it as high-priority turn metadata, not as a stable system-prompt section.\n\ - **Append, don't reorder.** New context goes at the end (latest user / tool messages). Reshuffling earlier messages or rewriting their content invalidates the cache for everything after the change.\n\ - **Don't paraphrase quoted content.** If you've already read a file, refer to it by path or line range instead of re-quoting it with different formatting.\n\ - **Use `/compact` as a hard reset, not a tweak.** Compaction is meant for when the cache is already losing — it intentionally rewrites the prefix to a shorter summary. Don't trigger it for small wins.\n\ @@ -382,13 +383,6 @@ pub fn system_prompt_for_mode_with_context_skills_and_session( full_prompt = format!("{full_prompt}\n\n{handoff_block}"); } - // 7. Working-set summary (drifts when a new path is observed). 
- if let Some(summary) = working_set_summary - && !summary.trim().is_empty() - { - full_prompt = format!("{full_prompt}\n\n{summary}"); - } - SystemPrompt::Text(full_prompt) } @@ -547,7 +541,7 @@ mod tests { } #[test] - fn session_goal_is_injected_above_volatile_prompt_tail() { + fn session_goal_is_injected_above_handoff_tail() { let tmp = tempdir().expect("tempdir"); let prompt = match system_prompt_for_mode_with_context_skills_and_session( AppMode::Agent, @@ -566,11 +560,10 @@ mod tests { let goal_pos = prompt.find("").expect("goal block"); let compact_pos = prompt.find("## Compaction Handoff").expect("compact block"); - let working_set_pos = prompt.find("## Repo Working Set").expect("working set"); assert!(prompt.contains("Fix transcript corruption")); assert!(goal_pos < compact_pos); - assert!(goal_pos < working_set_pos); + assert!(!prompt.contains("src/lib.rs")); } #[test] @@ -729,12 +722,10 @@ mod tests { } #[test] - fn system_prompt_with_working_set_summary_is_byte_stable_for_constant_summary() { - // The `working_set_summary` argument is the volatile surface (suspect - // #1 in #263). Independently verifying THIS surface needs a separate - // test in working_set.rs; here we just pin that the surrounding - // prompt construction faithfully embeds whatever summary it's given - // without injecting any non-determinism on its own. + fn system_prompt_ignores_working_set_summary_argument() { + // Working-set metadata is now injected into the latest user message + // per turn. The legacy argument remains for call-site compatibility + // but must not reintroduce volatile bytes into the system prompt. 
let tmp = tempdir().expect("tempdir"); let workspace = tmp.path(); let summary = "## Repo Working Set\nWorkspace: /tmp/x\n"; @@ -754,16 +745,18 @@ mod tests { &a, &b, ); - assert!(a.contains(summary), "summary must be embedded as-is"); + assert!( + !a.contains(summary), + "summary must not be embedded in system prompt" + ); } #[test] fn system_prompt_with_handoff_file_is_byte_stable_when_file_is_unchanged() { - // Companion to the working-set stability test: if `.deepseek/handoff.md` - // hasn't moved between two builds, the rendered prompt must produce - // identical bytes. The handoff block is the second volatile surface - // (the first is the working-set summary) — both land below the static - // boundary in `system_prompt_for_mode_with_context_and_skills`. + // If `.deepseek/handoff.md` hasn't moved between two builds, the + // rendered prompt must produce identical bytes. The handoff block + // lands below the static boundary in + // `system_prompt_for_mode_with_context_and_skills`. let tmp = tempdir().expect("tempdir"); let workspace = tmp.path(); let handoff_dir = workspace.join(".deepseek"); @@ -792,14 +785,11 @@ mod tests { } #[test] - fn handoff_and_working_set_appear_after_static_blocks() { - // Cache-prefix invariant: the volatile blocks (handoff, working_set) - // must come *after* the static `## Context Management` and the - // compaction handoff template (`## Compaction Handoff`) so a churn - // in either volatile section doesn't drag the static blocks out of - // the cached prefix. Pre-fix ordering placed handoff between the - // skills block and `## Context Management`, which busted the cache - // every time `/compact` rewrote the file. + fn handoff_appears_after_static_blocks_without_working_set() { + // Cache-prefix invariant: the handoff block must come after static + // `## Context Management` and the compaction handoff template + // (`## Compaction Handoff`). 
Working-set metadata is per-turn user + // metadata now, not a system-prompt tail block. let tmp = tempdir().expect("tempdir"); let workspace = tmp.path(); let handoff_dir = workspace.join(".deepseek"); @@ -822,9 +812,10 @@ mod tests { let handoff_pos = prompt .find(HANDOFF_BLOCK_MARKER) .expect("handoff block present when fixture file exists"); - let working_set_pos = prompt - .find("## Repo Working Set") - .expect("working-set summary present when supplied"); + assert!( + !prompt.contains("## Repo Working Set"), + "working-set summary must stay out of the system prompt" + ); assert!( context_pos < handoff_pos, @@ -834,10 +825,6 @@ mod tests { compact_pos < handoff_pos, "## Compaction Handoff must precede the handoff block" ); - assert!( - handoff_pos < working_set_pos, - "handoff block must precede the working-set summary (most-volatile last)" - ); } #[test] From 1a6589c55aff7e015318e673541108a07274ec44 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:06:58 -0500 Subject: [PATCH 04/11] perf(tools): anchor tool array with cache control --- crates/tui/src/core/engine.rs | 6 +++--- crates/tui/src/tools/registry.rs | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index aeeb6fb4..e5ff3f01 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -887,9 +887,9 @@ impl Engine { } else { Vec::new() }; - let tools = tool_registry - .as_ref() - .map(|registry| build_model_tool_catalog(registry.to_api_tools(), mcp_tools, mode)); + let tools = tool_registry.as_ref().map(|registry| { + build_model_tool_catalog(registry.to_api_tools_with_cache(true), mcp_tools, mode) + }); // Main turn loop let (status, error) = self diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index d85031bd..5bbca679 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -186,7 +186,6 @@ impl 
ToolRegistry { /// Convert tools to API Tool format with optional cache control on the last tool. #[must_use] - #[allow(dead_code)] pub fn to_api_tools_with_cache(&self, enable_cache: bool) -> Vec { let mut tools = self.to_api_tools(); if enable_cache && let Some(last) = tools.last_mut() { @@ -871,6 +870,27 @@ mod tests { assert_eq!(api_tools[0].description, "A test tool"); } + #[test] + fn api_tools_with_cache_marks_last_tool_ephemeral() { + let tmp = tempdir().expect("tempdir"); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let mut registry = ToolRegistry::new(ctx); + + registry.register(make_test_tool("tool_a")); + registry.register(make_test_tool("tool_b")); + + let api_tools = registry.to_api_tools_with_cache(true); + assert_eq!(api_tools.len(), 2); + assert!(api_tools[0].cache_control.is_none()); + assert_eq!( + api_tools[1] + .cache_control + .as_ref() + .map(|c| c.cache_type.as_str()), + Some("ephemeral") + ); + } + /// Tool whose `description()` advances through a script of pre-built /// strings, one per call. Used to demonstrate that the api-tools cache /// pins the description bytes on first read instead of re-sampling them From 991ae41571784db492cb22bc8ae279faea1ffca2 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:07:04 -0500 Subject: [PATCH 05/11] docs(changelog): describe v0.8.11 cache overhaul --- CHANGELOG.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7ab259a..65a92712 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.8.11] - Unreleased + +### Changed +- **Cache-maxing prompt path for DeepSeek V4** — the engine now skips + system-prompt reassignment when the assembled stable prompt is unchanged, + keeps the volatile repo working-set summary out of the system prompt, and + injects it as per-turn metadata on the latest user message instead. +- **Tool catalog cache anchor** — the model-visible tool array now marks + the final native tool with `cache_control: ephemeral` so DeepSeek can + anchor the stable tool prefix explicitly. +- **V4-scale automatic compaction defaults** — automatic compaction keeps a + 500K-token hard floor and the fallback compaction threshold now reflects + the V4-scale late-trigger policy instead of the old 50K-era default. + +### Fixed +- **Legacy 128K context naming** — the 128K fallback is now named and + documented as legacy DeepSeek-only behavior, reducing ambiguity with the + 1M-token DeepSeek V4 defaults. + ## [0.8.10] - 2026-05-04 A patch release: hotfixes, small UX polish, and four whalescale-unblocking From e98efcf31dddde7bb0ffe7432e01de08ee00d74a Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:08:07 -0500 Subject: [PATCH 06/11] fix(engine): drop dead working set prompt marker --- crates/tui/src/core/engine/context.rs | 1 - crates/tui/src/core/engine/tests.rs | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs index 81f053e0..0a38a063 100644 --- a/crates/tui/src/core/engine/context.rs +++ b/crates/tui/src/core/engine/context.rs @@ -40,7 +40,6 @@ const LARGE_CONTEXT_WINDOW_TOKENS: u32 = 500_000; const TOOL_RESULT_METADATA_SUMMARY_CHARS: usize = 320; pub(super) const COMPACTION_SUMMARY_MARKER: &str = "Conversation Summary (Auto-Generated)"; -pub(super) const WORKING_SET_SUMMARY_MARKER: &str = "## Repo Working Set"; #[derive(Debug, Clone, Copy)] struct ToolResultContextLimits { diff --git a/crates/tui/src/core/engine/tests.rs 
b/crates/tui/src/core/engine/tests.rs index 8a9751e7..9c28826d 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -1,6 +1,5 @@ use super::*; -use super::context::WORKING_SET_SUMMARY_MARKER; use crate::models::SystemBlock; use serde_json::json; use std::collections::HashSet; @@ -9,6 +8,8 @@ use std::path::PathBuf; use std::time::Instant; use tempfile::tempdir; +const WORKING_SET_SUMMARY_MARKER: &str = "## Repo Working Set"; + fn build_engine_with_capacity(capacity: CapacityControllerConfig) -> Engine { let engine_config = EngineConfig { capacity, From d0e95f23b0902d49d96b56e7a02f6f6d4ed70a01 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:19:09 -0500 Subject: [PATCH 07/11] refactor(compaction): drop message_threshold, token-only triggering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `message_threshold` field on `CompactionConfig` was a 128K-era heuristic that fired compaction on long sessions of small messages — exactly the case where rewriting V4's prefix cache is most wasteful. Token budget is the only signal that maps to actual model context pressure; counting messages adds nothing. Changes: * Remove `CompactionConfig::message_threshold` field. * Remove the message-count branch in `should_compact` — token threshold + 500K floor is now the sole compaction trigger. * Remove `compaction_message_threshold_for_model`, `DEFAULT_COMPACTION_MESSAGE_THRESHOLD`, `COMPACTION_MESSAGE_DIVISOR`, `MAX_COMPACTION_MESSAGE_THRESHOLD` from `models.rs`. * Drop the `forced_config.message_threshold` tweak in the engine's capacity-guardrail forced-compaction path; that path now also bypasses the floor (`auto_floor_tokens = 0`) because we're at a hard ceiling and have to free budget regardless of cache cost. * Update production constructors (`main.rs`, `runtime_threads.rs`, `app.rs::compaction_config`) to drop the field. 
* Update tests: keep the floor + token-threshold assertions, delete the two tests that specifically validated message-count triggering, replace `should_compact_respects_message_threshold` with `message_count_no_longer_triggers_compaction` pinning the new contract. Verified locally: * `cargo fmt --all -- --check` clean. * `cargo clippy --workspace --all-targets --all-features --locked -- -D warnings` clean. * `cargo test --workspace --all-features --locked` — 2036 passed in TUI bin (2 ignored), all other crates green. * parity gates: snapshot, parity_protocol, parity_state — all pass. * `git diff --exit-code -- Cargo.lock` — clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/compaction.rs | 150 ++++++++++-------------------- crates/tui/src/core/engine.rs | 5 +- crates/tui/src/main.rs | 3 +- crates/tui/src/models.rs | 33 ------- crates/tui/src/runtime_threads.rs | 6 +- crates/tui/src/tui/app.rs | 6 +- 6 files changed, 57 insertions(+), 146 deletions(-) diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs index 0f3acb09..59ecc3c4 100644 --- a/crates/tui/src/compaction.rs +++ b/crates/tui/src/compaction.rs @@ -18,19 +18,25 @@ use crate::models::{ }; /// Configuration for conversation compaction behavior. +/// +/// v0.8.11 simplified this from the prior token-OR-message-count trigger +/// to a token-only trigger gated by an absolute floor. The +/// `message_threshold` field was removed: its only purpose was to fire +/// compaction on long sessions of small messages, which is exactly the +/// case where rewriting the V4 prefix cache is least valuable. Token +/// budget is the right signal; message count was a 128K-era heuristic. 
#[derive(Debug, Clone, PartialEq)] pub struct CompactionConfig { pub enabled: bool, pub token_threshold: usize, - pub message_threshold: usize, pub model: String, pub cache_summary: bool, /// Hard floor — `should_compact` returns `false` when total session - /// tokens fall below this number, regardless of `enabled`, - /// `token_threshold`, or `message_threshold`. Defaults to - /// [`MINIMUM_AUTO_COMPACTION_TOKENS`] (500K) for v0.8.11+. Tests that - /// want to exercise the older threshold/message-count logic at small - /// fixture sizes can set this to `0` to disable the floor. + /// tokens fall below this number, regardless of `enabled` or + /// `token_threshold`. Defaults to [`MINIMUM_AUTO_COMPACTION_TOKENS`] + /// (500K) for v0.8.11+. Tests that want to exercise the threshold + /// logic at small fixture sizes can set this to `0` to disable the + /// floor. pub auto_floor_tokens: usize, } @@ -50,7 +56,6 @@ impl Default for CompactionConfig { // default no longer lies. Real call sites override this via // `compaction_threshold_for_model_and_effort`. token_threshold: 800_000, - message_threshold: 50, model: DEFAULT_TEXT_MODEL.to_string(), cache_summary: true, auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, @@ -61,17 +66,15 @@ impl Default for CompactionConfig { /// Hard floor for automatic compaction in v0.8.11+. /// /// Below this token count, `should_compact` returns `false` regardless of -/// `enabled`, `token_threshold`, or `message_threshold`. The point of the -/// floor is V4 prefix-cache economics: compaction rewrites the stable -/// prefix, which destroys the KV cache. At low token counts the prefix -/// cache is healthy and compaction's cost (full re-prefill at miss prices) -/// dwarfs its benefit (a tiny budget reclaim). Above the floor compaction -/// can still be net-positive — cache is already pressured, the prefix has -/// drifted, and freeing budget matters. +/// `enabled` or `token_threshold`. 
The point of the floor is V4 prefix-cache +/// economics: compaction rewrites the stable prefix, which destroys the KV +/// cache. At low token counts the prefix cache is healthy and compaction's +/// cost (full re-prefill at miss prices) dwarfs its benefit (a tiny budget +/// reclaim). Above the floor compaction can still be net-positive — cache +/// is already pressured, the prefix has drifted, and freeing budget matters. /// -/// Manual `/compact` slash command and the model-callable `compact_now` -/// tool both bypass this floor with a deliberate refusal message — they -/// represent explicit agency rather than implicit policy. +/// Manual `/compact` slash command bypasses this floor with explicit user +/// agency. /// /// Constant rather than configurable for v0.8.11. If anyone needs to dial /// it (smaller models, opinionated workflows), we can add a setting later. @@ -645,7 +648,6 @@ pub fn should_compact( .iter() .map(|&idx| estimate_tokens_for_message(&messages[idx], false)) .sum(); - let pinned_count = plan.pinned_indices.len(); let token_estimate: usize = plan .summarize_indices @@ -656,21 +658,19 @@ pub fn should_compact( // Pinned messages consume part of the budget, so compact earlier when needed. let effective_token_threshold = config.token_threshold.saturating_sub(pinned_tokens); - let effective_message_threshold = config.message_threshold.saturating_sub(pinned_count); - // Always compact if we exceed the token threshold, even with few unpinned messages. - if token_estimate > effective_token_threshold && effective_token_threshold > 0 { - return true; + // Token-only trigger (v0.8.11): the prior message-count branch was a + // 128K-era heuristic that fired compaction on long chats of small + // messages — exactly the case where rewriting the V4 prefix cache is + // most wasteful. Token budget is the only signal that maps to actual + // model context pressure. 
+ if effective_token_threshold == 0 { + return message_count >= MIN_SUMMARIZE_MESSAGES; } - - let enough_unpinned = message_count >= MIN_SUMMARIZE_MESSAGES - || effective_token_threshold == 0 - || effective_message_threshold == 0; - if !enough_unpinned { + if message_count < MIN_SUMMARIZE_MESSAGES { return false; } - - token_estimate > effective_token_threshold || message_count > effective_message_threshold + token_estimate > effective_token_threshold } fn truncate_chars(text: &str, max_chars: usize) -> &str { @@ -1487,20 +1487,22 @@ mod tests { assert!(!should_compact(&messages, &config, None, None, None)); } + /// v0.8.11: message-count is no longer a compaction trigger. Long + /// chats of small messages stay uncompacted because rewriting the V4 + /// prefix cache for a tiny budget reclaim is net-negative. Only token + /// pressure (and the explicit `/compact` slash command) trigger + /// compaction. #[test] - fn should_compact_respects_message_threshold() { + fn message_count_no_longer_triggers_compaction() { let config = CompactionConfig { enabled: true, - token_threshold: 1_000_000, // Very high - message_threshold: 5, - // Disable the v0.8.11 500K floor so this test exercises the - // pure message-count threshold logic at small fixture sizes. + token_threshold: 1_000_000, auto_floor_tokens: 0, ..Default::default() }; - // Under threshold - let few_messages: Vec = (0..4) + // 200 tiny messages, well above the prior message threshold. 
+ let many_messages: Vec = (0..200) .map(|_| Message { role: "user".to_string(), content: vec![ContentBlock::Text { @@ -1509,19 +1511,9 @@ mod tests { }], }) .collect(); - assert!(!should_compact(&few_messages, &config, None, None, None)); - - // Over threshold - let many_messages: Vec = (0..10) - .map(|_| Message { - role: "user".to_string(), - content: vec![ContentBlock::Text { - text: "x".to_string(), - cache_control: None, - }], - }) - .collect(); - assert!(should_compact(&many_messages, &config, None, None, None)); + // Token total stays minuscule so the token threshold is not hit; + // without the prior message-count trigger, no compaction. + assert!(!should_compact(&many_messages, &config, None, None, None)); } #[test] @@ -1619,7 +1611,6 @@ mod tests { let config = CompactionConfig { enabled: true, token_threshold: 10, - message_threshold: 2, ..Default::default() }; @@ -1630,46 +1621,12 @@ mod tests { assert!(!should_compact(&messages, &config, None, None, None)); } - #[test] - fn should_compact_counts_only_unpinned_messages() { - let config = CompactionConfig { - enabled: true, - token_threshold: 1_000_000, - message_threshold: 5, - auto_floor_tokens: 0, - ..Default::default() - }; - - let mut messages: Vec = (0..7) - .map(|i| msg("user", &format!("noise message {i}"))) - .collect(); - messages.push(msg("user", "Focus on src/core/engine.rs")); - messages.extend((0..4).map(|i| msg("assistant", &format!("recent {i}")))); - - assert!(should_compact(&messages, &config, None, None, None)); - } - - #[test] - fn should_compact_when_pins_consume_budget() { - let config = CompactionConfig { - enabled: true, - token_threshold: 50, - message_threshold: 50, - auto_floor_tokens: 0, - ..Default::default() - }; - - let mut messages = vec![msg("user", "noise 0"), msg("assistant", "noise 1")]; - messages.extend((0..4).map(|_| { - msg( - "assistant", - &format!("{} src/core/engine.rs", "x".repeat(400)), - ) - })); - - // Pinned recent messages exceed the token budget, so 
unpinned noise should trigger compaction. - assert!(should_compact(&messages, &config, None, None, None)); - } + // v0.8.11: removed `should_compact_counts_only_unpinned_messages` and + // `should_compact_when_pins_consume_budget` — both tested the + // message-count compaction trigger that v0.8.11 deleted. The + // pinned-tokens accounting they exercised is still tested by + // `should_compact_ignores_fully_pinned_context` below; the rest of + // their setup has no contemporary contract to pin. #[test] fn enforce_tool_call_pairs_removes_orphaned_tool_call() { @@ -1925,8 +1882,7 @@ mod tests { fn test_should_compact_token_threshold_triggers() { let config = CompactionConfig { enabled: true, - token_threshold: 100, // Low threshold for testing - message_threshold: 1000, // High message threshold + token_threshold: 100, // Low threshold for testing auto_floor_tokens: 0, ..Default::default() }; @@ -1945,7 +1901,6 @@ mod tests { let config = CompactionConfig { enabled: true, token_threshold: 1000, - message_threshold: 1000, ..Default::default() }; @@ -1963,8 +1918,7 @@ mod tests { fn auto_compaction_floor_blocks_below_500k_even_when_threshold_says_yes() { let config = CompactionConfig { enabled: true, - token_threshold: 100, // would normally fire instantly - message_threshold: 1000, // not the trigger + token_threshold: 100, // would normally fire instantly // Use the production default explicitly so this test pins the // floor's contract rather than relying on `Default`. 
auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, @@ -1983,7 +1937,6 @@ mod tests { let config = CompactionConfig { enabled: true, token_threshold: 2_000_000, - message_threshold: 2_000, auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS, ..Default::default() }; @@ -2307,7 +2260,6 @@ mod tests { let _config = CompactionConfig { enabled: true, token_threshold: 1000, - message_threshold: 5, ..Default::default() }; @@ -2323,9 +2275,7 @@ mod tests { msg("assistant", "recent 2"), ]; - // Should compact because: - // - More than message_threshold (5) unpinned messages - // - src/main.rs mention pins message 0 + // src/main.rs mention should pin message 0 in the plan. let plan = plan_compaction( &messages, Some(&workspace), diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index e5ff3f01..2f7ef663 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -1183,7 +1183,10 @@ impl Engine { .token_threshold .min(target_budget.saturating_sub(1)) .max(1); - forced_config.message_threshold = forced_config.message_threshold.max(1); + // v0.8.11: forced compaction (capacity guardrail) bypasses the floor + // because we're at a hard ceiling and have to free budget regardless + // of cache cost. 
+ forced_config.auto_floor_tokens = 0; match compact_messages_safe( client, diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index a13a8392..445d8110 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -3711,7 +3711,7 @@ async fn run_exec_agent( use crate::core::engine::{EngineConfig, spawn_engine}; use crate::core::events::Event; use crate::core::ops::Op; - use crate::models::{compaction_message_threshold_for_model, compaction_threshold_for_model}; + use crate::models::compaction_threshold_for_model; use crate::tools::plan::new_shared_plan_state; use crate::tools::todo::new_shared_todo_list; use crate::tui::app::AppMode; @@ -3725,7 +3725,6 @@ async fn run_exec_agent( enabled: false, model: model.to_string(), token_threshold: compaction_threshold_for_model(model), - message_threshold: compaction_message_threshold_for_model(model), ..Default::default() }; diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs index 320d8305..8576cf8a 100644 --- a/crates/tui/src/models.rs +++ b/crates/tui/src/models.rs @@ -14,10 +14,7 @@ pub const DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS: u32 = 1_000_000; /// models resolve to their own scaled value via /// [`compaction_threshold_for_model`] (#664). pub const DEFAULT_COMPACTION_TOKEN_THRESHOLD: usize = 102_400; -pub const DEFAULT_COMPACTION_MESSAGE_THRESHOLD: usize = 50; const COMPACTION_THRESHOLD_PERCENT: u32 = 80; -const COMPACTION_MESSAGE_DIVISOR: u32 = 500; -const MAX_COMPACTION_MESSAGE_THRESHOLD: usize = 2_000; // === Core Message Types === @@ -298,21 +295,6 @@ pub fn compaction_threshold_for_model_and_effort( compaction_threshold_for_model(model) } -/// Derive a compaction message-count threshold from model context window. 
-#[must_use] -pub fn compaction_message_threshold_for_model(model: &str) -> usize { - let Some(window) = context_window_for_model(model) else { - return DEFAULT_COMPACTION_MESSAGE_THRESHOLD; - }; - - let scaled = usize::try_from(window / COMPACTION_MESSAGE_DIVISOR) - .unwrap_or(DEFAULT_COMPACTION_MESSAGE_THRESHOLD); - scaled.clamp( - DEFAULT_COMPACTION_MESSAGE_THRESHOLD, - MAX_COMPACTION_MESSAGE_THRESHOLD, - ) -} - // === Streaming Structures === #[allow(dead_code)] @@ -469,24 +451,9 @@ mod tests { assert_eq!(compaction_threshold_for_model("unknown-model"), 102_400); } - #[test] - fn compaction_message_threshold_scales_with_context_window() { - assert_eq!( - compaction_message_threshold_for_model("deepseek-v3.2-128k"), - 256 - ); - assert_eq!(compaction_message_threshold_for_model("unknown-model"), 50); - // 200k / 500 = 400, within the 2k cap. - assert_eq!(compaction_message_threshold_for_model("claude-3"), 400); - } - #[test] fn compaction_scales_for_deepseek_v4_1m_context() { assert_eq!(compaction_threshold_for_model("deepseek-v4-pro"), 800_000); - assert_eq!( - compaction_message_threshold_for_model("deepseek-v4-pro"), - 2_000 - ); } #[test] diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 33772552..1fb28cd2 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -23,10 +23,7 @@ use crate::core::coherence::CoherenceState; use crate::core::engine::{EngineConfig, EngineHandle, spawn_engine}; use crate::core::events::{Event as EngineEvent, TurnOutcomeStatus}; use crate::core::ops::Op; -use crate::models::{ - ContentBlock, Message, SystemPrompt, Usage, compaction_message_threshold_for_model, - compaction_threshold_for_model, -}; +use crate::models::{ContentBlock, Message, SystemPrompt, Usage, compaction_threshold_for_model}; use crate::tools::plan::new_shared_plan_state; use crate::tools::subagent::SubAgentStatus; use crate::tools::todo::new_shared_todo_list; @@ -1765,7 +1762,6 @@ impl 
RuntimeThreadManager { enabled: false, model: thread.model.clone(), token_threshold: compaction_threshold_for_model(&thread.model), - message_threshold: compaction_message_threshold_for_model(&thread.model), ..Default::default() }; let network_policy = self.config.network.clone().map(|toml_cfg| { diff --git a/crates/tui/src/tui/app.rs b/crates/tui/src/tui/app.rs index f1d274b8..29f52073 100644 --- a/crates/tui/src/tui/app.rs +++ b/crates/tui/src/tui/app.rs @@ -15,10 +15,7 @@ use crate::core::coherence::CoherenceState; use crate::cycle_manager::{CycleBriefing, CycleConfig}; use crate::hooks::{HookContext, HookEvent, HookExecutor, HookResult}; use crate::localization::{Locale, MessageId, resolve_locale, tr}; -use crate::models::{ - Message, SystemPrompt, compaction_message_threshold_for_model, - compaction_threshold_for_model_and_effort, -}; +use crate::models::{Message, SystemPrompt, compaction_threshold_for_model_and_effort}; use crate::palette::{self, UiTheme}; use crate::session_manager::SessionContextReference; use crate::settings::Settings; @@ -3169,7 +3166,6 @@ impl App { CompactionConfig { enabled: self.auto_compact, token_threshold: self.compact_threshold, - message_threshold: compaction_message_threshold_for_model(&self.model), model: self.model.clone(), ..Default::default() } From 229f02ea2caca67db8fa0bcd694839fed7afbe95 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:19:34 -0500 Subject: [PATCH 08/11] feat(npm): install.js network resilience for slow / firewalled networks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A community user from China reported `npm install deepseek-tui` took 18 minutes through a CN npm mirror. The bottleneck is the GitHub Releases binary fetch (~46 MB across two binaries), not the npm tarball (which is 6.9 kB). The CN mirror does NOT proxy GitHub release downloads, so any user behind a slow or lossy connection is hitting the GitHub fetch directly with no resilience. 
Four behaviors added to `npm/deepseek-tui/scripts/install.js`: 1. **Retry with exponential backoff.** Up to 5 attempts on network errors (ECONNRESET, ECONNREFUSED, ETIMEDOUT, EAI_AGAIN, network/host unreachable, EPIPE, ECONNABORTED) and 5xx upstream responses. Backoff `1s, 2s, 4s, 8s, 16s` with ±20% jitter. 4xx and checksum-mismatch are flagged non-retryable so we don't thrash on permanent failures. Final error includes the underlying message and the attempt count. 2. **Per-attempt total timeout + stall detector.** Total timeout defaults to 5 minutes per attempt (`DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS`, alias `DEEPSEEK_DOWNLOAD_TIMEOUT_MS`). A stall detector aborts the request when no bytes arrive for 30 s (`DEEPSEEK_TUI_DOWNLOAD_STALL_MS`, alias `DEEPSEEK_DOWNLOAD_STALL_MS`) so a hung connection doesn't waste the whole timeout. Both budgets are surfaced in the error so the user can dial them up if they're on a slow pipe. 3. **HTTPS_PROXY / HTTP_PROXY support — pure Node, no new dependencies.** Detects `HTTPS_PROXY` / `HTTP_PROXY` (and the lowercase variants) and routes through the proxy via CONNECT tunneling. `NO_PROXY` exclusion list honored, with `*` and dotted-suffix matching. Proxy auth via standard `user:pass@` URL form is passed through as `Proxy-Authorization: Basic ...`. Pure-Node implementation using `net` + `tls` + `http` + `https` builtins — no `https-proxy-agent` dependency added. 4. **Download progress indicator.** Writes to stderr every ~1 MB or every 2 s in TTY mode using `\r` to overwrite a single line. Non-TTY mode (CI, piped) emits one line per 5 MB so logs stay reasonable. Suppressed when `DEEPSEEK_TUI_QUIET_INSTALL=1` or when `npm_config_loglevel` is `silent` or `error`. Falls back to `N MB downloaded` when the response has no `Content-Length`. Public API unchanged: existing callers of `getBinaryPath` and `run` keep working identically when no new env vars are set. The escape hatch `DEEPSEEK_TUI_DISABLE_INSTALL=1` still exits cleanly. 
Verified locally: * `node -c install.js` and module-load syntax checks. * `DEEPSEEK_TUI_FORCE_DOWNLOAD=1 DEEPSEEK_TUI_VERSION=0.8.10 node install.js` — real GitHub Releases download succeeded with visible progress, both binaries landed. * `HTTPS_PROXY=http://invalid.proxy.local:9999 ... node install.js` — proxy path exercised, fails cleanly with the bad host named in the error message after retries exhausted. Co-Authored-By: Claude Opus 4.7 (1M context) --- npm/deepseek-tui/scripts/install.js | 776 ++++++++++++++++++++++++++-- 1 file changed, 740 insertions(+), 36 deletions(-) diff --git a/npm/deepseek-tui/scripts/install.js b/npm/deepseek-tui/scripts/install.js index 3f23703b..52e0219e 100644 --- a/npm/deepseek-tui/scripts/install.js +++ b/npm/deepseek-tui/scripts/install.js @@ -1,10 +1,12 @@ const fs = require("fs"); const https = require("https"); const http = require("http"); +const net = require("net"); +const tls = require("tls"); const crypto = require("crypto"); +const { URL } = require("url"); const { mkdir, chmod, stat, rename, readFile, unlink, writeFile } = fs.promises; const { createWriteStream } = fs; -const { pipeline } = require("stream/promises"); const path = require("path"); const { @@ -16,6 +18,46 @@ const { const { preflightGlibc } = require("./preflight-glibc"); const pkg = require("../package.json"); +const DEFAULT_TIMEOUT_MS = 300_000; // 5 minutes per attempt +const DEFAULT_STALL_MS = 30_000; // abort if no bytes for 30s +const MAX_ATTEMPTS = 5; +const BASE_BACKOFF_MS = 1_000; + +const RETRYABLE_NET_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ETIMEDOUT", + "EAI_AGAIN", + "ENETUNREACH", + "EHOSTUNREACH", + "EPIPE", + "ECONNABORTED", +]); + +class NonRetryableError extends Error { + constructor(message) { + super(message); + this.name = "NonRetryableError"; + this.nonRetryable = true; + } +} + +class HttpStatusError extends Error { + constructor(status, url) { + super(`Request failed with status ${status}: ${url}`); + this.name = 
"HttpStatusError"; + this.status = status; + } +} + +class DownloadTimeoutError extends Error { + constructor(message) { + super(message); + this.name = "DownloadTimeoutError"; + this.code = "EDOWNLOADTIMEOUT"; + } +} + function resolvePackageVersion() { const configuredVersion = process.env.DEEPSEEK_TUI_VERSION || @@ -44,45 +86,705 @@ function binaryPaths() { }; } -async function httpGet(url) { - const client = url.startsWith("https:") ? https : http; - const response = await new Promise((resolve, reject) => { - client.get(url, (res) => { - const status = res.statusCode || 0; - if (status >= 300 && status < 400 && res.headers.location) { - resolve({ redirect: res.headers.location, response: null }); - return; - } - if (status !== 200) { - reject(new Error(`Request failed with status ${status}: ${url}`)); - return; - } - resolve({ redirect: null, response: res }); - }).on("error", reject); - }); - return response; +// ──────────────────────────────────────────────────────────────────────────── +// Logging / progress +// ──────────────────────────────────────────────────────────────────────────── + +function isQuietInstall() { + if (process.env.DEEPSEEK_TUI_QUIET_INSTALL === "1") { + return true; + } + const level = (process.env.npm_config_loglevel || "").toLowerCase(); + return level === "silent" || level === "error"; } -async function download(url, destination) { - const resolved = await httpGet(url); - if (resolved.redirect) { - return download(resolved.redirect, destination); +function logInfo(message) { + if (isQuietInstall()) { + return; } + process.stderr.write(`deepseek-tui: ${message}\n`); +} + +function envInt(name, fallback) { + const raw = process.env[name]; + if (!raw) { + return fallback; + } + const parsed = Number.parseInt(String(raw).trim(), 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + return fallback; + } + return parsed; +} + +function downloadTimeoutMs() { + return envInt( + "DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS", + 
envInt("DEEPSEEK_DOWNLOAD_TIMEOUT_MS", DEFAULT_TIMEOUT_MS), + ); +} + +function downloadStallMs() { + return envInt( + "DEEPSEEK_TUI_DOWNLOAD_STALL_MS", + envInt("DEEPSEEK_DOWNLOAD_STALL_MS", DEFAULT_STALL_MS), + ); +} + +function formatMb(bytes) { + return (bytes / (1024 * 1024)).toFixed(0); +} + +function createProgressReporter(assetName, totalBytes) { + if (isQuietInstall()) { + return { onChunk: () => {}, finish: () => {} }; + } + const isTty = !!process.stderr.isTTY; + const interactive = isTty; + const tickBytes = interactive ? 1 * 1024 * 1024 : 5 * 1024 * 1024; + const tickMs = 2_000; + + let received = 0; + let lastBytesPrinted = 0; + let lastTimePrinted = 0; + let everPrinted = false; + + const render = (final) => { + if (totalBytes && totalBytes > 0) { + const pct = Math.min(100, Math.round((received / totalBytes) * 100)); + const line = `deepseek-tui: downloading ${assetName}: ${formatMb(received)} / ${formatMb(totalBytes)} MB (${pct}%)`; + if (interactive) { + process.stderr.write(`${line}\r`); + } else { + process.stderr.write(`${line}\n`); + } + } else { + const line = `deepseek-tui: downloading ${assetName}: ${formatMb(received)} MB downloaded`; + if (interactive) { + process.stderr.write(`${line}\r`); + } else { + process.stderr.write(`${line}\n`); + } + } + everPrinted = true; + lastBytesPrinted = received; + lastTimePrinted = Date.now(); + }; + + return { + onChunk(chunkLen) { + received += chunkLen; + const now = Date.now(); + if ( + received - lastBytesPrinted >= tickBytes || + (interactive && now - lastTimePrinted >= tickMs) + ) { + render(false); + } + }, + finish() { + // Final line — always render once. + render(true); + if (interactive && everPrinted) { + // Move past the carriage-return line and emit a "done" footer. + process.stderr.write("\n"); + } + process.stderr.write(`deepseek-tui: ${assetName} ... 
done.\n`); + }, + }; +} + +// ──────────────────────────────────────────────────────────────────────────── +// Proxy support (HTTPS_PROXY / HTTP_PROXY / NO_PROXY) — pure Node, CONNECT +// tunnel + TLS upgrade for HTTPS targets. +// ──────────────────────────────────────────────────────────────────────────── + +function getProxyUrl(targetUrl) { + const isHttps = targetUrl.protocol === "https:"; + const candidates = isHttps + ? ["HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"] + : ["HTTP_PROXY", "http_proxy"]; + for (const name of candidates) { + const raw = process.env[name]; + if (raw && String(raw).trim() !== "") { + return String(raw).trim(); + } + } + return null; +} + +function shouldBypassProxy(host) { + const raw = process.env.NO_PROXY || process.env.no_proxy; + if (!raw) { + return false; + } + const lower = String(host).toLowerCase(); + for (const part of String(raw).split(",")) { + const entry = part.trim().toLowerCase(); + if (!entry) { + continue; + } + if (entry === "*") { + return true; + } + // Strip leading dot and any explicit port. + const stripped = entry.replace(/^\./, "").replace(/:.*$/, ""); + if (!stripped) { + continue; + } + if (lower === stripped || lower.endsWith(`.${stripped}`)) { + return true; + } + } + return false; +} + +function parseProxy(proxyStr) { + // Accept "http://user:pass@host:port" and bare "host:port". + const normalized = /^[a-z][a-z0-9+\-.]*:\/\//i.test(proxyStr) + ? proxyStr + : `http://${proxyStr}`; + const u = new URL(normalized); + const port = u.port + ? Number.parseInt(u.port, 10) + : u.protocol === "https:" + ? 443 + : 80; + let auth = null; + if (u.username) { + const user = decodeURIComponent(u.username); + const pass = u.password ? 
decodeURIComponent(u.password) : ""; + auth = Buffer.from(`${user}:${pass}`).toString("base64"); + } + return { + protocol: u.protocol, + host: u.hostname, + port, + auth, + raw: proxyStr, + }; +} + +function connectThroughProxy(proxy, targetHost, targetPort, timeoutMs) { + return new Promise((resolve, reject) => { + const socket = net.connect({ host: proxy.host, port: proxy.port }); + let settled = false; + const fail = (err) => { + if (settled) return; + settled = true; + try { + socket.destroy(); + } catch { + // ignore + } + reject(err); + }; + + const timer = timeoutMs > 0 + ? setTimeout(() => fail(new DownloadTimeoutError( + `proxy CONNECT to ${proxy.host}:${proxy.port} timed out after ${timeoutMs} ms`, + )), timeoutMs) + : null; + + socket.once("error", (err) => { + if (timer) clearTimeout(timer); + // Surface proxy host so the user can fix it. + const wrapped = new Error( + `proxy connection failed (${proxy.host}:${proxy.port}): ${err.message}`, + ); + wrapped.code = err.code; + fail(wrapped); + }); + + socket.once("connect", () => { + const lines = [ + `CONNECT ${targetHost}:${targetPort} HTTP/1.1`, + `Host: ${targetHost}:${targetPort}`, + "User-Agent: deepseek-tui-installer", + "Proxy-Connection: keep-alive", + ]; + if (proxy.auth) { + lines.push(`Proxy-Authorization: Basic ${proxy.auth}`); + } + const req = `${lines.join("\r\n")}\r\n\r\n`; + + let buf = Buffer.alloc(0); + const onData = (chunk) => { + buf = Buffer.concat([buf, chunk]); + const idx = buf.indexOf("\r\n\r\n"); + if (idx === -1) { + if (buf.length > 16 * 1024) { + socket.removeListener("data", onData); + fail(new Error( + `proxy ${proxy.host}:${proxy.port} returned an oversized response header`, + )); + } + return; + } + socket.removeListener("data", onData); + const head = buf.slice(0, idx).toString("utf8"); + const firstLine = head.split(/\r?\n/, 1)[0] || ""; + const m = firstLine.match(/^HTTP\/\d\.\d\s+(\d{3})/); + if (!m) { + fail(new Error(`proxy ${proxy.host}:${proxy.port} returned 
invalid CONNECT reply: ${firstLine}`)); + return; + } + const code = Number.parseInt(m[1], 10); + if (code !== 200) { + fail(new Error( + `proxy ${proxy.host}:${proxy.port} refused CONNECT to ${targetHost}:${targetPort}: HTTP ${code}`, + )); + return; + } + if (timer) clearTimeout(timer); + if (settled) return; + settled = true; + // Any bytes past the header belong to the tunneled stream — but in + // practice CONNECT 200 has no body; if it did, we'd lose those bytes + // here. Keep it simple: trust well-behaved proxies. + resolve(socket); + }; + socket.on("data", onData); + socket.write(req, "utf8"); + }); + }); +} + +// ──────────────────────────────────────────────────────────────────────────── +// HTTP request with timeout, stall detection, and proxy support. +// ──────────────────────────────────────────────────────────────────────────── + +function httpRequest(rawUrl, opts = {}) { + const totalTimeoutMs = opts.totalTimeoutMs ?? downloadTimeoutMs(); + const stallMs = opts.stallMs ?? downloadStallMs(); + + return new Promise((resolve, reject) => { + let url; + try { + url = new URL(rawUrl); + } catch (err) { + reject(new NonRetryableError(`Invalid URL: ${rawUrl} (${err.message})`)); + return; + } + if (url.protocol !== "https:" && url.protocol !== "http:") { + reject(new NonRetryableError(`Unsupported protocol: ${url.protocol}`)); + return; + } + + const proxyStr = !shouldBypassProxy(url.hostname) ? getProxyUrl(url) : null; + const isHttps = url.protocol === "https:"; + const port = url.port + ? Number.parseInt(url.port, 10) + : isHttps + ? 
443 + : 80; + + let totalTimer = null; + let stallTimer = null; + let settled = false; + let req = null; + let res = null; + + const cleanup = () => { + if (totalTimer) { + clearTimeout(totalTimer); + totalTimer = null; + } + if (stallTimer) { + clearTimeout(stallTimer); + stallTimer = null; + } + }; + + const fail = (err) => { + if (settled) return; + settled = true; + cleanup(); + try { + if (req && !req.destroyed) req.destroy(); + } catch { + // ignore + } + try { + if (res && !res.destroyed) res.destroy(); + } catch { + // ignore + } + reject(err); + }; + + if (totalTimeoutMs > 0) { + totalTimer = setTimeout(() => { + fail(new DownloadTimeoutError( + `download exceeded total timeout of ${totalTimeoutMs} ms ` + + `(set DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS to raise it; current stall budget is ${stallMs} ms)`, + )); + }, totalTimeoutMs); + } + + const armStallTimer = () => { + if (stallMs <= 0) return; + if (stallTimer) clearTimeout(stallTimer); + stallTimer = setTimeout(() => { + fail(new DownloadTimeoutError( + `download stalled — no bytes received for ${stallMs} ms ` + + `(set DEEPSEEK_TUI_DOWNLOAD_STALL_MS to raise it; total budget is ${totalTimeoutMs} ms)`, + )); + }, stallMs); + }; + + const launch = (socket) => { + const reqOptions = { + method: "GET", + host: url.hostname, + port, + path: `${url.pathname}${url.search || ""}`, + headers: { + Host: url.host, + "User-Agent": "deepseek-tui-installer", + Accept: "*/*", + Connection: "close", + }, + }; + if (socket) { + reqOptions.createConnection = () => socket; + if (isHttps) { + // Wrap raw TCP socket from CONNECT in TLS. + const tlsSocket = tls.connect({ + socket, + servername: url.hostname, + ALPNProtocols: ["http/1.1"], + }); + tlsSocket.once("error", (err) => fail(err)); + reqOptions.createConnection = () => tlsSocket; + } + } + const client = isHttps ? 
https : http; + try { + req = client.request(reqOptions, (response) => { + res = response; + armStallTimer(); + response.on("data", () => { + armStallTimer(); + }); + response.on("end", () => { + cleanup(); + }); + response.on("error", (err) => fail(err)); + + const status = response.statusCode || 0; + if (status >= 300 && status < 400 && response.headers.location) { + cleanup(); + settled = true; + response.resume(); + resolve({ redirect: response.headers.location, response: null }); + return; + } + if (status < 200 || status >= 300) { + const err = new HttpStatusError(status, rawUrl); + // 4xx: non-retryable; 5xx: retryable. + if (status >= 400 && status < 500) { + err.nonRetryable = true; + } + fail(err); + return; + } + if (settled) return; + settled = true; + // Hand the live response stream to the caller. + resolve({ redirect: null, response }); + }); + req.once("error", (err) => fail(err)); + req.once("socket", (s) => { + // Belt-and-suspenders: surface socket-level errors quickly. + s.once("error", (err) => fail(err)); + }); + req.end(); + } catch (err) { + fail(err); + } + }; + + if (proxyStr) { + let proxy; + try { + proxy = parseProxy(proxyStr); + } catch (err) { + fail(new NonRetryableError( + `Invalid proxy URL "${proxyStr}": ${err.message}`, + )); + return; + } + if (!isHttps) { + // Plain HTTP through proxy — send absolute URI, no CONNECT. + const client = http; + try { + req = client.request( + { + host: proxy.host, + port: proxy.port, + method: "GET", + path: rawUrl, + headers: { + Host: url.host, + "User-Agent": "deepseek-tui-installer", + Accept: "*/*", + Connection: "close", + ...(proxy.auth ? 
{ "Proxy-Authorization": `Basic ${proxy.auth}` } : {}), + }, + }, + (response) => { + res = response; + armStallTimer(); + response.on("data", () => armStallTimer()); + response.on("end", () => cleanup()); + response.on("error", (err) => fail(err)); + const status = response.statusCode || 0; + if (status >= 300 && status < 400 && response.headers.location) { + cleanup(); + settled = true; + response.resume(); + resolve({ redirect: response.headers.location, response: null }); + return; + } + if (status < 200 || status >= 300) { + const err = new HttpStatusError(status, rawUrl); + if (status >= 400 && status < 500) err.nonRetryable = true; + fail(err); + return; + } + if (settled) return; + settled = true; + resolve({ redirect: null, response }); + }, + ); + req.once("error", (err) => fail(err)); + req.end(); + } catch (err) { + fail(err); + } + return; + } + + // HTTPS through proxy: CONNECT tunnel + TLS upgrade. + connectThroughProxy(proxy, url.hostname, port, Math.max(stallMs, 5_000)) + .then((tcpSocket) => { + if (settled) { + try { tcpSocket.destroy(); } catch { /* ignore */ } + return; + } + const tlsSocket = tls.connect({ + socket: tcpSocket, + servername: url.hostname, + ALPNProtocols: ["http/1.1"], + }); + tlsSocket.once("error", (err) => fail(err)); + tlsSocket.once("secureConnect", () => { + if (settled) { + try { tlsSocket.destroy(); } catch { /* ignore */ } + return; + } + const reqOptions = { + method: "GET", + createConnection: () => tlsSocket, + path: `${url.pathname}${url.search || ""}`, + headers: { + Host: url.host, + "User-Agent": "deepseek-tui-installer", + Accept: "*/*", + Connection: "close", + }, + }; + try { + req = https.request(reqOptions, (response) => { + res = response; + armStallTimer(); + response.on("data", () => armStallTimer()); + response.on("end", () => cleanup()); + response.on("error", (err) => fail(err)); + const status = response.statusCode || 0; + if (status >= 300 && status < 400 && response.headers.location) { + cleanup(); 
+ settled = true; + response.resume(); + resolve({ redirect: response.headers.location, response: null }); + return; + } + if (status < 200 || status >= 300) { + const err = new HttpStatusError(status, rawUrl); + if (status >= 400 && status < 500) err.nonRetryable = true; + fail(err); + return; + } + if (settled) return; + settled = true; + resolve({ redirect: null, response }); + }); + req.once("error", (err) => fail(err)); + req.end(); + } catch (err) { + fail(err); + } + }); + }) + .catch((err) => fail(err)); + return; + } + + // No proxy — direct connection. + launch(null); + }); +} + +// ──────────────────────────────────────────────────────────────────────────── +// Retry wrapper +// ──────────────────────────────────────────────────────────────────────────── + +function isRetryable(err) { + if (!err) return false; + if (err.nonRetryable) return false; + if (err instanceof NonRetryableError) return false; + if (err instanceof DownloadTimeoutError) return true; + if (err instanceof HttpStatusError) { + return err.status >= 500; + } + if (err.code && RETRYABLE_NET_CODES.has(err.code)) return true; + // Network-flavored messages we may see without a code. + const msg = String(err.message || "").toLowerCase(); + if (msg.includes("network") && msg.includes("unreachable")) return true; + if (msg.includes("socket hang up")) return true; + if (msg.includes("aborted")) return true; + return false; +} + +function backoffDelay(attempt) { + // attempt is 1-indexed; first retry waits ~1s. 
+ const base = BASE_BACKOFF_MS * 2 ** (attempt - 1); + const jitter = (Math.random() * 0.4 - 0.2) * base; // ±20% + return Math.max(0, Math.round(base + jitter)); +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function withRetry(label, fn) { + let lastErr; + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + try { + return await fn(attempt); + } catch (err) { + lastErr = err; + if (!isRetryable(err) || attempt === MAX_ATTEMPTS) { + break; + } + const wait = backoffDelay(attempt); + logInfo( + `${label} failed (attempt ${attempt}/${MAX_ATTEMPTS}): ${err.message}; retrying in ${wait} ms`, + ); + await sleep(wait); + } + } + const msg = lastErr && lastErr.message ? lastErr.message : String(lastErr); + const wrapped = new Error( + `${label} failed after ${MAX_ATTEMPTS} attempt(s): ${msg}`, + ); + if (lastErr && lastErr.stack) { + wrapped.cause = lastErr; + } + throw wrapped; +} + +// ──────────────────────────────────────────────────────────────────────────── +// Public download primitives (now retry + progress aware) +// ──────────────────────────────────────────────────────────────────────────── + +async function followRedirects(url, opts) { + const maxRedirects = 10; + let current = url; + for (let hop = 0; hop < maxRedirects; hop++) { + const result = await httpRequest(current, opts); + if (result.redirect) { + try { + current = new URL(result.redirect, current).toString(); + } catch { + current = result.redirect; + } + continue; + } + return result; + } + throw new NonRetryableError(`too many redirects starting at ${url}`); +} + +function streamToFile(response, destination, progress) { + return new Promise((resolve, reject) => { + const sink = createWriteStream(destination); + let done = false; + const finish = (err) => { + if (done) return; + done = true; + if (err) { + sink.destroy(); + reject(err); + } else { + resolve(); + } + }; + response.on("data", (chunk) => { + if (progress) 
progress.onChunk(chunk.length); + }); + response.on("error", (err) => finish(err)); + sink.on("error", (err) => finish(err)); + sink.on("finish", () => finish(null)); + response.pipe(sink); + }); +} + +async function download(url, destination, options = {}) { await mkdir(path.dirname(destination), { recursive: true }); - await pipeline(resolved.response, createWriteStream(destination)); + const assetName = options.assetName || path.basename(destination); + await withRetry(`download ${assetName}`, async (attempt) => { + const result = await followRedirects(url, { + totalTimeoutMs: downloadTimeoutMs(), + stallMs: downloadStallMs(), + }); + const response = result.response; + const lenHeader = response.headers["content-length"]; + const total = lenHeader ? Number.parseInt(lenHeader, 10) : 0; + const progress = createProgressReporter(assetName, Number.isFinite(total) ? total : 0); + if (attempt > 1) { + logInfo(`retry attempt ${attempt}/${MAX_ATTEMPTS} for ${assetName}`); + } + try { + await streamToFile(response, destination, progress); + } catch (err) { + // Ensure we don't leave a partial file confusing future attempts. 
+ try { + await unlink(destination); + } catch { + // ignore + } + throw err; + } + progress.finish(); + }); } async function downloadText(url) { - const resolved = await httpGet(url); - if (resolved.redirect) { - return downloadText(resolved.redirect); - } - const chunks = []; - resolved.response.setEncoding("utf8"); - for await (const chunk of resolved.response) { - chunks.push(chunk); - } - return chunks.join(""); + return withRetry(`fetch ${url}`, async () => { + const result = await followRedirects(url, { + totalTimeoutMs: downloadTimeoutMs(), + stallMs: downloadStallMs(), + }); + const response = result.response; + const chunks = []; + response.setEncoding("utf8"); + for await (const chunk of response) { + chunks.push(chunk); + } + return chunks.join(""); + }); } async function readLocalVersion(file) { @@ -122,11 +824,13 @@ async function sha256File(filePath) { async function verifyChecksum(filePath, assetName, checksums) { const expected = checksums.get(assetName); if (!expected) { - throw new Error(`Checksum manifest is missing ${assetName}`); + throw new NonRetryableError(`Checksum manifest is missing ${assetName}`); } const actual = await sha256File(filePath); if (actual !== expected) { - throw new Error( + // Bytes are corrupted; another fetch is unlikely to help without a fix + // upstream. Mark non-retryable. 
+ throw new NonRetryableError( `Checksum mismatch for ${assetName}: expected ${expected}, got ${actual}`, ); } @@ -152,7 +856,7 @@ async function ensureBinary(targetPath, assetName, version, repo, getChecksums) const checksums = await getChecksums(); const url = releaseAssetUrl(assetName, version, repo); const destination = `${targetPath}.${process.pid}.${Date.now()}.download`; - await download(url, destination); + await download(url, destination, { assetName }); try { await verifyChecksum(destination, assetName, checksums); preflightGlibc(destination); From a08c8915429a09728f145230900721c41f36be8b Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:20:07 -0500 Subject: [PATCH 09/11] docs(changelog): cover message_threshold removal + install.js resilience Folds two follow-on changes into the 0.8.11 entry: * The token-only compaction trigger (drops the 128K-era message-count heuristic). * The npm `install.js` network-resilience cluster (retry + timeout + proxy + progress). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65a92712..49f5742f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,11 +18,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **V4-scale automatic compaction defaults** — automatic compaction keeps a 500K-token hard floor and the fallback compaction threshold now reflects the V4-scale late-trigger policy instead of the old 50K-era default. +- **Token-only compaction trigger** — the message-count compaction trigger + was a 128K-era heuristic that fired on long sessions of small messages + — exactly the case where rewriting V4's prefix cache is most wasteful. + Removed `CompactionConfig::message_threshold` and the message-count + branch in `should_compact`; token budget is now the sole automatic + trigger (gated by the 500K floor). Manual `/compact` is unchanged. 
### Fixed - **Legacy 128K context naming** — the 128K fallback is now named and documented as legacy DeepSeek-only behavior, reducing ambiguity with the 1M-token DeepSeek V4 defaults. +- **`npm install` resilience for slow / firewalled networks** — the + postinstall binary fetch from GitHub Releases now retries on transient + errors (5 attempts, 1-8 s exponential backoff with jitter), enforces a + per-attempt timeout (default 5 min, configurable via + `DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS`) plus a 30 s stall detector, honors + `HTTPS_PROXY` / `HTTP_PROXY` / `NO_PROXY` env vars (pure-Node CONNECT + tunneling, no new dependencies), and prints a download-progress line + to stderr so users know it isn't hung. Suppressible with + `DEEPSEEK_TUI_QUIET_INSTALL=1`. Reported by a community user from China + whose install through a CN npm mirror took 18 minutes — the bottleneck + was the GitHub fetch, which CN npm mirrors do not proxy. ## [0.8.10] - 2026-05-04 From 76dd924c7fcb791f4d48b1acac509ffaa566c205 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:27:11 -0500 Subject: [PATCH 10/11] fix(engine): turn_meta must skip tool-result messages (HTTP 400 fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live-test repro: typing a single user message in the TUI triggered a tool call (read_file Cargo.toml), and the *next* request to DeepSeek's API returned HTTP 400: "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. (insufficient tool messages following tool_calls message)" Root cause: `messages_with_turn_metadata` walked the message list from the tail and prepended a `<turn_meta>` Text block to the *last* message with role="user". But tool-result messages also use role="user" internally (they serialize to role="tool" on the wire).
Inserting a Text content block at index 0 of a tool-result message changed the shape from `[ToolResult(...)]` to `[Text("turn_meta..."), ToolResult(...)]`, which on the wire becomes a role="user" message with text instead of the role="tool" message the API needs to satisfy the assistant's prior tool_call. Hence the 400. The fix: * Restrict the injection target to messages that have at least one Text content block AND no ToolResult blocks. This identifies actual user-typed messages and skips tool-result envelopes. * When the trailing slice has no eligible user message (e.g. mid-turn when a tool result is the most recent message), skip injection entirely. The working_set will surface again on the next genuine user prompt; we don't retroactively prepend onto an earlier user message because that would also confuse the API's tool-call continuity checks. Two regression tests pin the contract: * `turn_metadata_skips_tool_result_messages` — assistant tool_call + tool_result + earlier user message: only the user message gets the prefix, the tool_result message stays a single-block ToolResult. * `turn_metadata_skips_when_only_tool_results_trail` — the corner case where the trailing user-role message is solely a tool result (no real user message in the slice): no injection happens, the message returns unchanged. Verified locally: * 2038 tests passed in TUI bin (2 ignored, was 2036 — these are the +2 new regressions). * `cargo fmt`, `cargo clippy --locked -D warnings`, parity gates all clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/core/engine/tests.rs | 126 ++++++++++++++++++++++++ crates/tui/src/core/engine/turn_loop.rs | 26 ++++- 2 files changed, 147 insertions(+), 5 deletions(-) diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index 9c28826d..00a82681 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -567,6 +567,132 @@ fn working_set_reaches_model_as_turn_metadata() { assert!(text.contains("src/lib.rs")); } +/// v0.8.11 regression: tool-result messages serialize to role="tool" on +/// the wire but are stored as role="user" internally. Prepending +/// `<turn_meta>` text onto a tool-result message broke the +/// assistant→tool_result invariant and caused HTTP 400 from DeepSeek's +/// API ("insufficient tool messages following tool_calls"). The fix: +/// inject only into messages that have a Text content block and no +/// ToolResult blocks; mid-turn (tool-result is the trailing user +/// message) the injection skips. +#[test] +fn turn_metadata_skips_tool_result_messages() { + let tmp = tempdir().expect("tempdir"); + fs::create_dir_all(tmp.path().join("src")).expect("mkdir"); + fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write"); + + let config = EngineConfig { + workspace: tmp.path().to_path_buf(), + ..Default::default() + }; + let (mut engine, _handle) = Engine::new(config, &Config::default()); + engine + .session + .working_set + .observe_user_message("inspect src/lib.rs", tmp.path()); + + // Real user message — should be eligible for injection. + engine.session.add_message(Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "inspect src/lib.rs".to_string(), + cache_control: None, + }], + }); + // Assistant tool-call.
+ engine.session.add_message(Message { + role: "assistant".to_string(), + content: vec![ContentBlock::ToolUse { + id: "call_42".to_string(), + name: "read_file".to_string(), + input: serde_json::json!({"path": "src/lib.rs"}), + caller: None, + }], + }); + // Tool result, stored as role="user" internally. + engine.session.add_message(Message { + role: "user".to_string(), + content: vec![ContentBlock::ToolResult { + tool_use_id: "call_42".to_string(), + content: "pub fn sample() {}".to_string(), + is_error: None, + content_blocks: None, + }], + }); + + let messages = engine.messages_with_turn_metadata(); + + // The trailing message is the tool result and MUST be untouched — + // no Text block sneaking in front of the ToolResult block. + let trailing = messages.last().expect("trailing message"); + assert_eq!(trailing.role, "user"); + assert_eq!(trailing.content.len(), 1); + assert!(matches!( + trailing.content.first(), + Some(ContentBlock::ToolResult { .. }) + )); + + // The earlier real user message receives the turn_meta prefix. + let real_user = messages.first().expect("first user message"); + assert_eq!(real_user.role, "user"); + let ContentBlock::Text { text, .. } = real_user + .content + .first() + .expect("user text content") + else { + panic!("expected Text block on real user message"); + }; + assert!(text.starts_with("\n")); + assert!(text.contains("src/lib.rs")); +} + +/// When the turn is mid-execution and the trailing user message is a +/// tool result, no turn_meta is injected at all (rather than landing on +/// some earlier user message and confusing the API's tool-call +/// continuity check). The working_set surfaces again on the next +/// genuine user prompt. 
+#[test] +fn turn_metadata_skips_when_only_tool_results_trail() { + let tmp = tempdir().expect("tempdir"); + fs::create_dir_all(tmp.path().join("src")).expect("mkdir"); + fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write"); + + let config = EngineConfig { + workspace: tmp.path().to_path_buf(), + ..Default::default() + }; + let (mut engine, _handle) = Engine::new(config, &Config::default()); + engine + .session + .working_set + .observe_user_message("inspect src/lib.rs", tmp.path()); + + // Only a tool-result message in history — simulates the corner case + // where the prior real user message has already been compacted away + // but a tool-result is still pending. We must not retroactively + // inject. + engine.session.add_message(Message { + role: "user".to_string(), + content: vec![ContentBlock::ToolResult { + tool_use_id: "call_42".to_string(), + content: "pub fn sample() {}".to_string(), + is_error: None, + content_blocks: None, + }], + }); + + let messages = engine.messages_with_turn_metadata(); + + // Returned unchanged: the single tool-result message, no Text + // prefix, content length == 1. + let only = messages.last().expect("trailing message"); + assert_eq!(only.content.len(), 1); + assert!(matches!( + only.content.first(), + Some(ContentBlock::ToolResult { .. }) + )); +} + #[test] fn refresh_system_prompt_is_noop_when_unchanged() { let tmp = tempdir().expect("tempdir"); diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs index 2220dc9a..18071ef5 100644 --- a/crates/tui/src/core/engine/turn_loop.rs +++ b/crates/tui/src/core/engine/turn_loop.rs @@ -1607,11 +1607,27 @@ impl Engine { }; let mut messages = self.session.messages.clone(); - let Some(last_user) = messages - .iter_mut() - .rev() - .find(|message| message.role == "user") - else { + // v0.8.11 hotfix: tool-result messages are stored as role="user" in + // our internal representation but serialize to role="tool" on the + // wire. 
Prepending a Text block onto a tool-result message breaks + // the assistant→tool_result invariant — the API rejects the request + // with `"insufficient tool messages following tool_calls"`. Inject + // only into actual user-typed messages, recognizable by having at + // least one Text content block (and no ToolResult blocks). + let Some(last_user) = messages.iter_mut().rev().find(|message| { + message.role == "user" + && message + .content + .iter() + .all(|block| !matches!(block, ContentBlock::ToolResult { .. })) + && message + .content + .iter() + .any(|block| matches!(block, ContentBlock::Text { .. })) + }) else { + // No real user message in the trailing slice (e.g. mid-turn + // after a tool call). Skip injection — the working_set will + // surface again on the next genuine user prompt. return messages; }; From 9e67e04e4a80f35feb4133a2c0b8f778c77f9b17 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 22:37:23 -0500 Subject: [PATCH 11/11] fix(install,tests): fmt nit + downloadText flowing-mode bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI on PR #684 caught two real issues that local checks missed: **Lint failure (cargo fmt).** A regression test landed with a multi-line `let ContentBlock::Text { text, .. } = real_user.content...` pattern that local rustfmt accepted but CI's pinned toolchain collapsed onto a single line. Reformatted to match. **npm wrapper smoke failure ("Checksum manifest is missing deepseek-").** Subtle Node.js streams interaction in `install.js` introduced by the network-resilience cluster: * `httpRequest` attaches a `data` event listener on the response to re-arm the stall timer. * Attaching a `data` listener on a `Readable` puts the stream into flowing mode immediately. * `downloadText` then ran `for await (const chunk of response)` to collect the body — the async iterator expects paused-mode and silently misses chunks that flow before / between iteration ticks. 
* For small bodies (the ~100-byte SHA256 manifest), the entire response could flow through the stall listener before the async iterator's `read()` calls landed, leaving the joined body empty. * Result: `parseChecksumManifest("")` returned an empty Map → `verifyChecksum` saw no entries → "manifest is missing X" after the actual binary download succeeded. Binary downloads were unaffected because `download()` uses `response.pipe(sink)` plus a `data` listener for progress — both consume chunks via `data` events, no async iterator involved. Fix: collect the response body in `downloadText` via direct `data`/ `end` event subscription. `data` listeners stack — both the stall re-arm and the body collector fire on every chunk, no flowing-vs- paused conflict. Stall detection still works. Verified locally: `node scripts/release/npm-wrapper-smoke.js` "npm wrapper smoke passed with local assets from ". Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/core/engine/tests.rs | 5 +---- npm/deepseek-tui/scripts/install.js | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index 00a82681..d6177f8b 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -635,10 +635,7 @@ fn turn_metadata_skips_tool_result_messages() { // The earlier real user message receives the turn_meta prefix. let real_user = messages.first().expect("first user message"); assert_eq!(real_user.role, "user"); - let ContentBlock::Text { text, .. } = real_user - .content - .first() - .expect("user text content") + let ContentBlock::Text { text, .. 
} = real_user.content.first().expect("user text content") else { panic!("expected Text block on real user message"); }; diff --git a/npm/deepseek-tui/scripts/install.js b/npm/deepseek-tui/scripts/install.js index 52e0219e..9bfd1795 100644 --- a/npm/deepseek-tui/scripts/install.js +++ b/npm/deepseek-tui/scripts/install.js @@ -778,12 +778,26 @@ async function downloadText(url) { stallMs: downloadStallMs(), }); const response = result.response; - const chunks = []; response.setEncoding("utf8"); - for await (const chunk of response) { - chunks.push(chunk); - } - return chunks.join(""); + // NOTE: do NOT use `for await (const chunk of response)` here. + // `httpRequest` attaches a `data` listener on the response to re-arm + // the stall timer, which puts the stream in flowing mode. The async + // iterator expects paused mode and will silently miss every chunk — + // this manifested as an empty checksum manifest in the npm wrapper + // smoke test ("Checksum manifest is missing "). Subscribing + // to `data` events directly stacks alongside the stall listener and + // both fire per chunk, so we collect the body correctly without + // disturbing the stall detection. + return new Promise((resolve, reject) => { + const chunks = []; + response.on("data", (chunk) => { + chunks.push(chunk); + }); + response.on("end", () => { + resolve(chunks.join("")); + }); + response.on("error", reject); + }); }); }