From a4dee56fcc9908dafc14e28102782f1728e850fe Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:06:07 -0500
Subject: [PATCH 01/11] fix(compaction): 500K hard floor plus V4 default

---
 crates/tui/src/compaction.rs | 125 +++++++++++++++++++++++++++++++++--
 1 file changed, 119 insertions(+), 6 deletions(-)

diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs
index 59024426..0f3acb09 100644
--- a/crates/tui/src/compaction.rs
+++ b/crates/tui/src/compaction.rs
@@ -25,25 +25,58 @@ pub struct CompactionConfig {
     pub message_threshold: usize,
     pub model: String,
     pub cache_summary: bool,
+    /// Hard floor — `should_compact` returns `false` when total session
+    /// tokens fall below this number, regardless of `enabled`,
+    /// `token_threshold`, or `message_threshold`. Defaults to
+    /// [`MINIMUM_AUTO_COMPACTION_TOKENS`] (500K) for v0.8.11+. Tests that
+    /// want to exercise the older threshold/message-count logic at small
+    /// fixture sizes can set this to `0` to disable the floor.
+    pub auto_floor_tokens: usize,
 }
 
 impl Default for CompactionConfig {
     fn default() -> Self {
         Self {
-            // ON BY DEFAULT since v0.8.6 (#402 P0 survivability).
-            // Long-running sessions need automatic compaction to stay
-            // within the model's context budget. Users who prefer the
-            // previous behaviour can opt out via `auto_compact = false`
-            // in settings or `compaction.enabled = false` in config.
+            // ON BY DEFAULT since v0.8.6 (#402 P0 survivability) — but the
+            // engine-level `auto_compact` setting was flipped OFF in v0.8.11
+            // (#665) so this default is mostly a fallback for code paths
+            // that build a `CompactionConfig` without going through
+            // `compaction_threshold_for_model_and_effort`. Real per-model
+            // values are still derived through that helper.
             enabled: true,
-            token_threshold: 50000,
+            // v0.8.11: 50K was a 128K-era leftover that biased every
+            // unconfigured caller toward "compact almost immediately on V4."
+            // Bumped to 800K (80% of V4's 1M window) so the dead-code
+            // default no longer lies. Real call sites override this via
+            // `compaction_threshold_for_model_and_effort`.
+            token_threshold: 800_000,
             message_threshold: 50,
             model: DEFAULT_TEXT_MODEL.to_string(),
             cache_summary: true,
+            auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
         }
     }
 }
 
+/// Hard floor for automatic compaction in v0.8.11+.
+///
+/// Below this token count, `should_compact` returns `false` regardless of
+/// `enabled`, `token_threshold`, or `message_threshold`. The point of the
+/// floor is V4 prefix-cache economics: compaction rewrites the stable
+/// prefix, which destroys the KV cache. At low token counts the prefix
+/// cache is healthy and compaction's cost (full re-prefill at miss prices)
+/// dwarfs its benefit (a tiny budget reclaim). Above the floor compaction
+/// can still be net-positive — cache is already pressured, the prefix has
+/// drifted, and freeing budget matters.
+///
+/// Manual `/compact` slash command and the model-callable `compact_now`
+/// tool both bypass this floor by going through different code paths —
+/// they represent explicit agency rather than implicit policy.
+///
+/// Not a user setting in v0.8.11 (code may override `auto_floor_tokens`). If anyone
+/// needs to dial it (smaller models, opinionated workflows), we can add one later.
+pub const MINIMUM_AUTO_COMPACTION_TOKENS: usize = 500_000;
+
 pub const KEEP_RECENT_MESSAGES: usize = 4;
 const RECENT_WORKING_SET_WINDOW: usize = 12;
 const MAX_WORKING_SET_PATHS: usize = 24;
@@ -585,6 +618,21 @@ pub fn should_compact(
         return false;
     }
 
+    // v0.8.11: hard floor enforcement. Below the floor (default 500K tokens
+    // — see `MINIMUM_AUTO_COMPACTION_TOKENS`), automatic compaction is
+    // refused because rewriting the prefix kills V4's prefix cache for
+    // little budget recovery. Manual `/compact` and the `compact_now` tool
+    // bypass this floor by going through different code paths.
+    if config.auto_floor_tokens > 0 {
+        let total_session_tokens: usize = messages
+            .iter()
+            .map(|m| estimate_tokens_for_message(m, false))
+            .sum();
+        if total_session_tokens < config.auto_floor_tokens {
+            return false;
+        }
+    }
+
     let plan = plan_compaction(
         messages,
         workspace,
@@ -1445,6 +1493,9 @@ mod tests {
             enabled: true,
             token_threshold: 1_000_000, // Very high
             message_threshold: 5,
+            // Disable the v0.8.11 500K floor so this test exercises the
+            // pure message-count threshold logic at small fixture sizes.
+            auto_floor_tokens: 0,
             ..Default::default()
         };
 
@@ -1585,6 +1636,7 @@ mod tests {
             enabled: true,
             token_threshold: 1_000_000,
             message_threshold: 5,
+            auto_floor_tokens: 0,
             ..Default::default()
         };
 
@@ -1603,6 +1655,7 @@ mod tests {
             enabled: true,
             token_threshold: 50,
             message_threshold: 50,
+            auto_floor_tokens: 0,
             ..Default::default()
         };
 
@@ -1874,6 +1927,7 @@ mod tests {
             enabled: true,
             token_threshold: 100,    // Low threshold for testing
             message_threshold: 1000, // High message threshold
+            auto_floor_tokens: 0,
             ..Default::default()
         };
 
@@ -1901,6 +1955,65 @@ mod tests {
         assert!(!should_compact(&messages, &config, None, None, None));
     }
 
+    /// v0.8.11: the 500K hard floor blocks auto-compaction even when the
+    /// token-percentage threshold would otherwise fire. This is the V4
+    /// prefix-cache protection — below 500K total tokens, rewriting the
+    /// prefix loses cache for tiny budget gains.
+    #[test]
+    fn auto_compaction_floor_blocks_below_500k_even_when_threshold_says_yes() {
+        let config = CompactionConfig {
+            enabled: true,
+            token_threshold: 100,    // would normally fire instantly
+            message_threshold: 1000, // not the trigger
+            // Use the production default explicitly so this test pins the
+            // floor's contract rather than relying on `Default`.
+            auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
+            ..Default::default()
+        };
+
+        let messages: Vec<Message> = (0..10).map(|_| msg("user", &"x".repeat(50))).collect();
+        // Total tokens way under 500K, so floor blocks compaction.
+        assert!(!should_compact(&messages, &config, None, None, None));
+    }
+
+    /// v0.8.11: when total tokens cross the 500K floor, the existing
+    /// threshold/message-count logic takes over again.
+    #[test]
+    fn auto_compaction_floor_yields_to_threshold_logic_above_500k() {
+        let config = CompactionConfig {
+            enabled: true,
+            token_threshold: 2_000_000,
+            message_threshold: 2_000,
+            auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
+            ..Default::default()
+        };
+
+        // Each message ~500 tokens; 1100 messages → ~550K total tokens.
+        // That's above the floor (500K) AND below the deliberately high
+        // token_threshold, so auto-compaction stays off — by threshold,
+        // not floor.
+        let messages: Vec<Message> = (0..1100).map(|_| msg("user", &"x".repeat(2000))).collect();
+        assert!(!should_compact(&messages, &config, None, None, None));
+
+        // Crank threshold below total → compaction fires now that we're
+        // past the floor.
+        let config_lower = CompactionConfig {
+            token_threshold: 100_000,
+            ..config
+        };
+        assert!(should_compact(&messages, &config_lower, None, None, None));
+    }
+
+    /// `CompactionConfig::default()` ships with the 500K floor enabled, so
+    /// production callers using `..Default::default()` get the safety
+    /// guarantee automatically.
+    #[test]
+    fn compaction_config_default_carries_500k_floor() {
+        let config = CompactionConfig::default();
+        assert_eq!(config.auto_floor_tokens, MINIMUM_AUTO_COMPACTION_TOKENS);
+        assert_eq!(config.auto_floor_tokens, 500_000);
+    }
+
     #[test]
     fn test_plan_compaction_pins_error_messages() {
         let messages = vec![

From a14227edf8169195501160dba1cdf3c400de9c0b Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:06:16 -0500
Subject: [PATCH 02/11] refactor(models): rename legacy DeepSeek context window

---
 crates/tui/src/config.rs                    |  4 +-
 crates/tui/src/core/engine/capacity_flow.rs |  5 ++-
 crates/tui/src/models.rs                    | 45 +++++++++++----------
 crates/tui/src/tui/context_inspector.rs     | 13 +++---
 4 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs
index 149fb35f..49fd56f3 100644
--- a/crates/tui/src/config.rs
+++ b/crates/tui/src/config.rs
@@ -225,7 +225,7 @@ pub fn provider_capability(provider: ApiProvider, resolved_model: &str) -> Provi
         crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
     } else {
         crate::models::context_window_for_model(resolved_model)
-            .unwrap_or(crate::models::DEFAULT_CONTEXT_WINDOW_TOKENS)
+            .unwrap_or(crate::models::LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS)
     };
 
     // Max output tokens: DeepSeek V4 models allow 262K; others get 4096.
@@ -4070,7 +4070,7 @@ model = "deepseek-v4-pro"
         let cap = provider_capability(ApiProvider::Deepseek, "deepseek-coder");
         assert_eq!(
             cap.context_window,
-            crate::models::DEFAULT_CONTEXT_WINDOW_TOKENS
+            crate::models::LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS
         );
         assert_eq!(cap.max_output, 4096);
         assert!(!cap.thinking_supported);
diff --git a/crates/tui/src/core/engine/capacity_flow.rs b/crates/tui/src/core/engine/capacity_flow.rs
index f280e644..cee5fb76 100644
--- a/crates/tui/src/core/engine/capacity_flow.rs
+++ b/crates/tui/src/core/engine/capacity_flow.rs
@@ -160,9 +160,10 @@ impl Engine {
         let unique_reference_ids_recent_window =
             self.recent_unique_reference_count(message_window, turn);
         let context_window = usize::try_from(
-            context_window_for_model(&self.session.model).unwrap_or(DEFAULT_CONTEXT_WINDOW_TOKENS),
+            context_window_for_model(&self.session.model)
+                .unwrap_or(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS),
         )
-        .unwrap_or(usize::try_from(DEFAULT_CONTEXT_WINDOW_TOKENS).unwrap_or(128_000))
+        .unwrap_or(usize::try_from(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS).unwrap_or(128_000))
         .max(1);
         let context_used_ratio = (self.estimated_input_tokens() as f64) / (context_window as f64);
 
diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs
index 8b1413ee..320d8305 100644
--- a/crates/tui/src/models.rs
+++ b/crates/tui/src/models.rs
@@ -2,15 +2,17 @@
 
 use serde::{Deserialize, Serialize};
 
-pub const DEFAULT_CONTEXT_WINDOW_TOKENS: u32 = 128_000;
+/// Context window for legacy DeepSeek model IDs with no V4 alias or explicit
+/// `*k` suffix; callers also use it as the fallback for unrecognised models.
+pub const LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS: u32 = 128_000;
 pub const DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS: u32 = 1_000_000;
 /// Last-resort compaction trigger when [`context_window_for_model`] returns
 /// `None` (an unrecognised model id). v0.8.11 raised this from `50_000` to
-/// `102_400` (80% of [`DEFAULT_CONTEXT_WINDOW_TOKENS`]) so unknown models
-/// inherit the same late-trigger discipline as V4 instead of paying the
-/// prefix-cache hit at 5% of the V4 window. Known DeepSeek / Claude models
-/// resolve to their own scaled value via [`compaction_threshold_for_model`]
-/// (#664).
+/// `102_400` (80% of [`LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS`]) so unknown
+/// models inherit the same late-trigger discipline as V4 instead of paying
+/// the prefix-cache hit at 5% of the V4 window. Known DeepSeek / Claude
+/// models resolve to their own scaled value via
+/// [`compaction_threshold_for_model`] (#664).
 pub const DEFAULT_COMPACTION_TOKEN_THRESHOLD: usize = 102_400;
 pub const DEFAULT_COMPACTION_MESSAGE_THRESHOLD: usize = 50;
 const COMPACTION_THRESHOLD_PERCENT: u32 = 80;
@@ -212,8 +214,9 @@ pub struct Usage {
 #[must_use]
 pub fn context_window_for_model(model: &str) -> Option<u32> {
     let lower = model.to_lowercase();
-    // Unknown DeepSeek model IDs default to 128k unless an explicit *k suffix is present.
-    // DeepSeek-V4 family and current legacy aliases ship with a 1M context window.
+    // Unknown legacy DeepSeek model IDs default to 128K unless an explicit
+    // *k suffix is present. DeepSeek-V4 family and current compatibility
+    // aliases ship with a 1M context window.
     if lower.contains("deepseek") {
         if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
             return Some(explicit_window);
@@ -221,7 +224,7 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
         if lower.contains("v4") || is_current_deepseek_v4_alias(&lower) {
             return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
         }
-        return Some(DEFAULT_CONTEXT_WINDOW_TOKENS);
+        return Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS);
     }
     if lower.contains("claude") {
         return Some(200_000);
@@ -411,14 +414,14 @@ mod tests {
     }
 
     #[test]
-    fn unknown_deepseek_models_map_to_128k_context_window() {
+    fn unknown_legacy_deepseek_models_map_to_128k_context_window() {
         assert_eq!(
             context_window_for_model("deepseek-coder"),
-            Some(DEFAULT_CONTEXT_WINDOW_TOKENS)
+            Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS)
         );
         assert_eq!(
             context_window_for_model("deepseek-v3.2-0324"),
-            Some(DEFAULT_CONTEXT_WINDOW_TOKENS)
+            Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS)
         );
     }
 
@@ -447,7 +450,7 @@ mod tests {
         );
         assert_eq!(
             context_window_for_model("deepseek-v3.2-2k-preview"),
-            Some(DEFAULT_CONTEXT_WINDOW_TOKENS)
+            Some(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS)
         );
     }
 
@@ -458,11 +461,11 @@ mod tests {
             102_400
         );
         // v0.8.11 (#664): unknown-model fallback also resolves to 80% of
-        // `DEFAULT_CONTEXT_WINDOW_TOKENS` (128k) — same late-trigger
-        // discipline as the V4 path. Was `50_000` pre-v0.8.11; that
-        // hardcoded value compacted at ~5% of a 1M window when the model
-        // detection silently fell through, which is exactly the
-        // prefix-cache-burning behaviour we're getting away from.
+        // `LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS` (128K legacy DeepSeek
+        // fallback) — same late-trigger discipline as the V4 path. Was
+        // `50_000` pre-v0.8.11; that hardcoded value compacted at ~5% of a
+        // 1M window when model detection silently fell through, which is
+        // exactly the prefix-cache-burning behaviour we're getting away from.
         assert_eq!(compaction_threshold_for_model("unknown-model"), 102_400);
     }
 
@@ -509,9 +512,9 @@ mod tests {
             102_400
         );
         // v0.8.11 (#664): unknown-model fallback also lands on the
-        // 80%-of-128K floor instead of the legacy hardcoded 50K, so
-        // model-detection-fall-through doesn't quietly burn V4 prefix
-        // cache at 5%-of-window.
+        // 80%-of-128K legacy DeepSeek fallback instead of the legacy
+        // hardcoded 50K, so model-detection-fall-through doesn't quietly
+        // burn V4 prefix cache at 5%-of-window.
         assert_eq!(
             compaction_threshold_for_model_and_effort("unknown-model", Some("max")),
             102_400
diff --git a/crates/tui/src/tui/context_inspector.rs b/crates/tui/src/tui/context_inspector.rs
index 7f0e139b..12a6cca4 100644
--- a/crates/tui/src/tui/context_inspector.rs
+++ b/crates/tui/src/tui/context_inspector.rs
@@ -4,16 +4,17 @@ use std::collections::HashSet;
 use std::fmt::Write;
 
 use crate::compaction::estimate_input_tokens_conservative;
-use crate::models::{DEFAULT_CONTEXT_WINDOW_TOKENS, SystemPrompt, context_window_for_model};
+use crate::models::{
+    LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS, SystemPrompt, context_window_for_model,
+};
 use crate::session_manager::SessionContextReference;
 use crate::tui::app::{App, ToolDetailRecord};
 use crate::tui::file_mention::ContextReferenceSource;
 use crate::utils::estimate_message_chars;
 
-/// Marker used by the engine's `append_working_set_summary` to tag the
-/// volatile tail block in the system prompt. Replicated here so the
-/// context inspector can distinguish stable prefix blocks from the
-/// ephemeral working-set block without importing engine internals.
+/// Marker used by per-turn working-set metadata. Replicated here so the
+/// context inspector can distinguish stable prompt blocks from volatile
+/// working-set context without importing engine internals.
 const WORKING_SET_MARKER: &str = "## Repo Working Set";
 
 const CONTEXT_WARNING_THRESHOLD_PERCENT: f64 = 85.0;
@@ -68,7 +69,7 @@ pub fn build_context_inspector_text(app: &App) -> String {
 }
 
 fn context_usage(app: &App) -> (usize, u32, f64) {
-    let max = context_window_for_model(&app.model).unwrap_or(DEFAULT_CONTEXT_WINDOW_TOKENS);
+    let max = context_window_for_model(&app.model).unwrap_or(LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS);
     let estimated =
         estimate_input_tokens_conservative(&app.api_messages, app.system_prompt.as_ref());
     let total_chars = estimate_message_chars(&app.api_messages);

From b48b68f078c61cf97d07354d32468c7346f3d950 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:06:55 -0500
Subject: [PATCH 03/11] perf(engine): stabilize system prompt and move working
 set metadata

---
 crates/tui/src/core/engine.rs           |  68 ++++++++++------
 crates/tui/src/core/engine/context.rs   |  52 +-----------
 crates/tui/src/core/engine/tests.rs     | 100 ++++++++++++++++++------
 crates/tui/src/core/engine/turn_loop.rs |  33 +++++++-
 crates/tui/src/core/session.rs          |   4 +
 crates/tui/src/prompts.rs               |  69 +++++++---------
 6 files changed, 185 insertions(+), 141 deletions(-)

diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index ff053451..aeeb6fb4 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -8,6 +8,8 @@
 //! - Tool execution orchestration
 
 use std::collections::HashMap;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex as StdMutex};
 use std::time::{Duration, Instant};
@@ -35,8 +37,8 @@ use crate::mcp::McpPool;
 #[cfg(test)]
 use crate::models::ToolCaller;
 use crate::models::{
-    ContentBlock, ContentBlockStart, DEFAULT_CONTEXT_WINDOW_TOKENS, Delta, Message, MessageRequest,
-    StreamEvent, SystemPrompt, Tool, Usage,
+    ContentBlock, ContentBlockStart, Delta, LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS, Message,
+    MessageRequest, StreamEvent, SystemPrompt, Tool, Usage,
 };
 use crate::prompts;
 use crate::seam_manager::{SeamConfig, SeamManager};
@@ -353,8 +355,9 @@ impl Engine {
             config.mcp_config_path.clone(),
         );
 
-        // Set up system prompt with project context (default to agent mode)
-        let working_set_summary = session.working_set.summary_block(&config.workspace);
+        // Set up stable system prompt with project context (default to agent mode).
+        // Per-turn working-set metadata is injected into the latest user
+        // message at request time so file churn does not rewrite this prefix.
         let user_memory_block =
             crate::memory::compose_block(config.memory_enabled, &config.memory_path);
         let system_prompt = prompts::system_prompt_for_mode_with_context_skills_and_session(
@@ -368,8 +371,9 @@ impl Engine {
                 goal_objective: config.goal_objective.as_deref(),
             },
         );
-        session.system_prompt =
-            append_working_set_summary(Some(system_prompt), working_set_summary.as_deref());
+        let stable_prompt = Some(system_prompt);
+        session.last_system_prompt_hash = Some(system_prompt_hash(stable_prompt.as_ref()));
+        session.system_prompt = stable_prompt;
 
         let subagent_manager =
             new_shared_subagent_manager(config.workspace.clone(), config.max_subagents);
@@ -1645,10 +1649,6 @@ impl Engine {
 
     /// Refresh the system prompt based on current mode and context.
     fn refresh_system_prompt(&mut self, mode: AppMode) {
-        let working_set_summary = self
-            .session
-            .working_set
-            .summary_block(&self.config.workspace);
         let user_memory_block =
             crate::memory::compose_block(self.config.memory_enabled, &self.config.memory_path);
         let base = prompts::system_prompt_for_mode_with_context_skills_and_session(
@@ -1664,8 +1664,11 @@ impl Engine {
         );
         let stable_prompt =
             merge_system_prompts(Some(&base), self.session.compaction_summary_prompt.clone());
-        self.session.system_prompt =
-            append_working_set_summary(stable_prompt, working_set_summary.as_deref());
+        let stable_hash = system_prompt_hash(stable_prompt.as_ref());
+        if self.session.last_system_prompt_hash != Some(stable_hash) {
+            self.session.system_prompt = stable_prompt;
+            self.session.last_system_prompt_hash = Some(stable_hash);
+        }
     }
 
     fn merge_compaction_summary(&mut self, summary_prompt: Option<SystemPrompt>) {
@@ -1676,18 +1679,36 @@ impl Engine {
             self.session.compaction_summary_prompt.as_ref(),
             summary_prompt.clone(),
         );
-        let current_without_working_set =
-            remove_working_set_summary(self.session.system_prompt.as_ref());
-        let merged = merge_system_prompts(current_without_working_set.as_ref(), summary_prompt);
-        let working_set_summary = self
-            .session
-            .working_set
-            .summary_block(&self.config.workspace);
-        self.session.system_prompt =
-            append_working_set_summary(merged, working_set_summary.as_deref());
+        let merged = merge_system_prompts(self.session.system_prompt.as_ref(), summary_prompt);
+        self.session.last_system_prompt_hash = Some(system_prompt_hash(merged.as_ref()));
+        self.session.system_prompt = merged;
     }
 }
 
+fn system_prompt_hash(prompt: Option<&SystemPrompt>) -> u64 {
+    let mut hasher = DefaultHasher::new();
+    match prompt {
+        Some(SystemPrompt::Text(text)) => {
+            0u8.hash(&mut hasher);
+            text.hash(&mut hasher);
+        }
+        Some(SystemPrompt::Blocks(blocks)) => {
+            1u8.hash(&mut hasher);
+            for block in blocks {
+                block.block_type.hash(&mut hasher);
+                block.text.hash(&mut hasher);
+                if let Some(cache_control) = &block.cache_control {
+                    cache_control.cache_type.hash(&mut hasher);
+                }
+            }
+        }
+        None => {
+            2u8.hash(&mut hasher);
+        }
+    }
+    hasher.finish()
+}
+
 /// Spawn the engine in a background task
 pub fn spawn_engine(config: EngineConfig, api_config: &Config) -> EngineHandle {
     let (engine, handle) = Engine::new(config, api_config);
@@ -1775,9 +1796,8 @@ mod context;
 pub(crate) use context::compact_tool_result_for_context;
 use context::{
     COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
-    TURN_MAX_OUTPUT_TOKENS, append_working_set_summary, context_input_budget,
-    estimate_input_tokens_conservative, extract_compaction_summary_prompt,
-    is_context_length_error_message, remove_working_set_summary, summarize_text,
+    TURN_MAX_OUTPUT_TOKENS, context_input_budget, estimate_input_tokens_conservative,
+    extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
     turn_response_headroom_tokens,
 };
 mod dispatch;
diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs
index 7e80008b..81f053e0 100644
--- a/crates/tui/src/core/engine/context.rs
+++ b/crates/tui/src/core/engine/context.rs
@@ -6,7 +6,7 @@
 
 use crate::compaction::estimate_tokens;
 use crate::error_taxonomy::ErrorCategory;
-use crate::models::{Message, SystemBlock, SystemPrompt, context_window_for_model};
+use crate::models::{Message, SystemPrompt, context_window_for_model};
 use crate::tools::spec::ToolResult;
 
 /// Max output tokens requested for normal agent turns. Generous on purpose:
@@ -288,56 +288,6 @@ pub(super) fn extract_compaction_summary_prompt(
     }
 }
 
-pub(super) fn remove_working_set_summary(prompt: Option<&SystemPrompt>) -> Option<SystemPrompt> {
-    match prompt {
-        Some(SystemPrompt::Blocks(blocks)) => {
-            let filtered: Vec<SystemBlock> = blocks
-                .iter()
-                .filter(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER))
-                .cloned()
-                .collect();
-            if filtered.is_empty() {
-                None
-            } else {
-                Some(SystemPrompt::Blocks(filtered))
-            }
-        }
-        Some(SystemPrompt::Text(text)) => Some(SystemPrompt::Text(text.clone())),
-        None => None,
-    }
-}
-
-pub(super) fn append_working_set_summary(
-    prompt: Option<SystemPrompt>,
-    working_set_summary: Option<&str>,
-) -> Option<SystemPrompt> {
-    let Some(summary) = working_set_summary.map(str::trim).filter(|s| !s.is_empty()) else {
-        return prompt;
-    };
-    let working_set_block = SystemBlock {
-        block_type: "text".to_string(),
-        text: summary.to_string(),
-        cache_control: None,
-    };
-
-    match prompt {
-        Some(SystemPrompt::Text(text)) => Some(SystemPrompt::Blocks(vec![
-            SystemBlock {
-                block_type: "text".to_string(),
-                text,
-                cache_control: None,
-            },
-            working_set_block,
-        ])),
-        Some(SystemPrompt::Blocks(mut blocks)) => {
-            blocks.retain(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER));
-            blocks.push(working_set_block);
-            Some(SystemPrompt::Blocks(blocks))
-        }
-        None => Some(SystemPrompt::Blocks(vec![working_set_block])),
-    }
-}
-
 fn estimate_text_tokens_conservative(text: &str) -> usize {
     text.chars().count().div_ceil(3)
 }
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index 1e5e0349..8a9751e7 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -501,7 +501,7 @@ fn subagent_results_are_summarized_before_parent_context_insertion() {
 }
 
 #[test]
-fn refresh_system_prompt_places_working_set_after_stable_prefix() {
+fn refresh_system_prompt_leaves_working_set_out_of_system_prompt() {
     let tmp = tempdir().expect("tempdir");
     fs::create_dir_all(tmp.path().join("src")).expect("mkdir");
     fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write");
@@ -518,20 +518,74 @@ fn refresh_system_prompt_places_working_set_after_stable_prefix() {
 
     engine.refresh_system_prompt(AppMode::Agent);
 
-    let Some(SystemPrompt::Blocks(blocks)) = &engine.session.system_prompt else {
-        panic!("expected structured prompt blocks");
-    };
-    let last = blocks.last().expect("working-set block");
-    assert!(last.text.contains(WORKING_SET_SUMMARY_MARKER));
-    assert!(
-        blocks[..blocks.len() - 1]
+    let prompt = match &engine.session.system_prompt {
+        Some(SystemPrompt::Text(text)) => text.clone(),
+        Some(SystemPrompt::Blocks(blocks)) => blocks
             .iter()
-            .all(|block| !block.text.contains(WORKING_SET_SUMMARY_MARKER))
-    );
+            .map(|block| block.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n"),
+        None => panic!("expected system prompt"),
+    };
+    assert!(!prompt.contains(WORKING_SET_SUMMARY_MARKER));
 }
 
 #[test]
-fn compaction_summary_stays_before_volatile_working_set() {
+fn working_set_reaches_model_as_turn_metadata() {
+    let tmp = tempdir().expect("tempdir");
+    fs::create_dir_all(tmp.path().join("src")).expect("mkdir");
+    fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write");
+
+    let config = EngineConfig {
+        workspace: tmp.path().to_path_buf(),
+        ..Default::default()
+    };
+    let (mut engine, _handle) = Engine::new(config, &Config::default());
+    engine
+        .session
+        .working_set
+        .observe_user_message("please inspect src/lib.rs", tmp.path());
+    engine.session.add_message(Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "please inspect src/lib.rs".to_string(),
+            cache_control: None,
+        }],
+    });
+
+    let messages = engine.messages_with_turn_metadata();
+    let first_block = messages
+        .last()
+        .and_then(|message| message.content.first())
+        .expect("turn metadata block");
+    let ContentBlock::Text { text, .. } = first_block else {
+        panic!("expected text metadata block");
+    };
+    assert!(text.starts_with("<turn_meta>\n"));
+    assert!(text.contains(WORKING_SET_SUMMARY_MARKER));
+    assert!(text.contains("src/lib.rs"));
+}
+
+#[test]
+fn refresh_system_prompt_is_noop_when_unchanged() {
+    let tmp = tempdir().expect("tempdir");
+    let config = EngineConfig {
+        workspace: tmp.path().to_path_buf(),
+        ..Default::default()
+    };
+    let (mut engine, _handle) = Engine::new(config, &Config::default());
+
+    engine.refresh_system_prompt(AppMode::Agent);
+    let first_hash = engine.session.last_system_prompt_hash;
+    let first_prompt = engine.session.system_prompt.clone();
+    engine.refresh_system_prompt(AppMode::Agent);
+
+    assert_eq!(engine.session.last_system_prompt_hash, first_hash);
+    assert_eq!(engine.session.system_prompt, first_prompt);
+}
+
+#[test]
+fn compaction_summary_stays_in_stable_system_prompt() {
     let tmp = tempdir().expect("tempdir");
     fs::create_dir_all(tmp.path().join("src")).expect("mkdir");
     fs::write(tmp.path().join("src/main.rs"), "fn main() {}").expect("write");
@@ -552,20 +606,18 @@ fn compaction_summary_stays_before_volatile_working_set() {
         cache_control: None,
     }])));
 
-    let Some(SystemPrompt::Blocks(blocks)) = &engine.session.system_prompt else {
-        panic!("expected structured prompt blocks");
+    let prompt = match &engine.session.system_prompt {
+        Some(SystemPrompt::Text(text)) => text.clone(),
+        Some(SystemPrompt::Blocks(blocks)) => blocks
+            .iter()
+            .map(|block| block.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n"),
+        None => panic!("expected system prompt"),
     };
-    let summary_index = blocks
-        .iter()
-        .position(|block| block.text.contains(COMPACTION_SUMMARY_MARKER))
-        .expect("summary block");
-    let working_set_index = blocks
-        .iter()
-        .position(|block| block.text.contains(WORKING_SET_SUMMARY_MARKER))
-        .expect("working-set block");
 
-    assert!(summary_index < working_set_index);
-    assert_eq!(working_set_index, blocks.len() - 1);
+    assert!(prompt.contains(COMPACTION_SUMMARY_MARKER));
+    assert!(!prompt.contains(WORKING_SET_SUMMARY_MARKER));
 }
 
 #[tokio::test]
@@ -635,7 +687,7 @@ async fn pre_request_refresh_invoked_when_medium_risk() {
     engine.config.model = "deepseek-v3.2-128k".to_string();
 
     let long = "x".repeat(5_000);
-    for _ in 0..200 {
+    for _ in 0..900 {
         engine.session.messages.push(Message {
             role: "user".to_string(),
             content: vec![ContentBlock::Text {
diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs
index 603309d0..2220dc9a 100644
--- a/crates/tui/src/core/engine/turn_loop.rs
+++ b/crates/tui/src/core/engine/turn_loop.rs
@@ -230,7 +230,7 @@ impl Engine {
             };
             let request = MessageRequest {
                 model: self.session.model.clone(),
-                messages: self.session.messages.clone(),
+                messages: self.messages_with_turn_metadata(),
                 max_tokens: TURN_MAX_OUTPUT_TOKENS,
                 system: self.session.system_prompt.clone(),
                 tools: active_tools.clone(),
@@ -1594,4 +1594,35 @@ impl Engine {
         }
         (TurnOutcomeStatus::Completed, None)
     }
+
+    pub(super) fn messages_with_turn_metadata(&self) -> Vec<Message> {
+        let Some(summary) = self
+            .session
+            .working_set
+            .summary_block(&self.config.workspace)
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())
+        else {
+            return self.session.messages.clone();
+        };
+
+        let mut messages = self.session.messages.clone();
+        let Some(last_user) = messages
+            .iter_mut()
+            .rev()
+            .find(|message| message.role == "user")
+        else {
+            return messages;
+        };
+
+        let turn_meta = format!("<turn_meta>\n{summary}\n</turn_meta>");
+        last_user.content.insert(
+            0,
+            ContentBlock::Text {
+                text: turn_meta,
+                cache_control: None,
+            },
+        );
+        messages
+    }
 }
diff --git a/crates/tui/src/core/session.rs b/crates/tui/src/core/session.rs
index 94791c40..347b1fa5 100644
--- a/crates/tui/src/core/session.rs
+++ b/crates/tui/src/core/session.rs
@@ -25,6 +25,9 @@ pub struct Session {
 
     /// System prompt (optional)
     pub system_prompt: Option<SystemPrompt>,
+    /// Hash of the last assembled stable system prompt. Used to avoid
+    /// replacing `system_prompt` when unchanged.
+    pub last_system_prompt_hash: Option<u64>,
     /// Persisted summary blocks generated by context compaction.
     pub compaction_summary_prompt: Option<SystemPrompt>,
 
@@ -131,6 +134,7 @@ impl Session {
             } else {
                 None
             },
+            last_system_prompt_hash: None,
             working_set: WorkingSet::default(),
             cycle_count: 0,
             current_cycle_started: Utc::now(),
diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs
index 9b2b2ecc..eec2646f 100644
--- a/crates/tui/src/prompts.rs
+++ b/crates/tui/src/prompts.rs
@@ -254,11 +254,11 @@ pub fn system_prompt_for_mode_with_context(
 ///   4. `## Context Management` (compile-time constant, Agent/Yolo only)
 ///   5. compaction handoff template (compile-time constant)
 ///   6. handoff block — file-backed; rewritten by `/compact` and on exit
-///   7. working-set summary — drifts when a new path is observed
 ///
 /// Anything appended after a volatile block forfeits the cache for the rest
-/// of the request. New blocks belong above the handoff/working-set boundary
-/// unless they themselves are turn-volatile.
+/// of the request. New blocks belong above the handoff boundary unless they
+/// themselves are turn-volatile. Working-set metadata is now injected into the
+/// latest user message as per-turn metadata instead of this system prompt.
 pub fn system_prompt_for_mode_with_context_and_skills(
     mode: AppMode,
     workspace: &Path,
@@ -283,7 +283,7 @@ pub fn system_prompt_for_mode_with_context_and_skills(
 pub fn system_prompt_for_mode_with_context_skills_and_session(
     mode: AppMode,
     workspace: &Path,
-    working_set_summary: Option<&str>,
+    _working_set_summary: Option<&str>,
     skills_dir: Option<&Path>,
     instructions: Option<&[PathBuf]>,
     session_context: PromptSessionContext<'_>,
@@ -360,6 +360,7 @@ pub fn system_prompt_for_mode_with_context_skills_and_session(
              If you notice context is getting long (>80%), proactively suggest using `/compact` to the user.\n\n\
              ### Prompt-cache awareness\n\n\
              DeepSeek caches the longest *byte-stable prefix* of every request and charges roughly 100× less for cache-hit tokens than miss tokens. The system prompt above is layered most-static-first specifically so the prefix stays stable turn-over-turn. To keep cache hits high:\n\
+             - **Working set location:** the current repo working set is injected into the latest user message inside a `<turn_meta>` block. Treat it as high-priority turn metadata, not as a stable system-prompt section.\n\
              - **Append, don't reorder.** New context goes at the end (latest user / tool messages). Reshuffling earlier messages or rewriting their content invalidates the cache for everything after the change.\n\
              - **Don't paraphrase quoted content.** If you've already read a file, refer to it by path or line range instead of re-quoting it with different formatting.\n\
              - **Use `/compact` as a hard reset, not a tweak.** Compaction is meant for when the cache is already losing — it intentionally rewrites the prefix to a shorter summary. Don't trigger it for small wins.\n\
@@ -382,13 +383,6 @@ pub fn system_prompt_for_mode_with_context_skills_and_session(
         full_prompt = format!("{full_prompt}\n\n{handoff_block}");
     }
 
-    // 7. Working-set summary (drifts when a new path is observed).
-    if let Some(summary) = working_set_summary
-        && !summary.trim().is_empty()
-    {
-        full_prompt = format!("{full_prompt}\n\n{summary}");
-    }
-
     SystemPrompt::Text(full_prompt)
 }
 
@@ -547,7 +541,7 @@ mod tests {
     }
 
     #[test]
-    fn session_goal_is_injected_above_volatile_prompt_tail() {
+    fn session_goal_is_injected_above_handoff_tail() {
         let tmp = tempdir().expect("tempdir");
         let prompt = match system_prompt_for_mode_with_context_skills_and_session(
             AppMode::Agent,
@@ -566,11 +560,10 @@ mod tests {
 
         let goal_pos = prompt.find("<session_goal>").expect("goal block");
         let compact_pos = prompt.find("## Compaction Handoff").expect("compact block");
-        let working_set_pos = prompt.find("## Repo Working Set").expect("working set");
 
         assert!(prompt.contains("Fix transcript corruption"));
         assert!(goal_pos < compact_pos);
-        assert!(goal_pos < working_set_pos);
+        assert!(!prompt.contains("src/lib.rs"));
     }
 
     #[test]
@@ -729,12 +722,10 @@ mod tests {
     }
 
     #[test]
-    fn system_prompt_with_working_set_summary_is_byte_stable_for_constant_summary() {
-        // The `working_set_summary` argument is the volatile surface (suspect
-        // #1 in #263). Independently verifying THIS surface needs a separate
-        // test in working_set.rs; here we just pin that the surrounding
-        // prompt construction faithfully embeds whatever summary it's given
-        // without injecting any non-determinism on its own.
+    fn system_prompt_ignores_working_set_summary_argument() {
+        // Working-set metadata is now injected into the latest user message
+        // per turn. The legacy argument remains for call-site compatibility
+        // but must not reintroduce volatile bytes into the system prompt.
         let tmp = tempdir().expect("tempdir");
         let workspace = tmp.path();
         let summary = "## Repo Working Set\nWorkspace: /tmp/x\n";
@@ -754,16 +745,18 @@ mod tests {
             &a,
             &b,
         );
-        assert!(a.contains(summary), "summary must be embedded as-is");
+        assert!(
+            !a.contains(summary),
+            "summary must not be embedded in system prompt"
+        );
     }
 
     #[test]
     fn system_prompt_with_handoff_file_is_byte_stable_when_file_is_unchanged() {
-        // Companion to the working-set stability test: if `.deepseek/handoff.md`
-        // hasn't moved between two builds, the rendered prompt must produce
-        // identical bytes. The handoff block is the second volatile surface
-        // (the first is the working-set summary) — both land below the static
-        // boundary in `system_prompt_for_mode_with_context_and_skills`.
+        // If `.deepseek/handoff.md` hasn't moved between two builds, the
+        // rendered prompt must produce identical bytes. The handoff block
+        // lands below the static boundary in
+        // `system_prompt_for_mode_with_context_and_skills`.
         let tmp = tempdir().expect("tempdir");
         let workspace = tmp.path();
         let handoff_dir = workspace.join(".deepseek");
@@ -792,14 +785,11 @@ mod tests {
     }
 
     #[test]
-    fn handoff_and_working_set_appear_after_static_blocks() {
-        // Cache-prefix invariant: the volatile blocks (handoff, working_set)
-        // must come *after* the static `## Context Management` and the
-        // compaction handoff template (`## Compaction Handoff`) so a churn
-        // in either volatile section doesn't drag the static blocks out of
-        // the cached prefix. Pre-fix ordering placed handoff between the
-        // skills block and `## Context Management`, which busted the cache
-        // every time `/compact` rewrote the file.
+    fn handoff_appears_after_static_blocks_without_working_set() {
+        // Cache-prefix invariant: the handoff block must come after static
+        // `## Context Management` and the compaction handoff template
+        // (`## Compaction Handoff`). Working-set metadata is per-turn user
+        // metadata now, not a system-prompt tail block.
         let tmp = tempdir().expect("tempdir");
         let workspace = tmp.path();
         let handoff_dir = workspace.join(".deepseek");
@@ -822,9 +812,10 @@ mod tests {
         let handoff_pos = prompt
             .find(HANDOFF_BLOCK_MARKER)
             .expect("handoff block present when fixture file exists");
-        let working_set_pos = prompt
-            .find("## Repo Working Set")
-            .expect("working-set summary present when supplied");
+        assert!(
+            !prompt.contains("## Repo Working Set"),
+            "working-set summary must stay out of the system prompt"
+        );
 
         assert!(
             context_pos < handoff_pos,
@@ -834,10 +825,6 @@ mod tests {
             compact_pos < handoff_pos,
             "## Compaction Handoff must precede the handoff block"
         );
-        assert!(
-            handoff_pos < working_set_pos,
-            "handoff block must precede the working-set summary (most-volatile last)"
-        );
     }
 
     #[test]

From 1a6589c55aff7e015318e673541108a07274ec44 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:06:58 -0500
Subject: [PATCH 04/11] perf(tools): anchor tool array with cache control

---
 crates/tui/src/core/engine.rs    |  6 +++---
 crates/tui/src/tools/registry.rs | 22 +++++++++++++++++++++-
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index aeeb6fb4..e5ff3f01 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -887,9 +887,9 @@ impl Engine {
         } else {
             Vec::new()
         };
-        let tools = tool_registry
-            .as_ref()
-            .map(|registry| build_model_tool_catalog(registry.to_api_tools(), mcp_tools, mode));
+        let tools = tool_registry.as_ref().map(|registry| {
+            build_model_tool_catalog(registry.to_api_tools_with_cache(true), mcp_tools, mode)
+        });
 
         // Main turn loop
         let (status, error) = self
diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs
index d85031bd..5bbca679 100644
--- a/crates/tui/src/tools/registry.rs
+++ b/crates/tui/src/tools/registry.rs
@@ -186,7 +186,6 @@ impl ToolRegistry {
 
     /// Convert tools to API Tool format with optional cache control on the last tool.
     #[must_use]
-    #[allow(dead_code)]
     pub fn to_api_tools_with_cache(&self, enable_cache: bool) -> Vec<Tool> {
         let mut tools = self.to_api_tools();
         if enable_cache && let Some(last) = tools.last_mut() {
@@ -871,6 +870,27 @@ mod tests {
         assert_eq!(api_tools[0].description, "A test tool");
     }
 
+    #[test]
+    fn api_tools_with_cache_marks_last_tool_ephemeral() {
+        let tmp = tempdir().expect("tempdir");
+        let ctx = ToolContext::new(tmp.path().to_path_buf());
+        let mut registry = ToolRegistry::new(ctx);
+
+        registry.register(make_test_tool("tool_a"));
+        registry.register(make_test_tool("tool_b"));
+
+        let api_tools = registry.to_api_tools_with_cache(true);
+        assert_eq!(api_tools.len(), 2);
+        assert!(api_tools[0].cache_control.is_none());
+        assert_eq!(
+            api_tools[1]
+                .cache_control
+                .as_ref()
+                .map(|c| c.cache_type.as_str()),
+            Some("ephemeral")
+        );
+    }
+
     /// Tool whose `description()` advances through a script of pre-built
     /// strings, one per call. Used to demonstrate that the api-tools cache
     /// pins the description bytes on first read instead of re-sampling them

From 991ae41571784db492cb22bc8ae279faea1ffca2 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:07:04 -0500
Subject: [PATCH 05/11] docs(changelog): describe v0.8.11 cache overhaul

---
 CHANGELOG.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7ab259a..65a92712 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.11] - Unreleased
+
+### Changed
+- **Cache-maxing prompt path for DeepSeek V4** — the engine now skips
+  system-prompt reassignment when the assembled stable prompt is unchanged,
+  keeps the volatile repo working-set summary out of the system prompt, and
+  injects it as per-turn metadata on the latest user message instead.
+- **Tool catalog cache anchor** — the model-visible tool array now marks
+  the final native tool with `cache_control: ephemeral` so DeepSeek can
+  anchor the stable tool prefix explicitly.
+- **V4-scale automatic compaction defaults** — automatic compaction now
+  enforces a 500K-token hard floor, and the fallback compaction threshold
+  reflects the V4-scale late-trigger policy instead of the old 50K-era default.
+
+### Fixed
+- **Legacy 128K context naming** — the 128K fallback is now named and
+  documented as legacy DeepSeek-only behavior, reducing ambiguity with the
+  1M-token DeepSeek V4 defaults.
+
 ## [0.8.10] - 2026-05-04
 
 A patch release: hotfixes, small UX polish, and four whalescale-unblocking

From e98efcf31dddde7bb0ffe7432e01de08ee00d74a Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:08:07 -0500
Subject: [PATCH 06/11] fix(engine): drop dead working set prompt marker

---
 crates/tui/src/core/engine/context.rs | 1 -
 crates/tui/src/core/engine/tests.rs   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs
index 81f053e0..0a38a063 100644
--- a/crates/tui/src/core/engine/context.rs
+++ b/crates/tui/src/core/engine/context.rs
@@ -40,7 +40,6 @@ const LARGE_CONTEXT_WINDOW_TOKENS: u32 = 500_000;
 const TOOL_RESULT_METADATA_SUMMARY_CHARS: usize = 320;
 
 pub(super) const COMPACTION_SUMMARY_MARKER: &str = "Conversation Summary (Auto-Generated)";
-pub(super) const WORKING_SET_SUMMARY_MARKER: &str = "## Repo Working Set";
 
 #[derive(Debug, Clone, Copy)]
 struct ToolResultContextLimits {
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index 8a9751e7..9c28826d 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -1,6 +1,5 @@
 use super::*;
 
-use super::context::WORKING_SET_SUMMARY_MARKER;
 use crate::models::SystemBlock;
 use serde_json::json;
 use std::collections::HashSet;
@@ -9,6 +8,8 @@ use std::path::PathBuf;
 use std::time::Instant;
 use tempfile::tempdir;
 
+const WORKING_SET_SUMMARY_MARKER: &str = "## Repo Working Set";
+
 fn build_engine_with_capacity(capacity: CapacityControllerConfig) -> Engine {
     let engine_config = EngineConfig {
         capacity,

From d0e95f23b0902d49d96b56e7a02f6f6d4ed70a01 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:19:09 -0500
Subject: [PATCH 07/11] refactor(compaction): drop message_threshold,
 token-only triggering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `message_threshold` field on `CompactionConfig` was a 128K-era
heuristic that fired compaction on long sessions of small messages
— exactly the case where rewriting V4's prefix cache is most wasteful.
Token budget is the only signal that maps to actual model context
pressure; counting messages adds nothing.

Changes:

* Remove `CompactionConfig::message_threshold` field.
* Remove the message-count branch in `should_compact` — token
  threshold + 500K floor is now the sole compaction trigger.
* Remove `compaction_message_threshold_for_model`,
  `DEFAULT_COMPACTION_MESSAGE_THRESHOLD`,
  `COMPACTION_MESSAGE_DIVISOR`, `MAX_COMPACTION_MESSAGE_THRESHOLD`
  from `models.rs`.
* Drop the `forced_config.message_threshold` tweak in the engine's
  capacity-guardrail forced-compaction path; that path now also
  bypasses the floor (`auto_floor_tokens = 0`) because we're at a
  hard ceiling and have to free budget regardless of cache cost.
* Update production constructors (`main.rs`, `runtime_threads.rs`,
  `app.rs::compaction_config`) to drop the field.
* Update tests: keep the floor + token-threshold assertions, delete
  the two tests that specifically validated message-count
  triggering, replace `should_compact_respects_message_threshold`
  with `message_count_no_longer_triggers_compaction` pinning the
  new contract.

Verified locally:

* `cargo fmt --all -- --check` clean.
* `cargo clippy --workspace --all-targets --all-features --locked
  -- -D warnings` clean.
* `cargo test --workspace --all-features --locked` — 2036 passed
  in TUI bin (2 ignored), all other crates green.
* parity gates: snapshot, parity_protocol, parity_state — all pass.
* `git diff --exit-code -- Cargo.lock` — clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/compaction.rs      | 150 ++++++++++--------------------
 crates/tui/src/core/engine.rs     |   5 +-
 crates/tui/src/main.rs            |   3 +-
 crates/tui/src/models.rs          |  33 -------
 crates/tui/src/runtime_threads.rs |   6 +-
 crates/tui/src/tui/app.rs         |   6 +-
 6 files changed, 57 insertions(+), 146 deletions(-)

diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs
index 0f3acb09..59ecc3c4 100644
--- a/crates/tui/src/compaction.rs
+++ b/crates/tui/src/compaction.rs
@@ -18,19 +18,25 @@ use crate::models::{
 };
 
 /// Configuration for conversation compaction behavior.
+///
+/// v0.8.11 simplified this from the prior token-OR-message-count trigger
+/// to a token-only trigger gated by an absolute floor. The
+/// `message_threshold` field was removed: its only purpose was to fire
+/// compaction on long sessions of small messages, which is exactly the
+/// case where rewriting the V4 prefix cache is least valuable. Token
+/// budget is the right signal; message count was a 128K-era heuristic.
 #[derive(Debug, Clone, PartialEq)]
 pub struct CompactionConfig {
     pub enabled: bool,
     pub token_threshold: usize,
-    pub message_threshold: usize,
     pub model: String,
     pub cache_summary: bool,
     /// Hard floor — `should_compact` returns `false` when total session
-    /// tokens fall below this number, regardless of `enabled`,
-    /// `token_threshold`, or `message_threshold`. Defaults to
-    /// [`MINIMUM_AUTO_COMPACTION_TOKENS`] (500K) for v0.8.11+. Tests that
-    /// want to exercise the older threshold/message-count logic at small
-    /// fixture sizes can set this to `0` to disable the floor.
+    /// tokens fall below this number, regardless of `enabled` or
+    /// `token_threshold`. Defaults to [`MINIMUM_AUTO_COMPACTION_TOKENS`]
+    /// (500K) for v0.8.11+. Tests that want to exercise the threshold
+    /// logic at small fixture sizes can set this to `0` to disable the
+    /// floor.
     pub auto_floor_tokens: usize,
 }
 
@@ -50,7 +56,6 @@ impl Default for CompactionConfig {
             // default no longer lies. Real call sites override this via
             // `compaction_threshold_for_model_and_effort`.
             token_threshold: 800_000,
-            message_threshold: 50,
             model: DEFAULT_TEXT_MODEL.to_string(),
             cache_summary: true,
             auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
@@ -61,17 +66,15 @@ impl Default for CompactionConfig {
 /// Hard floor for automatic compaction in v0.8.11+.
 ///
 /// Below this token count, `should_compact` returns `false` regardless of
-/// `enabled`, `token_threshold`, or `message_threshold`. The point of the
-/// floor is V4 prefix-cache economics: compaction rewrites the stable
-/// prefix, which destroys the KV cache. At low token counts the prefix
-/// cache is healthy and compaction's cost (full re-prefill at miss prices)
-/// dwarfs its benefit (a tiny budget reclaim). Above the floor compaction
-/// can still be net-positive — cache is already pressured, the prefix has
-/// drifted, and freeing budget matters.
+/// `enabled` or `token_threshold`. The point of the floor is V4 prefix-cache
+/// economics: compaction rewrites the stable prefix, which destroys the KV
+/// cache. At low token counts the prefix cache is healthy and compaction's
+/// cost (full re-prefill at miss prices) dwarfs its benefit (a tiny budget
+/// reclaim). Above the floor compaction can still be net-positive — cache
+/// is already pressured, the prefix has drifted, and freeing budget matters.
 ///
-/// Manual `/compact` slash command and the model-callable `compact_now`
-/// tool both bypass this floor with a deliberate refusal message — they
-/// represent explicit agency rather than implicit policy.
+/// The manual `/compact` slash command bypasses this floor because it is an
+/// explicit user action rather than implicit policy.
 ///
 /// Constant rather than configurable for v0.8.11. If anyone needs to dial
 /// it (smaller models, opinionated workflows), we can add a setting later.
@@ -645,7 +648,6 @@ pub fn should_compact(
         .iter()
         .map(|&idx| estimate_tokens_for_message(&messages[idx], false))
         .sum();
-    let pinned_count = plan.pinned_indices.len();
 
     let token_estimate: usize = plan
         .summarize_indices
@@ -656,21 +658,19 @@ pub fn should_compact(
 
     // Pinned messages consume part of the budget, so compact earlier when needed.
     let effective_token_threshold = config.token_threshold.saturating_sub(pinned_tokens);
-    let effective_message_threshold = config.message_threshold.saturating_sub(pinned_count);
 
-    // Always compact if we exceed the token threshold, even with few unpinned messages.
-    if token_estimate > effective_token_threshold && effective_token_threshold > 0 {
-        return true;
+    // Token-only trigger (v0.8.11): the prior message-count branch was a
+    // 128K-era heuristic that fired compaction on long chats of small
+    // messages — exactly the case where rewriting the V4 prefix cache is
+    // most wasteful. Token budget is the only signal that maps to actual
+    // model context pressure.
+    if effective_token_threshold == 0 {
+        return message_count >= MIN_SUMMARIZE_MESSAGES;
     }
-
-    let enough_unpinned = message_count >= MIN_SUMMARIZE_MESSAGES
-        || effective_token_threshold == 0
-        || effective_message_threshold == 0;
-    if !enough_unpinned {
+    if message_count < MIN_SUMMARIZE_MESSAGES {
         return false;
     }
-
-    token_estimate > effective_token_threshold || message_count > effective_message_threshold
+    token_estimate > effective_token_threshold
 }
 
 fn truncate_chars(text: &str, max_chars: usize) -> &str {
@@ -1487,20 +1487,22 @@ mod tests {
         assert!(!should_compact(&messages, &config, None, None, None));
     }
 
+    /// v0.8.11: message-count is no longer a compaction trigger. Long
+    /// chats of small messages stay uncompacted because rewriting the V4
+    /// prefix cache for a tiny budget reclaim is net-negative. Only token
+    /// pressure (and the explicit `/compact` slash command) trigger
+    /// compaction.
     #[test]
-    fn should_compact_respects_message_threshold() {
+    fn message_count_no_longer_triggers_compaction() {
         let config = CompactionConfig {
             enabled: true,
-            token_threshold: 1_000_000, // Very high
-            message_threshold: 5,
-            // Disable the v0.8.11 500K floor so this test exercises the
-            // pure message-count threshold logic at small fixture sizes.
+            token_threshold: 1_000_000,
             auto_floor_tokens: 0,
             ..Default::default()
         };
 
-        // Under threshold
-        let few_messages: Vec<Message> = (0..4)
+        // 200 tiny messages, well above the prior message threshold.
+        let many_messages: Vec<Message> = (0..200)
             .map(|_| Message {
                 role: "user".to_string(),
                 content: vec![ContentBlock::Text {
@@ -1509,19 +1511,9 @@ mod tests {
                 }],
             })
             .collect();
-        assert!(!should_compact(&few_messages, &config, None, None, None));
-
-        // Over threshold
-        let many_messages: Vec<Message> = (0..10)
-            .map(|_| Message {
-                role: "user".to_string(),
-                content: vec![ContentBlock::Text {
-                    text: "x".to_string(),
-                    cache_control: None,
-                }],
-            })
-            .collect();
-        assert!(should_compact(&many_messages, &config, None, None, None));
+        // Token total stays minuscule so the token threshold is not hit;
+        // without the prior message-count trigger, no compaction.
+        assert!(!should_compact(&many_messages, &config, None, None, None));
     }
 
     #[test]
@@ -1619,7 +1611,6 @@ mod tests {
         let config = CompactionConfig {
             enabled: true,
             token_threshold: 10,
-            message_threshold: 2,
             ..Default::default()
         };
 
@@ -1630,46 +1621,12 @@ mod tests {
         assert!(!should_compact(&messages, &config, None, None, None));
     }
 
-    #[test]
-    fn should_compact_counts_only_unpinned_messages() {
-        let config = CompactionConfig {
-            enabled: true,
-            token_threshold: 1_000_000,
-            message_threshold: 5,
-            auto_floor_tokens: 0,
-            ..Default::default()
-        };
-
-        let mut messages: Vec<Message> = (0..7)
-            .map(|i| msg("user", &format!("noise message {i}")))
-            .collect();
-        messages.push(msg("user", "Focus on src/core/engine.rs"));
-        messages.extend((0..4).map(|i| msg("assistant", &format!("recent {i}"))));
-
-        assert!(should_compact(&messages, &config, None, None, None));
-    }
-
-    #[test]
-    fn should_compact_when_pins_consume_budget() {
-        let config = CompactionConfig {
-            enabled: true,
-            token_threshold: 50,
-            message_threshold: 50,
-            auto_floor_tokens: 0,
-            ..Default::default()
-        };
-
-        let mut messages = vec![msg("user", "noise 0"), msg("assistant", "noise 1")];
-        messages.extend((0..4).map(|_| {
-            msg(
-                "assistant",
-                &format!("{} src/core/engine.rs", "x".repeat(400)),
-            )
-        }));
-
-        // Pinned recent messages exceed the token budget, so unpinned noise should trigger compaction.
-        assert!(should_compact(&messages, &config, None, None, None));
-    }
+    // v0.8.11: removed `should_compact_counts_only_unpinned_messages`
+    // (tested the deleted message-count trigger) and
+    // `should_compact_when_pins_consume_budget` (asserted compaction once
+    // pinned recent messages exceeded the token budget). Pinned-token
+    // accounting is still tested by
+    // `should_compact_ignores_fully_pinned_context` below.
 
     #[test]
     fn enforce_tool_call_pairs_removes_orphaned_tool_call() {
@@ -1925,8 +1882,7 @@ mod tests {
     fn test_should_compact_token_threshold_triggers() {
         let config = CompactionConfig {
             enabled: true,
-            token_threshold: 100,    // Low threshold for testing
-            message_threshold: 1000, // High message threshold
+            token_threshold: 100, // Low threshold for testing
             auto_floor_tokens: 0,
             ..Default::default()
         };
@@ -1945,7 +1901,6 @@ mod tests {
         let config = CompactionConfig {
             enabled: true,
             token_threshold: 1000,
-            message_threshold: 1000,
             ..Default::default()
         };
 
@@ -1963,8 +1918,7 @@ mod tests {
     fn auto_compaction_floor_blocks_below_500k_even_when_threshold_says_yes() {
         let config = CompactionConfig {
             enabled: true,
-            token_threshold: 100,    // would normally fire instantly
-            message_threshold: 1000, // not the trigger
+            token_threshold: 100, // would normally fire instantly
             // Use the production default explicitly so this test pins the
             // floor's contract rather than relying on `Default`.
             auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
@@ -1983,7 +1937,6 @@ mod tests {
         let config = CompactionConfig {
             enabled: true,
             token_threshold: 2_000_000,
-            message_threshold: 2_000,
             auto_floor_tokens: MINIMUM_AUTO_COMPACTION_TOKENS,
             ..Default::default()
         };
@@ -2307,7 +2260,6 @@ mod tests {
         let _config = CompactionConfig {
             enabled: true,
             token_threshold: 1000,
-            message_threshold: 5,
             ..Default::default()
         };
 
@@ -2323,9 +2275,7 @@ mod tests {
             msg("assistant", "recent 2"),
         ];
 
-        // Should compact because:
-        // - More than message_threshold (5) unpinned messages
-        // - src/main.rs mention pins message 0
+        // src/main.rs mention should pin message 0 in the plan.
         let plan = plan_compaction(
             &messages,
             Some(&workspace),
diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index e5ff3f01..2f7ef663 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -1183,7 +1183,10 @@ impl Engine {
             .token_threshold
             .min(target_budget.saturating_sub(1))
             .max(1);
-        forced_config.message_threshold = forced_config.message_threshold.max(1);
+        // v0.8.11: forced compaction (capacity guardrail) bypasses the floor
+        // because we're at a hard ceiling and have to free budget regardless
+        // of cache cost.
+        forced_config.auto_floor_tokens = 0;
 
         match compact_messages_safe(
             client,
diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs
index a13a8392..445d8110 100644
--- a/crates/tui/src/main.rs
+++ b/crates/tui/src/main.rs
@@ -3711,7 +3711,7 @@ async fn run_exec_agent(
     use crate::core::engine::{EngineConfig, spawn_engine};
     use crate::core::events::Event;
     use crate::core::ops::Op;
-    use crate::models::{compaction_message_threshold_for_model, compaction_threshold_for_model};
+    use crate::models::compaction_threshold_for_model;
     use crate::tools::plan::new_shared_plan_state;
     use crate::tools::todo::new_shared_todo_list;
     use crate::tui::app::AppMode;
@@ -3725,7 +3725,6 @@ async fn run_exec_agent(
         enabled: false,
         model: model.to_string(),
         token_threshold: compaction_threshold_for_model(model),
-        message_threshold: compaction_message_threshold_for_model(model),
         ..Default::default()
     };
 
diff --git a/crates/tui/src/models.rs b/crates/tui/src/models.rs
index 320d8305..8576cf8a 100644
--- a/crates/tui/src/models.rs
+++ b/crates/tui/src/models.rs
@@ -14,10 +14,7 @@ pub const DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS: u32 = 1_000_000;
 /// models resolve to their own scaled value via
 /// [`compaction_threshold_for_model`] (#664).
 pub const DEFAULT_COMPACTION_TOKEN_THRESHOLD: usize = 102_400;
-pub const DEFAULT_COMPACTION_MESSAGE_THRESHOLD: usize = 50;
 const COMPACTION_THRESHOLD_PERCENT: u32 = 80;
-const COMPACTION_MESSAGE_DIVISOR: u32 = 500;
-const MAX_COMPACTION_MESSAGE_THRESHOLD: usize = 2_000;
 
 // === Core Message Types ===
 
@@ -298,21 +295,6 @@ pub fn compaction_threshold_for_model_and_effort(
     compaction_threshold_for_model(model)
 }
 
-/// Derive a compaction message-count threshold from model context window.
-#[must_use]
-pub fn compaction_message_threshold_for_model(model: &str) -> usize {
-    let Some(window) = context_window_for_model(model) else {
-        return DEFAULT_COMPACTION_MESSAGE_THRESHOLD;
-    };
-
-    let scaled = usize::try_from(window / COMPACTION_MESSAGE_DIVISOR)
-        .unwrap_or(DEFAULT_COMPACTION_MESSAGE_THRESHOLD);
-    scaled.clamp(
-        DEFAULT_COMPACTION_MESSAGE_THRESHOLD,
-        MAX_COMPACTION_MESSAGE_THRESHOLD,
-    )
-}
-
 // === Streaming Structures ===
 
 #[allow(dead_code)]
@@ -469,24 +451,9 @@ mod tests {
         assert_eq!(compaction_threshold_for_model("unknown-model"), 102_400);
     }
 
-    #[test]
-    fn compaction_message_threshold_scales_with_context_window() {
-        assert_eq!(
-            compaction_message_threshold_for_model("deepseek-v3.2-128k"),
-            256
-        );
-        assert_eq!(compaction_message_threshold_for_model("unknown-model"), 50);
-        // 200k / 500 = 400, within the 2k cap.
-        assert_eq!(compaction_message_threshold_for_model("claude-3"), 400);
-    }
-
     #[test]
     fn compaction_scales_for_deepseek_v4_1m_context() {
         assert_eq!(compaction_threshold_for_model("deepseek-v4-pro"), 800_000);
-        assert_eq!(
-            compaction_message_threshold_for_model("deepseek-v4-pro"),
-            2_000
-        );
     }
 
     #[test]
diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs
index 33772552..1fb28cd2 100644
--- a/crates/tui/src/runtime_threads.rs
+++ b/crates/tui/src/runtime_threads.rs
@@ -23,10 +23,7 @@ use crate::core::coherence::CoherenceState;
 use crate::core::engine::{EngineConfig, EngineHandle, spawn_engine};
 use crate::core::events::{Event as EngineEvent, TurnOutcomeStatus};
 use crate::core::ops::Op;
-use crate::models::{
-    ContentBlock, Message, SystemPrompt, Usage, compaction_message_threshold_for_model,
-    compaction_threshold_for_model,
-};
+use crate::models::{ContentBlock, Message, SystemPrompt, Usage, compaction_threshold_for_model};
 use crate::tools::plan::new_shared_plan_state;
 use crate::tools::subagent::SubAgentStatus;
 use crate::tools::todo::new_shared_todo_list;
@@ -1765,7 +1762,6 @@ impl RuntimeThreadManager {
             enabled: false,
             model: thread.model.clone(),
             token_threshold: compaction_threshold_for_model(&thread.model),
-            message_threshold: compaction_message_threshold_for_model(&thread.model),
             ..Default::default()
         };
         let network_policy = self.config.network.clone().map(|toml_cfg| {
diff --git a/crates/tui/src/tui/app.rs b/crates/tui/src/tui/app.rs
index f1d274b8..29f52073 100644
--- a/crates/tui/src/tui/app.rs
+++ b/crates/tui/src/tui/app.rs
@@ -15,10 +15,7 @@ use crate::core::coherence::CoherenceState;
 use crate::cycle_manager::{CycleBriefing, CycleConfig};
 use crate::hooks::{HookContext, HookEvent, HookExecutor, HookResult};
 use crate::localization::{Locale, MessageId, resolve_locale, tr};
-use crate::models::{
-    Message, SystemPrompt, compaction_message_threshold_for_model,
-    compaction_threshold_for_model_and_effort,
-};
+use crate::models::{Message, SystemPrompt, compaction_threshold_for_model_and_effort};
 use crate::palette::{self, UiTheme};
 use crate::session_manager::SessionContextReference;
 use crate::settings::Settings;
@@ -3169,7 +3166,6 @@ impl App {
         CompactionConfig {
             enabled: self.auto_compact,
             token_threshold: self.compact_threshold,
-            message_threshold: compaction_message_threshold_for_model(&self.model),
             model: self.model.clone(),
             ..Default::default()
         }

From 229f02ea2caca67db8fa0bcd694839fed7afbe95 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:19:34 -0500
Subject: [PATCH 08/11] feat(npm): install.js network resilience for slow /
 firewalled networks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A community user from China reported `npm install deepseek-tui`
took 18 minutes through a CN npm mirror. The bottleneck is the
GitHub Releases binary fetch (~46 MB across two binaries), not the
npm tarball (which is 6.9 kB). The CN mirror does NOT proxy GitHub
release downloads, so any user behind a slow or lossy connection
is hitting the GitHub fetch directly with no resilience.

Four behaviors added to `npm/deepseek-tui/scripts/install.js`:

1. **Retry with exponential backoff.** Up to 5 attempts on network
   errors (ECONNRESET, ECONNREFUSED, ETIMEDOUT, EAI_AGAIN,
   network/host unreachable, EPIPE, ECONNABORTED) and 5xx upstream
   responses. Backoff `1s, 2s, 4s, 8s` between attempts, ±20% jitter. 4xx
   and checksum-mismatch are flagged non-retryable so we don't
   thrash on permanent failures. Final error includes the underlying
   message and the attempt count.

2. **Per-attempt total timeout + stall detector.** Total timeout
   defaults to 5 minutes per attempt (`DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS`,
   alias `DEEPSEEK_DOWNLOAD_TIMEOUT_MS`). A stall detector aborts
   the request when no bytes arrive for 30 s
   (`DEEPSEEK_TUI_DOWNLOAD_STALL_MS`, alias
   `DEEPSEEK_DOWNLOAD_STALL_MS`) so a hung connection doesn't waste
   the whole timeout. Both budgets are surfaced in the error so the
   user can dial them up if they're on a slow pipe.

3. **HTTPS_PROXY / HTTP_PROXY support — pure Node, no new
   dependencies.** Detects `HTTPS_PROXY` / `HTTP_PROXY` (and the
   lowercase variants) and routes through the proxy via CONNECT
   tunneling. `NO_PROXY` exclusion list honored, with `*` and dotted-
   suffix matching. Proxy auth via standard `user:pass@` URL form is
   passed through as `Proxy-Authorization: Basic ...`. Pure-Node
   implementation using `net` + `tls` + `http` + `https` builtins —
   no `https-proxy-agent` dependency added.

4. **Download progress indicator.** Writes to stderr every ~1 MB
   or every 2 s in TTY mode using `\r` to overwrite a single line.
   Non-TTY mode (CI, piped) emits one line per 5 MB so logs stay
   reasonable. Suppressed when `DEEPSEEK_TUI_QUIET_INSTALL=1` or
   when `npm_config_loglevel` is `silent` or `error`. Falls back to
   `N MB downloaded` when the response has no `Content-Length`.

Public API unchanged: existing callers of `getBinaryPath` and `run`
keep working identically when no new env vars are set. The escape
hatch `DEEPSEEK_TUI_DISABLE_INSTALL=1` still exits cleanly.

Verified locally:

* `node -c install.js` and module-load syntax checks.
* `DEEPSEEK_TUI_FORCE_DOWNLOAD=1 DEEPSEEK_TUI_VERSION=0.8.10 node
  install.js` — real GitHub Releases download succeeded with
  visible progress, both binaries landed.
* `HTTPS_PROXY=http://invalid.proxy.local:9999 ... node install.js`
  — proxy path exercised, fails cleanly with the bad host named
  in the error message after retries exhausted.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 npm/deepseek-tui/scripts/install.js | 776 ++++++++++++++++++++++++++--
 1 file changed, 740 insertions(+), 36 deletions(-)

diff --git a/npm/deepseek-tui/scripts/install.js b/npm/deepseek-tui/scripts/install.js
index 3f23703b..52e0219e 100644
--- a/npm/deepseek-tui/scripts/install.js
+++ b/npm/deepseek-tui/scripts/install.js
@@ -1,10 +1,12 @@
 const fs = require("fs");
 const https = require("https");
 const http = require("http");
+const net = require("net");
+const tls = require("tls");
 const crypto = require("crypto");
+const { URL } = require("url");
 const { mkdir, chmod, stat, rename, readFile, unlink, writeFile } = fs.promises;
 const { createWriteStream } = fs;
-const { pipeline } = require("stream/promises");
 const path = require("path");
 
 const {
@@ -16,6 +18,46 @@ const {
 const { preflightGlibc } = require("./preflight-glibc");
 const pkg = require("../package.json");
 
+const DEFAULT_TIMEOUT_MS = 300_000; // 5 minutes per attempt
+const DEFAULT_STALL_MS = 30_000; // abort if no bytes for 30s
+const MAX_ATTEMPTS = 5;
+const BASE_BACKOFF_MS = 1_000;
+
+const RETRYABLE_NET_CODES = new Set([
+  "ECONNRESET",
+  "ECONNREFUSED",
+  "ETIMEDOUT",
+  "EAI_AGAIN",
+  "ENETUNREACH",
+  "EHOSTUNREACH",
+  "EPIPE",
+  "ECONNABORTED",
+]);
+
+class NonRetryableError extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "NonRetryableError";
+    this.nonRetryable = true;
+  }
+}
+
+class HttpStatusError extends Error {
+  constructor(status, url) {
+    super(`Request failed with status ${status}: ${url}`);
+    this.name = "HttpStatusError";
+    this.status = status;
+  }
+}
+
+class DownloadTimeoutError extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "DownloadTimeoutError";
+    this.code = "EDOWNLOADTIMEOUT";
+  }
+}
+
 function resolvePackageVersion() {
   const configuredVersion =
     process.env.DEEPSEEK_TUI_VERSION ||
@@ -44,45 +86,705 @@ function binaryPaths() {
   };
 }
 
-async function httpGet(url) {
-  const client = url.startsWith("https:") ? https : http;
-  const response = await new Promise((resolve, reject) => {
-    client.get(url, (res) => {
-      const status = res.statusCode || 0;
-      if (status >= 300 && status < 400 && res.headers.location) {
-        resolve({ redirect: res.headers.location, response: null });
-        return;
-      }
-      if (status !== 200) {
-        reject(new Error(`Request failed with status ${status}: ${url}`));
-        return;
-      }
-      resolve({ redirect: null, response: res });
-    }).on("error", reject);
-  });
-  return response;
+// ────────────────────────────────────────────────────────────────────────────
+// Logging / progress
+// ────────────────────────────────────────────────────────────────────────────
+
+function isQuietInstall() {
+  if (process.env.DEEPSEEK_TUI_QUIET_INSTALL === "1") {
+    return true;
+  }
+  const level = (process.env.npm_config_loglevel || "").toLowerCase();
+  return level === "silent" || level === "error";
 }
 
-async function download(url, destination) {
-  const resolved = await httpGet(url);
-  if (resolved.redirect) {
-    return download(resolved.redirect, destination);
+function logInfo(message) {
+  if (isQuietInstall()) {
+    return;
   }
+  process.stderr.write(`deepseek-tui: ${message}\n`);
+}
+
+function envInt(name, fallback) {
+  const raw = process.env[name];
+  if (!raw) {
+    return fallback;
+  }
+  const parsed = Number.parseInt(String(raw).trim(), 10);
+  if (!Number.isFinite(parsed) || parsed <= 0) {
+    return fallback;
+  }
+  return parsed;
+}
+
+function downloadTimeoutMs() {
+  return envInt(
+    "DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS",
+    envInt("DEEPSEEK_DOWNLOAD_TIMEOUT_MS", DEFAULT_TIMEOUT_MS),
+  );
+}
+
+function downloadStallMs() {
+  return envInt(
+    "DEEPSEEK_TUI_DOWNLOAD_STALL_MS",
+    envInt("DEEPSEEK_DOWNLOAD_STALL_MS", DEFAULT_STALL_MS),
+  );
+}
+
+function formatMb(bytes) {
+  return (bytes / (1024 * 1024)).toFixed(0);
+}
+
+function createProgressReporter(assetName, totalBytes) {
+  if (isQuietInstall()) {
+    return { onChunk: () => {}, finish: () => {} };
+  }
+  const isTty = !!process.stderr.isTTY;
+  const interactive = isTty;
+  const tickBytes = interactive ? 1 * 1024 * 1024 : 5 * 1024 * 1024;
+  const tickMs = 2_000;
+
+  let received = 0;
+  let lastBytesPrinted = 0;
+  let lastTimePrinted = 0;
+  let everPrinted = false;
+
+  const render = (final) => {
+    if (totalBytes && totalBytes > 0) {
+      const pct = Math.min(100, Math.round((received / totalBytes) * 100));
+      const line = `deepseek-tui: downloading ${assetName}: ${formatMb(received)} / ${formatMb(totalBytes)} MB (${pct}%)`;
+      if (interactive) {
+        process.stderr.write(`${line}\r`);
+      } else {
+        process.stderr.write(`${line}\n`);
+      }
+    } else {
+      const line = `deepseek-tui: downloading ${assetName}: ${formatMb(received)} MB downloaded`;
+      if (interactive) {
+        process.stderr.write(`${line}\r`);
+      } else {
+        process.stderr.write(`${line}\n`);
+      }
+    }
+    everPrinted = true;
+    lastBytesPrinted = received;
+    lastTimePrinted = Date.now();
+  };
+
+  return {
+    onChunk(chunkLen) {
+      received += chunkLen;
+      const now = Date.now();
+      if (
+        received - lastBytesPrinted >= tickBytes ||
+        (interactive && now - lastTimePrinted >= tickMs)
+      ) {
+        render(false);
+      }
+    },
+    finish() {
+      // Final line — always render once.
+      render(true);
+      if (interactive && everPrinted) {
+        // Move past the carriage-return line and emit a "done" footer.
+        process.stderr.write("\n");
+      }
+      process.stderr.write(`deepseek-tui: ${assetName} ... done.\n`);
+    },
+  };
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Proxy support (HTTPS_PROXY / HTTP_PROXY / NO_PROXY) — pure Node, CONNECT
+// tunnel + TLS upgrade for HTTPS targets.
+// ────────────────────────────────────────────────────────────────────────────
+
+function getProxyUrl(targetUrl) {
+  const isHttps = targetUrl.protocol === "https:";
+  const candidates = isHttps
+    ? ["HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"]
+    : ["HTTP_PROXY", "http_proxy"];
+  for (const name of candidates) {
+    const raw = process.env[name];
+    if (raw && String(raw).trim() !== "") {
+      return String(raw).trim();
+    }
+  }
+  return null;
+}
+
+function shouldBypassProxy(host) {
+  const raw = process.env.NO_PROXY || process.env.no_proxy;
+  if (!raw) {
+    return false;
+  }
+  const lower = String(host).toLowerCase();
+  for (const part of String(raw).split(",")) {
+    const entry = part.trim().toLowerCase();
+    if (!entry) {
+      continue;
+    }
+    if (entry === "*") {
+      return true;
+    }
+    // Strip leading dot and any explicit port.
+    const stripped = entry.replace(/^\./, "").replace(/:.*$/, "");
+    if (!stripped) {
+      continue;
+    }
+    if (lower === stripped || lower.endsWith(`.${stripped}`)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+function parseProxy(proxyStr) {
+  // Accept "http://user:pass@host:port" and bare "host:port".
+  const normalized = /^[a-z][a-z0-9+\-.]*:\/\//i.test(proxyStr)
+    ? proxyStr
+    : `http://${proxyStr}`;
+  const u = new URL(normalized);
+  const port = u.port
+    ? Number.parseInt(u.port, 10)
+    : u.protocol === "https:"
+      ? 443
+      : 80;
+  let auth = null;
+  if (u.username) {
+    const user = decodeURIComponent(u.username);
+    const pass = u.password ? decodeURIComponent(u.password) : "";
+    auth = Buffer.from(`${user}:${pass}`).toString("base64");
+  }
+  return {
+    protocol: u.protocol,
+    host: u.hostname,
+    port,
+    auth,
+    raw: proxyStr,
+  };
+}
+
+// Opens a TCP connection to `proxy` and sends an HTTP CONNECT for
+// `targetHost:targetPort`. Resolves with the raw tunneled socket on a 200 reply;
+// rejects on socket error, bad reply, or timeout. Every exit clears the timer.
+function connectThroughProxy(proxy, targetHost, targetPort, timeoutMs) {
+  return new Promise((resolve, reject) => {
+    const socket = net.connect({ host: proxy.host, port: proxy.port });
+    let settled = false;
+    let timer = null;
+    const fail = (err) => {
+      if (settled) return;
+      settled = true;
+      if (timer) clearTimeout(timer);
+      try { socket.destroy(); } catch { /* ignore */ }
+      reject(err);
+    };
+
+    if (timeoutMs > 0) {
+      timer = setTimeout(() => fail(new DownloadTimeoutError(
+          `proxy CONNECT to ${proxy.host}:${proxy.port} timed out after ${timeoutMs} ms`,
+        )), timeoutMs);
+    }
+
+    socket.once("error", (err) => {
+      // Surface proxy host so the user can fix it.
+      const wrapped = new Error(
+        `proxy connection failed (${proxy.host}:${proxy.port}): ${err.message}`,
+      );
+      wrapped.code = err.code;
+      fail(wrapped);
+    });
+
+    socket.once("connect", () => {
+      const lines = [
+        `CONNECT ${targetHost}:${targetPort} HTTP/1.1`,
+        `Host: ${targetHost}:${targetPort}`,
+        "User-Agent: deepseek-tui-installer",
+        "Proxy-Connection: keep-alive",
+      ];
+      if (proxy.auth) {
+        lines.push(`Proxy-Authorization: Basic ${proxy.auth}`);
+      }
+      const req = `${lines.join("\r\n")}\r\n\r\n`;
+
+      let buf = Buffer.alloc(0);
+      const onData = (chunk) => {
+        buf = Buffer.concat([buf, chunk]);
+        const idx = buf.indexOf("\r\n\r\n");
+        if (idx === -1) {
+          if (buf.length > 16 * 1024) {
+            socket.removeListener("data", onData);
+            fail(new Error(
+              `proxy ${proxy.host}:${proxy.port} returned an oversized response header`,
+            ));
+          }
+          return;
+        }
+        socket.removeListener("data", onData);
+        const head = buf.slice(0, idx).toString("utf8");
+        const firstLine = head.split(/\r?\n/, 1)[0] || "";
+        const m = firstLine.match(/^HTTP\/\d\.\d\s+(\d{3})/);
+        if (!m) {
+          fail(new Error(`proxy ${proxy.host}:${proxy.port} returned invalid CONNECT reply: ${firstLine}`));
+          return;
+        }
+        const code = Number.parseInt(m[1], 10);
+        if (code !== 200) {
+          fail(new Error(
+            `proxy ${proxy.host}:${proxy.port} refused CONNECT to ${targetHost}:${targetPort}: HTTP ${code}`,
+          ));
+          return;
+        }
+        if (settled) return;
+        settled = true;
+        if (timer) clearTimeout(timer);
+        // Any bytes past the header belong to the tunneled stream — but in
+        // practice CONNECT 200 has no body; if it did, we'd lose those bytes
+        // here. Keep it simple: trust well-behaved proxies.
+        resolve(socket);
+      };
+      socket.on("data", onData);
+      socket.write(req, "utf8");
+    });
+  });
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// HTTP request with timeout, stall detection, and proxy support.
+// ────────────────────────────────────────────────────────────────────────────
+
+function httpRequest(rawUrl, opts = {}) {
+  const totalTimeoutMs = opts.totalTimeoutMs ?? downloadTimeoutMs();
+  const stallMs = opts.stallMs ?? downloadStallMs();
+
+  return new Promise((resolve, reject) => {
+    let url;
+    try {
+      url = new URL(rawUrl);
+    } catch (err) {
+      reject(new NonRetryableError(`Invalid URL: ${rawUrl} (${err.message})`));
+      return;
+    }
+    if (url.protocol !== "https:" && url.protocol !== "http:") {
+      reject(new NonRetryableError(`Unsupported protocol: ${url.protocol}`));
+      return;
+    }
+
+    const proxyStr = !shouldBypassProxy(url.hostname) ? getProxyUrl(url) : null;
+    const isHttps = url.protocol === "https:";
+    const port = url.port
+      ? Number.parseInt(url.port, 10)
+      : isHttps
+        ? 443
+        : 80;
+
+    let totalTimer = null;
+    let stallTimer = null;
+    let settled = false;
+    let req = null;
+    let res = null;
+
+    const cleanup = () => {
+      if (totalTimer) {
+        clearTimeout(totalTimer);
+        totalTimer = null;
+      }
+      if (stallTimer) {
+        clearTimeout(stallTimer);
+        stallTimer = null;
+      }
+    };
+
+    const fail = (err) => {
+      // Once the response has been handed to the caller `settled` is already
+      // true, yet the stall/total timers must still abort an unfinished body:
+      // destroy the response with `err` so the consumer's "error" path rejects.
+      const streaming = settled && res && !res.destroyed && !res.complete;
+      if (settled && !streaming) return;
+      settled = true;
+      cleanup();
+      try {
+        if (res && !res.destroyed) res.destroy(streaming ? err : undefined);
+        if (req && !req.destroyed) req.destroy();
+      } catch {
+        // ignore
+      }
+      if (!streaming) reject(err);
+    };
+
+    if (totalTimeoutMs > 0) {
+      totalTimer = setTimeout(() => {
+        fail(new DownloadTimeoutError(
+          `download exceeded total timeout of ${totalTimeoutMs} ms ` +
+          `(set DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS to raise it; current stall budget is ${stallMs} ms)`,
+        ));
+      }, totalTimeoutMs);
+    }
+
+    const armStallTimer = () => {
+      if (stallMs <= 0) return;
+      if (stallTimer) clearTimeout(stallTimer);
+      stallTimer = setTimeout(() => {
+        fail(new DownloadTimeoutError(
+          `download stalled — no bytes received for ${stallMs} ms ` +
+          `(set DEEPSEEK_TUI_DOWNLOAD_STALL_MS to raise it; total budget is ${totalTimeoutMs} ms)`,
+        ));
+      }, stallMs);
+    };
+
+    const launch = (socket) => {
+      const reqOptions = {
+        method: "GET",
+        host: url.hostname,
+        port,
+        path: `${url.pathname}${url.search || ""}`,
+        headers: {
+          Host: url.host,
+          "User-Agent": "deepseek-tui-installer",
+          Accept: "*/*",
+          Connection: "close",
+        },
+      };
+      if (socket) {
+        reqOptions.createConnection = () => socket;
+        if (isHttps) {
+          // Wrap raw TCP socket from CONNECT in TLS.
+          const tlsSocket = tls.connect({
+            socket,
+            servername: url.hostname,
+            ALPNProtocols: ["http/1.1"],
+          });
+          tlsSocket.once("error", (err) => fail(err));
+          reqOptions.createConnection = () => tlsSocket;
+        }
+      }
+      const client = isHttps ? https : http;
+      try {
+        req = client.request(reqOptions, (response) => {
+          res = response;
+          armStallTimer();
+          response.on("data", () => {
+            armStallTimer();
+          });
+          response.on("end", () => {
+            cleanup();
+          });
+          response.on("error", (err) => fail(err));
+
+          const status = response.statusCode || 0;
+          if (status >= 300 && status < 400 && response.headers.location) {
+            cleanup();
+            settled = true;
+            response.resume();
+            resolve({ redirect: response.headers.location, response: null });
+            return;
+          }
+          if (status < 200 || status >= 300) {
+            const err = new HttpStatusError(status, rawUrl);
+            // 4xx: non-retryable; 5xx: retryable.
+            if (status >= 400 && status < 500) {
+              err.nonRetryable = true;
+            }
+            fail(err);
+            return;
+          }
+          if (settled) return;
+          settled = true;
+          // Hand the live response stream to the caller.
+          resolve({ redirect: null, response });
+        });
+        req.once("error", (err) => fail(err));
+        req.once("socket", (s) => {
+          // Belt-and-suspenders: surface socket-level errors quickly.
+          s.once("error", (err) => fail(err));
+        });
+        req.end();
+      } catch (err) {
+        fail(err);
+      }
+    };
+
+    if (proxyStr) {
+      let proxy;
+      try {
+        proxy = parseProxy(proxyStr);
+      } catch (err) {
+        fail(new NonRetryableError(
+          `Invalid proxy URL "${proxyStr}": ${err.message}`,
+        ));
+        return;
+      }
+      if (!isHttps) {
+        // Plain HTTP through proxy — send absolute URI, no CONNECT.
+        const client = http;
+        try {
+          req = client.request(
+            {
+              host: proxy.host,
+              port: proxy.port,
+              method: "GET",
+              path: rawUrl,
+              headers: {
+                Host: url.host,
+                "User-Agent": "deepseek-tui-installer",
+                Accept: "*/*",
+                Connection: "close",
+                ...(proxy.auth ? { "Proxy-Authorization": `Basic ${proxy.auth}` } : {}),
+              },
+            },
+            (response) => {
+              res = response;
+              armStallTimer();
+              response.on("data", () => armStallTimer());
+              response.on("end", () => cleanup());
+              response.on("error", (err) => fail(err));
+              const status = response.statusCode || 0;
+              if (status >= 300 && status < 400 && response.headers.location) {
+                cleanup();
+                settled = true;
+                response.resume();
+                resolve({ redirect: response.headers.location, response: null });
+                return;
+              }
+              if (status < 200 || status >= 300) {
+                const err = new HttpStatusError(status, rawUrl);
+                if (status >= 400 && status < 500) err.nonRetryable = true;
+                fail(err);
+                return;
+              }
+              if (settled) return;
+              settled = true;
+              resolve({ redirect: null, response });
+            },
+          );
+          req.once("error", (err) => fail(err));
+          req.end();
+        } catch (err) {
+          fail(err);
+        }
+        return;
+      }
+
+      // HTTPS through proxy: CONNECT tunnel + TLS upgrade.
+      connectThroughProxy(proxy, url.hostname, port, Math.max(stallMs, 5_000))
+        .then((tcpSocket) => {
+          if (settled) {
+            try { tcpSocket.destroy(); } catch { /* ignore */ }
+            return;
+          }
+          const tlsSocket = tls.connect({
+            socket: tcpSocket,
+            servername: url.hostname,
+            ALPNProtocols: ["http/1.1"],
+          });
+          tlsSocket.once("error", (err) => fail(err));
+          tlsSocket.once("secureConnect", () => {
+            if (settled) {
+              try { tlsSocket.destroy(); } catch { /* ignore */ }
+              return;
+            }
+            const reqOptions = {
+              method: "GET",
+              createConnection: () => tlsSocket,
+              path: `${url.pathname}${url.search || ""}`,
+              headers: {
+                Host: url.host,
+                "User-Agent": "deepseek-tui-installer",
+                Accept: "*/*",
+                Connection: "close",
+              },
+            };
+            try {
+              req = https.request(reqOptions, (response) => {
+                res = response;
+                armStallTimer();
+                response.on("data", () => armStallTimer());
+                response.on("end", () => cleanup());
+                response.on("error", (err) => fail(err));
+                const status = response.statusCode || 0;
+                if (status >= 300 && status < 400 && response.headers.location) {
+                  cleanup();
+                  settled = true;
+                  response.resume();
+                  resolve({ redirect: response.headers.location, response: null });
+                  return;
+                }
+                if (status < 200 || status >= 300) {
+                  const err = new HttpStatusError(status, rawUrl);
+                  if (status >= 400 && status < 500) err.nonRetryable = true;
+                  fail(err);
+                  return;
+                }
+                if (settled) return;
+                settled = true;
+                resolve({ redirect: null, response });
+              });
+              req.once("error", (err) => fail(err));
+              req.end();
+            } catch (err) {
+              fail(err);
+            }
+          });
+        })
+        .catch((err) => fail(err));
+      return;
+    }
+
+    // No proxy — direct connection.
+    launch(null);
+  });
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Retry wrapper
+// ────────────────────────────────────────────────────────────────────────────
+
+function isRetryable(err) {
+  if (!err) return false;
+  if (err.nonRetryable) return false;
+  if (err instanceof NonRetryableError) return false;
+  if (err instanceof DownloadTimeoutError) return true;
+  if (err instanceof HttpStatusError) {
+    return err.status >= 500;
+  }
+  if (err.code && RETRYABLE_NET_CODES.has(err.code)) return true;
+  // Network-flavored messages we may see without a code.
+  const msg = String(err.message || "").toLowerCase();
+  if (msg.includes("network") && msg.includes("unreachable")) return true;
+  if (msg.includes("socket hang up")) return true;
+  if (msg.includes("aborted")) return true;
+  return false;
+}
+
+function backoffDelay(attempt) {
+  // attempt is 1-indexed; first retry waits ~1s.
+  const base = BASE_BACKOFF_MS * 2 ** (attempt - 1);
+  const jitter = (Math.random() * 0.4 - 0.2) * base; // ±20%
+  return Math.max(0, Math.round(base + jitter));
+}
+
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+// Runs `fn(attempt)` up to MAX_ATTEMPTS times with backoff between retryable
+// failures; the final error reports the number of attempts actually made.
+async function withRetry(label, fn) {
+  let lastErr;
+  let attempts = 0;
+  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+    attempts = attempt;
+    try {
+      return await fn(attempt);
+    } catch (err) {
+      lastErr = err;
+      if (!isRetryable(err) || attempt === MAX_ATTEMPTS) break;
+      const wait = backoffDelay(attempt);
+      logInfo(
+        `${label} failed (attempt ${attempt}/${MAX_ATTEMPTS}): ${err.message}; retrying in ${wait} ms`,
+      );
+      await sleep(wait);
+    }
+  }
+  const msg = lastErr && lastErr.message ? lastErr.message : String(lastErr);
+  const wrapped = new Error(`${label} failed after ${attempts} attempt(s): ${msg}`);
+  if (lastErr && lastErr.stack) {
+    wrapped.cause = lastErr;
+  }
+  throw wrapped;
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Public download primitives (now retry + progress aware)
+// ────────────────────────────────────────────────────────────────────────────
+
+async function followRedirects(url, opts) {
+  const maxRedirects = 10;
+  let current = url;
+  for (let hop = 0; hop < maxRedirects; hop++) {
+    const result = await httpRequest(current, opts);
+    if (result.redirect) {
+      try {
+        current = new URL(result.redirect, current).toString();
+      } catch {
+        current = result.redirect;
+      }
+      continue;
+    }
+    return result;
+  }
+  throw new NonRetryableError(`too many redirects starting at ${url}`);
+}
+
+// Streams `response` into `destination`, reporting each chunk's size to
+// `progress` (when given). Uses `stream.pipeline` so that a failure on
+// either side destroys both streams, and a response that closes early is
+// reported as an error instead of leaving the promise pending.
+function streamToFile(response, destination, progress) {
+  // Local require: the top-level `stream/promises` import was removed.
+  const { pipeline } = require("stream");
+  return new Promise((resolve, reject) => {
+    const sink = createWriteStream(destination);
+    if (progress) {
+      response.on("data", (chunk) => {
+        progress.onChunk(chunk.length);
+      });
+    }
+    pipeline(response, sink, (err) => {
+      if (err) {
+        reject(err);
+      } else {
+        resolve();
+      }
+    });
+  });
+}
+
+async function download(url, destination, options = {}) {
   await mkdir(path.dirname(destination), { recursive: true });
-  await pipeline(resolved.response, createWriteStream(destination));
+  const assetName = options.assetName || path.basename(destination);
+  await withRetry(`download ${assetName}`, async (attempt) => {
+    const result = await followRedirects(url, {
+      totalTimeoutMs: downloadTimeoutMs(),
+      stallMs: downloadStallMs(),
+    });
+    const response = result.response;
+    const lenHeader = response.headers["content-length"];
+    const total = lenHeader ? Number.parseInt(lenHeader, 10) : 0;
+    const progress = createProgressReporter(assetName, Number.isFinite(total) ? total : 0);
+    if (attempt > 1) {
+      logInfo(`retry attempt ${attempt}/${MAX_ATTEMPTS} for ${assetName}`);
+    }
+    try {
+      await streamToFile(response, destination, progress);
+    } catch (err) {
+      // Ensure we don't leave a partial file confusing future attempts.
+      try {
+        await unlink(destination);
+      } catch {
+        // ignore
+      }
+      throw err;
+    }
+    progress.finish();
+  });
 }
 
 async function downloadText(url) {
-  const resolved = await httpGet(url);
-  if (resolved.redirect) {
-    return downloadText(resolved.redirect);
-  }
-  const chunks = [];
-  resolved.response.setEncoding("utf8");
-  for await (const chunk of resolved.response) {
-    chunks.push(chunk);
-  }
-  return chunks.join("");
+  return withRetry(`fetch ${url}`, async () => {
+    const result = await followRedirects(url, {
+      totalTimeoutMs: downloadTimeoutMs(),
+      stallMs: downloadStallMs(),
+    });
+    const response = result.response;
+    const chunks = [];
+    response.setEncoding("utf8");
+    for await (const chunk of response) {
+      chunks.push(chunk);
+    }
+    return chunks.join("");
+  });
 }
 
 async function readLocalVersion(file) {
@@ -122,11 +824,13 @@ async function sha256File(filePath) {
 async function verifyChecksum(filePath, assetName, checksums) {
   const expected = checksums.get(assetName);
   if (!expected) {
-    throw new Error(`Checksum manifest is missing ${assetName}`);
+    throw new NonRetryableError(`Checksum manifest is missing ${assetName}`);
   }
   const actual = await sha256File(filePath);
   if (actual !== expected) {
-    throw new Error(
+    // Bytes are corrupted; another fetch is unlikely to help without a fix
+    // upstream. Mark non-retryable.
+    throw new NonRetryableError(
       `Checksum mismatch for ${assetName}: expected ${expected}, got ${actual}`,
     );
   }
@@ -152,7 +856,7 @@ async function ensureBinary(targetPath, assetName, version, repo, getChecksums)
   const checksums = await getChecksums();
   const url = releaseAssetUrl(assetName, version, repo);
   const destination = `${targetPath}.${process.pid}.${Date.now()}.download`;
-  await download(url, destination);
+  await download(url, destination, { assetName });
   try {
     await verifyChecksum(destination, assetName, checksums);
     preflightGlibc(destination);

From a08c8915429a09728f145230900721c41f36be8b Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:20:07 -0500
Subject: [PATCH 09/11] docs(changelog): cover message_threshold removal +
 install.js resilience

Folds two follow-on changes into the 0.8.11 entry:

* The token-only compaction trigger (drops the 128K-era message-count
  heuristic).
* The npm `install.js` network-resilience cluster (retry + timeout +
  proxy + progress).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65a92712..49f5742f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,11 +18,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **V4-scale automatic compaction defaults** — automatic compaction keeps a
   500K-token hard floor and the fallback compaction threshold now reflects
   the V4-scale late-trigger policy instead of the old 50K-era default.
+- **Token-only compaction trigger** — the message-count compaction trigger
+  was a 128K-era heuristic that fired on long sessions of small messages
+  — exactly the case where rewriting V4's prefix cache is most wasteful.
+  Removed `CompactionConfig::message_threshold` and the message-count
+  branch in `should_compact`; token budget is now the sole automatic
+  trigger (gated by the 500K floor). Manual `/compact` is unchanged.
 
 ### Fixed
 - **Legacy 128K context naming** — the 128K fallback is now named and
   documented as legacy DeepSeek-only behavior, reducing ambiguity with the
   1M-token DeepSeek V4 defaults.
+- **`npm install` resilience for slow / firewalled networks** — the
+  postinstall binary fetch from GitHub Releases now retries on transient
+  errors (5 attempts, 1-16 s exponential backoff with jitter), enforces a
+  per-attempt timeout (default 5 min, configurable via
+  `DEEPSEEK_TUI_DOWNLOAD_TIMEOUT_MS`) plus a 30 s stall detector, honors
+  `HTTPS_PROXY` / `HTTP_PROXY` / `NO_PROXY` env vars (pure-Node CONNECT
+  tunneling, no new dependencies), and prints a download-progress line
+  to stderr so users know it isn't hung. Suppressible with
+  `DEEPSEEK_TUI_QUIET_INSTALL=1`. Reported by a community user from China
+  whose install through a CN npm mirror took 18 minutes — the bottleneck
+  was the GitHub fetch, which CN npm mirrors do not proxy.
 
 ## [0.8.10] - 2026-05-04
 

From 76dd924c7fcb791f4d48b1acac509ffaa566c205 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:27:11 -0500
Subject: [PATCH 10/11] fix(engine): turn_meta must skip tool-result messages
 (HTTP 400 fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live-test repro: typing a single user message in the TUI triggered a
tool call (read_file Cargo.toml), and the *next* request to DeepSeek's
API returned HTTP 400:

    "An assistant message with 'tool_calls' must be followed by tool
    messages responding to each 'tool_call_id'. (insufficient tool
    messages following tool_calls message)"

Root cause: `messages_with_turn_metadata` walked the message list from
the tail and prepended a `<turn_meta>` Text block to the *last* message
with role="user". But tool-result messages also use role="user"
internally (they serialize to role="tool" on the wire). Inserting a
Text content block at index 0 of a tool-result message changed the
shape from `[ToolResult(...)]` to `[Text("turn_meta..."), ToolResult(...)]`,
which on the wire becomes a role="user" message with text instead of
the role="tool" message the API needs to satisfy the assistant's
prior tool_call. Hence the 400.

The fix:

* Restrict the injection target to messages that have at least one
  Text content block AND no ToolResult blocks. This identifies actual
  user-typed messages and skips tool-result envelopes.
* When the history has no eligible user message at all (e.g. the
  original prompt was compacted away and only a tool result remains),
  skip injection entirely. The working_set will surface again on the
  next genuine user prompt. A trailing tool-result message is never
  the target: the search walks past it to the most recent genuine
  user message, if one exists.

Two regression tests pin the contract:

* `turn_metadata_skips_tool_result_messages` — assistant tool_call +
  tool_result + earlier user message: only the user message gets the
  prefix, the tool_result message stays a single-block ToolResult.
* `turn_metadata_skips_when_only_tool_results_trail` — the corner
  case where the trailing user-role message is solely a tool result
  (no real user message in the slice): no injection happens, the
  message returns unchanged.

Verified locally:

* 2038 tests passed in TUI bin (2 ignored; up from 2036 — the +2 are
  the new regression tests).
* `cargo fmt`, `cargo clippy --locked -D warnings`, parity gates all
  clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/core/engine/tests.rs     | 126 ++++++++++++++++++++++++
 crates/tui/src/core/engine/turn_loop.rs |  26 ++++-
 2 files changed, 147 insertions(+), 5 deletions(-)

diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index 9c28826d..00a82681 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -567,6 +567,132 @@ fn working_set_reaches_model_as_turn_metadata() {
     assert!(text.contains("src/lib.rs"));
 }
 
+/// v0.8.11 regression: tool-result messages serialize to role="tool" on
+/// the wire but are stored as role="user" internally. Prepending
+/// `<turn_meta>` text onto a tool-result message broke the
+/// assistant→tool_result invariant and caused HTTP 400 from DeepSeek's
+/// API ("insufficient tool messages following tool_calls"). The fix:
+/// inject only into messages that have a Text content block and no
+/// ToolResult blocks; a trailing tool-result message is skipped and
+/// the prefix lands on the latest genuine user message instead.
+#[test]
+fn turn_metadata_skips_tool_result_messages() {
+    let tmp = tempdir().expect("tempdir");
+    fs::create_dir_all(tmp.path().join("src")).expect("mkdir");
+    fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write");
+
+    let config = EngineConfig {
+        workspace: tmp.path().to_path_buf(),
+        ..Default::default()
+    };
+    let (mut engine, _handle) = Engine::new(config, &Config::default());
+    engine
+        .session
+        .working_set
+        .observe_user_message("inspect src/lib.rs", tmp.path());
+
+    // Real user message — should be eligible for injection.
+    engine.session.add_message(Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::Text {
+            text: "inspect src/lib.rs".to_string(),
+            cache_control: None,
+        }],
+    });
+    // Assistant tool-call.
+    engine.session.add_message(Message {
+        role: "assistant".to_string(),
+        content: vec![ContentBlock::ToolUse {
+            id: "call_42".to_string(),
+            name: "read_file".to_string(),
+            input: serde_json::json!({"path": "src/lib.rs"}),
+            caller: None,
+        }],
+    });
+    // Tool result, stored as role="user" internally.
+    engine.session.add_message(Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::ToolResult {
+            tool_use_id: "call_42".to_string(),
+            content: "pub fn sample() {}".to_string(),
+            is_error: None,
+            content_blocks: None,
+        }],
+    });
+
+    let messages = engine.messages_with_turn_metadata();
+
+    // The trailing message is the tool result and MUST be untouched —
+    // no Text block sneaking in front of the ToolResult block.
+    let trailing = messages.last().expect("trailing message");
+    assert_eq!(trailing.role, "user");
+    assert_eq!(trailing.content.len(), 1);
+    assert!(matches!(
+        trailing.content.first(),
+        Some(ContentBlock::ToolResult { .. })
+    ));
+
+    // The earlier real user message receives the turn_meta prefix.
+    let real_user = messages.first().expect("first user message");
+    assert_eq!(real_user.role, "user");
+    let ContentBlock::Text { text, .. } = real_user
+        .content
+        .first()
+        .expect("user text content")
+    else {
+        panic!("expected Text block on real user message");
+    };
+    assert!(text.starts_with("<turn_meta>\n"));
+    assert!(text.contains("src/lib.rs"));
+}
+
+/// When the history holds no genuine user message at all and the only
+/// user-role message is a tool result, no turn_meta is injected and the
+/// messages come back unchanged: a Text block must never be prepended
+/// onto a tool-result envelope. The working_set surfaces again on the
+/// next genuine user prompt.
+#[test]
+fn turn_metadata_skips_when_only_tool_results_trail() {
+    let tmp = tempdir().expect("tempdir");
+    fs::create_dir_all(tmp.path().join("src")).expect("mkdir");
+    fs::write(tmp.path().join("src/lib.rs"), "pub fn sample() {}").expect("write");
+
+    let config = EngineConfig {
+        workspace: tmp.path().to_path_buf(),
+        ..Default::default()
+    };
+    let (mut engine, _handle) = Engine::new(config, &Config::default());
+    engine
+        .session
+        .working_set
+        .observe_user_message("inspect src/lib.rs", tmp.path());
+
+    // Only a tool-result message in history — simulates the corner case
+    // where the prior real user message has already been compacted away
+    // but a tool-result is still pending. We must not retroactively
+    // inject.
+    engine.session.add_message(Message {
+        role: "user".to_string(),
+        content: vec![ContentBlock::ToolResult {
+            tool_use_id: "call_42".to_string(),
+            content: "pub fn sample() {}".to_string(),
+            is_error: None,
+            content_blocks: None,
+        }],
+    });
+
+    let messages = engine.messages_with_turn_metadata();
+
+    // Returned unchanged: the single tool-result message, no Text
+    // prefix, content length == 1.
+    let only = messages.last().expect("trailing message");
+    assert_eq!(only.content.len(), 1);
+    assert!(matches!(
+        only.content.first(),
+        Some(ContentBlock::ToolResult { .. })
+    ));
+}
+
 #[test]
 fn refresh_system_prompt_is_noop_when_unchanged() {
     let tmp = tempdir().expect("tempdir");
diff --git a/crates/tui/src/core/engine/turn_loop.rs b/crates/tui/src/core/engine/turn_loop.rs
index 2220dc9a..18071ef5 100644
--- a/crates/tui/src/core/engine/turn_loop.rs
+++ b/crates/tui/src/core/engine/turn_loop.rs
@@ -1607,11 +1607,27 @@ impl Engine {
         };
 
         let mut messages = self.session.messages.clone();
-        let Some(last_user) = messages
-            .iter_mut()
-            .rev()
-            .find(|message| message.role == "user")
-        else {
+        // v0.8.11 hotfix: tool-result messages are stored as role="user" in
+        // our internal representation but serialize to role="tool" on the
+        // wire. Prepending a Text block onto a tool-result message breaks
+        // the assistant→tool_result invariant — the API rejects the request
+        // with `"insufficient tool messages following tool_calls"`. Inject
+        // only into actual user-typed messages, recognizable by having at
+        // least one Text content block (and no ToolResult blocks).
+        let Some(last_user) = messages.iter_mut().rev().find(|message| {
+            message.role == "user"
+                && message
+                    .content
+                    .iter()
+                    .all(|block| !matches!(block, ContentBlock::ToolResult { .. }))
+                && message
+                    .content
+                    .iter()
+                    .any(|block| matches!(block, ContentBlock::Text { .. }))
+        }) else {
+            // No genuine user message anywhere in history (e.g. only tool
+            // results survived compaction). Skip injection — the working_set
+            // will surface again on the next genuine user prompt.
             return messages;
         };
 

From 9e67e04e4a80f35feb4133a2c0b8f778c77f9b17 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 22:37:23 -0500
Subject: [PATCH 11/11] fix(install,tests): fmt nit + downloadText flowing-mode
 bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI on PR #684 caught two real issues that local checks missed:

**Lint failure (cargo fmt).** A regression test landed with a multi-line
`let ContentBlock::Text { text, .. } = real_user.content...` pattern
that local rustfmt accepted but CI's pinned toolchain collapsed onto a
single line. Reformatted to match.

**npm wrapper smoke failure ("Checksum manifest is missing
deepseek-<platform>").** Subtle Node.js streams interaction in
`install.js` introduced by the network-resilience cluster:

* `httpRequest` attaches a `data` event listener on the response to
  re-arm the stall timer.
* Attaching a `data` listener on a `Readable` puts the stream into
  flowing mode immediately.
* `downloadText` then ran `for await (const chunk of response)` to
  collect the body — the async iterator expects paused-mode and
  silently misses chunks that flow before / between iteration ticks.
* For small bodies (the ~100-byte SHA256 manifest), the entire
  response could flow through the stall listener before the async
  iterator's `read()` calls landed, leaving the joined body empty.
* Result: `parseChecksumManifest("")` returned an empty Map →
  `verifyChecksum` saw no entries → "manifest is missing X" after
  the actual binary download succeeded.

Binary downloads were unaffected because `download()` uses
`response.pipe(sink)` plus a `data` listener for progress — both
consume chunks via `data` events, no async iterator involved.

Fix: collect the response body in `downloadText` via direct `data`/
`end` event subscription. `data` listeners stack — both the stall
re-arm and the body collector fire on every chunk, no flowing-vs-
paused conflict. Stall detection still works.

Verified locally: `node scripts/release/npm-wrapper-smoke.js` now prints
"npm wrapper smoke passed with local assets from <url>".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/core/engine/tests.rs |  5 +----
 npm/deepseek-tui/scripts/install.js | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index 00a82681..d6177f8b 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -635,10 +635,7 @@ fn turn_metadata_skips_tool_result_messages() {
     // The earlier real user message receives the turn_meta prefix.
     let real_user = messages.first().expect("first user message");
     assert_eq!(real_user.role, "user");
-    let ContentBlock::Text { text, .. } = real_user
-        .content
-        .first()
-        .expect("user text content")
+    let ContentBlock::Text { text, .. } = real_user.content.first().expect("user text content")
     else {
         panic!("expected Text block on real user message");
     };
diff --git a/npm/deepseek-tui/scripts/install.js b/npm/deepseek-tui/scripts/install.js
index 52e0219e..9bfd1795 100644
--- a/npm/deepseek-tui/scripts/install.js
+++ b/npm/deepseek-tui/scripts/install.js
@@ -778,12 +778,26 @@ async function downloadText(url) {
       stallMs: downloadStallMs(),
     });
     const response = result.response;
-    const chunks = [];
     response.setEncoding("utf8");
-    for await (const chunk of response) {
-      chunks.push(chunk);
-    }
-    return chunks.join("");
+    // NOTE: do NOT use `for await (const chunk of response)` here.
+    // `httpRequest` attaches a `data` listener on the response to re-arm
+    // the stall timer, which puts the stream in flowing mode. The async
+    // iterator expects paused mode and will silently miss every chunk —
+    // this manifested as an empty checksum manifest in the npm wrapper
+    // smoke test ("Checksum manifest is missing <asset>"). Subscribing
+    // to `data` events directly stacks alongside the stall listener and
+    // both fire per chunk, so we collect the body correctly without
+    // disturbing the stall detection.
+    return new Promise((resolve, reject) => {
+      const chunks = [];
+      response.on("data", (chunk) => {
+        chunks.push(chunk);
+      });
+      response.on("end", () => {
+        resolve(chunks.join(""));
+      });
+      response.on("error", reject);
+    });
   });
 }