release: v0.7.5 — token-basis fixes, shell timeout recovery, context/cache policy

Issues #202, #203, #204, #205:

- Cycle/seam triggers use active request input size + response headroom reserve, not lifetime cumulative API usage.
- V4 hard-cycle headroom is calibrated around the fixed TURN_MAX_OUTPUT_TOKENS plus the CONTEXT_HEADROOM_TOKENS safety buffer.
- /tokens, /cost, footer/header labels, and docs now separate active context, turn telemetry, cumulative usage, cache hit/miss, context percent, and cost.
- Foreground exec_shell timeout output tells the model the process was killed and suggests task_shell_start or background exec_shell plus poll/wait.
- Added regression tests for active-token basis, V4 headroom, seam trigger basis, footer label behavior, and shell timeout recovery metadata.
- Preserved #200/#201 policy: V4 default is append-only, prefix-cache preserving; replacement compaction, Flash seams, and capacity intervention remain opt-in.
2026-04-29 10:13:27 -05:00
parent 0578eb701e
commit c2b2c284f6
28 changed files with 412 additions and 181 deletions
@@ -1011,7 +1011,7 @@ dependencies = [

 [[package]]
 name = "deepseek-agent"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "deepseek-config",
 "serde",
@@ -1019,7 +1019,7 @@ dependencies = [

 [[package]]
 name = "deepseek-app-server"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "axum",
@@ -1042,7 +1042,7 @@ dependencies = [

 [[package]]
 name = "deepseek-config"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "deepseek-secrets",
@@ -1055,7 +1055,7 @@ dependencies = [

 [[package]]
 name = "deepseek-core"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "chrono",
@@ -1074,7 +1074,7 @@ dependencies = [

 [[package]]
 name = "deepseek-execpolicy"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "deepseek-protocol",
@@ -1083,7 +1083,7 @@ dependencies = [

 [[package]]
 name = "deepseek-hooks"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -1097,7 +1097,7 @@ dependencies = [

 [[package]]
 name = "deepseek-mcp"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "deepseek-protocol",
@@ -1107,7 +1107,7 @@ dependencies = [

 [[package]]
 name = "deepseek-protocol"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "serde",
 "serde_json",
@@ -1115,7 +1115,7 @@ dependencies = [

 [[package]]
 name = "deepseek-secrets"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "dirs",
 "keyring",
@@ -1128,7 +1128,7 @@ dependencies = [

 [[package]]
 name = "deepseek-state"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "chrono",
@@ -1140,7 +1140,7 @@ dependencies = [

 [[package]]
 name = "deepseek-tools"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -1153,7 +1153,7 @@ dependencies = [

 [[package]]
 name = "deepseek-tui"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "arboard",
@@ -1213,7 +1213,7 @@ dependencies = [

 [[package]]
 name = "deepseek-tui-cli"
-version = "0.7.4"
+version = "0.7.5"
 dependencies = [
 "anyhow",
 "chrono",
@@ -1236,7 +1236,7 @@ dependencies = [

 [[package]]
 name = "deepseek-tui-core"
-version = "0.7.4"
+version = "0.7.5"

 [[package]]
 name = "deranged"
@@ -19,7 +19,7 @@ default-members = ["crates/cli", "crates/app-server", "crates/tui"]
 resolver = "2"

 [workspace.package]
-version = "0.7.4"
+version = "0.7.5"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/Hmbown/DeepSeek-TUI"
@@ -200,9 +200,13 @@ exponential_base = 2.0
 [context]
 enabled = false
 verbatim_window_turns = 16
+# Thresholds are based on the active request input estimate, not lifetime
+# summed API usage.
 l1_threshold = 192000
 l2_threshold = 384000
 l3_threshold = 576000
+# Hard cycle also reserves the normal 262144-token output budget plus 1024
+# safety tokens against the model window.
 cycle_threshold = 768000
 seam_model = "deepseek-v4-flash"

@@ -7,5 +7,5 @@ repository.workspace = true
 description = "Model/provider registry and fallback strategy for DeepSeek workspace architecture"

 [dependencies]
-deepseek-config = { path = "../config", version = "0.7.4" }
+deepseek-config = { path = "../config", version = "0.7.5" }
 serde.workspace = true
@@ -10,15 +10,15 @@ description = "Codex-style app-server transport for DeepSeek workspace architect
 anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
-deepseek-agent = { path = "../agent", version = "0.7.4" }
-deepseek-config = { path = "../config", version = "0.7.4" }
-deepseek-core = { path = "../core", version = "0.7.4" }
-deepseek-execpolicy = { path = "../execpolicy", version = "0.7.4" }
-deepseek-hooks = { path = "../hooks", version = "0.7.4" }
-deepseek-mcp = { path = "../mcp", version = "0.7.4" }
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
-deepseek-state = { path = "../state", version = "0.7.4" }
-deepseek-tools = { path = "../tools", version = "0.7.4" }
+deepseek-agent = { path = "../agent", version = "0.7.5" }
+deepseek-config = { path = "../config", version = "0.7.5" }
+deepseek-core = { path = "../core", version = "0.7.5" }
+deepseek-execpolicy = { path = "../execpolicy", version = "0.7.5" }
+deepseek-hooks = { path = "../hooks", version = "0.7.5" }
+deepseek-mcp = { path = "../mcp", version = "0.7.5" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
+deepseek-state = { path = "../state", version = "0.7.5" }
+deepseek-tools = { path = "../tools", version = "0.7.5" }
 serde.workspace = true
 serde_json.workspace = true
 tokio.workspace = true
@@ -14,13 +14,13 @@ path = "src/main.rs"
 anyhow.workspace = true
 clap.workspace = true
 clap_complete.workspace = true
-deepseek-agent = { path = "../agent", version = "0.7.4" }
-deepseek-app-server = { path = "../app-server", version = "0.7.4" }
-deepseek-config = { path = "../config", version = "0.7.4" }
-deepseek-execpolicy = { path = "../execpolicy", version = "0.7.4" }
-deepseek-mcp = { path = "../mcp", version = "0.7.4" }
-deepseek-secrets = { path = "../secrets", version = "0.7.4" }
-deepseek-state = { path = "../state", version = "0.7.4" }
+deepseek-agent = { path = "../agent", version = "0.7.5" }
+deepseek-app-server = { path = "../app-server", version = "0.7.5" }
+deepseek-config = { path = "../config", version = "0.7.5" }
+deepseek-execpolicy = { path = "../execpolicy", version = "0.7.5" }
+deepseek-mcp = { path = "../mcp", version = "0.7.5" }
+deepseek-secrets = { path = "../secrets", version = "0.7.5" }
+deepseek-state = { path = "../state", version = "0.7.5" }
 chrono.workspace = true
 dirs.workspace = true
 serde.workspace = true
@@ -8,7 +8,7 @@ description = "Config schema and precedence model for DeepSeek workspace archite

 [dependencies]
 anyhow.workspace = true
-deepseek-secrets = { path = "../secrets", version = "0.7.4" }
+deepseek-secrets = { path = "../secrets", version = "0.7.5" }
 dirs.workspace = true
 serde.workspace = true
 serde_json.workspace = true
@@ -9,14 +9,14 @@ description = "Core runtime boundaries for DeepSeek workspace architecture"
 [dependencies]
 anyhow.workspace = true
 chrono.workspace = true
-deepseek-agent = { path = "../agent", version = "0.7.4" }
-deepseek-config = { path = "../config", version = "0.7.4" }
-deepseek-execpolicy = { path = "../execpolicy", version = "0.7.4" }
-deepseek-hooks = { path = "../hooks", version = "0.7.4" }
-deepseek-mcp = { path = "../mcp", version = "0.7.4" }
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
-deepseek-state = { path = "../state", version = "0.7.4" }
-deepseek-tools = { path = "../tools", version = "0.7.4" }
+deepseek-agent = { path = "../agent", version = "0.7.5" }
+deepseek-config = { path = "../config", version = "0.7.5" }
+deepseek-execpolicy = { path = "../execpolicy", version = "0.7.5" }
+deepseek-hooks = { path = "../hooks", version = "0.7.5" }
+deepseek-mcp = { path = "../mcp", version = "0.7.5" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
+deepseek-state = { path = "../state", version = "0.7.5" }
+deepseek-tools = { path = "../tools", version = "0.7.5" }
 serde_json.workspace = true
 tokio.workspace = true
 uuid.workspace = true
@@ -8,5 +8,5 @@ description = "Execution policy and approval model parity for DeepSeek workspace

 [dependencies]
 anyhow.workspace = true
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
 serde.workspace = true
@@ -10,7 +10,7 @@ description = "Hook dispatch and notifications parity for DeepSeek workspace arc
 anyhow.workspace = true
 async-trait.workspace = true
 chrono.workspace = true
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
 reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
@@ -8,6 +8,6 @@ description = "MCP server lifecycle and tool proxy compatibility for DeepSeek wo

 [dependencies]
 anyhow.workspace = true
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
 serde.workspace = true
 serde_json.workspace = true
@@ -9,7 +9,7 @@ description = "Tool invocation lifecycle, schema validation, and scheduler paral
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
-deepseek-protocol = { path = "../protocol", version = "0.7.4" }
+deepseek-protocol = { path = "../protocol", version = "0.7.5" }
 serde.workspace = true
 serde_json.workspace = true
 tokio.workspace = true
@@ -13,8 +13,8 @@ path = "src/main.rs"
 [dependencies]
 anyhow = "1.0.100"
 arboard = "3.4"
-deepseek-secrets = { path = "../secrets", version = "0.7.4" }
-deepseek-tools = { path = "../tools", version = "0.7.4" }
+deepseek-secrets = { path = "../secrets", version = "0.7.5" }
+deepseek-tools = { path = "../tools", version = "0.7.5" }
 async-stream = "0.3.6"
 async-trait = "0.1"
 bytes = "1.11.0"
@@ -3,10 +3,40 @@
 //! Debug commands: tokens, cost, system, context, undo, retry

 use super::CommandResult;
-use crate::models::SystemPrompt;
+use crate::compaction::estimate_input_tokens_conservative;
+use crate::models::{SystemPrompt, context_window_for_model};
 use crate::tui::app::{App, AppAction};
 use crate::tui::history::HistoryCell;

+fn token_count(value: Option<u32>) -> String {
+    value.map_or_else(|| "not reported".to_string(), |tokens| tokens.to_string())
+}
+
+fn active_context_summary(app: &App) -> String {
+    let estimated =
+        estimate_input_tokens_conservative(&app.api_messages, app.system_prompt.as_ref());
+    match context_window_for_model(&app.model) {
+        Some(window) => {
+            let used = estimated.min(window as usize);
+            let percent = (used as f64 / f64::from(window) * 100.0).clamp(0.0, 100.0);
+            format!("~{used} / {window} ({percent:.1}%)")
+        }
+        None => format!("~{estimated} / unknown window"),
+    }
+}
+
+fn cache_summary(app: &App) -> String {
+    match (
+        app.last_prompt_cache_hit_tokens,
+        app.last_prompt_cache_miss_tokens,
+    ) {
+        (Some(hit), Some(miss)) => format!("{hit} hit / {miss} miss"),
+        (Some(hit), None) => format!("{hit} hit / miss not reported"),
+        (None, Some(miss)) => format!("hit not reported / {miss} miss"),
+        (None, None) => "not reported".to_string(),
+    }
+}
+
 /// Show token usage for session
 pub fn tokens(app: &mut App) -> CommandResult {
    let message_count = app.api_messages.len();
@@ -15,12 +45,24 @@ pub fn tokens(app: &mut App) -> CommandResult {
    CommandResult::message(format!(
        "Token Usage:\n\
         ─────────────────────────────\n\
-         Total tokens:     {}\n\
-         Session cost:     ${:.4}\n\
-         API messages:     {}\n\
-         Chat messages:    {}\n\
-         Model:            {}",
-        app.total_tokens, app.session_cost, message_count, chat_count, app.model,
+         Active context:        {}\n\
+         Last API input:        {} (turn telemetry; may count repeated prefix across tool rounds)\n\
+         Last API output:       {}\n\
+         Cache hit/miss:        {} (telemetry/cost only)\n\
+         Cumulative tokens:     {} (session usage telemetry)\n\
+         Approx session cost:   ${:.4}\n\
+         API messages:          {}\n\
+         Chat messages:         {}\n\
+         Model:                 {}",
+        active_context_summary(app),
+        token_count(app.last_prompt_tokens),
+        token_count(app.last_completion_tokens),
+        cache_summary(app),
+        app.total_tokens,
+        app.session_cost,
+        message_count,
+        chat_count,
+        app.model,
    ))
 }

@@ -29,7 +71,8 @@ pub fn cost(app: &mut App) -> CommandResult {
    CommandResult::message(format!(
        "Session Cost:\n\
         ─────────────────────────────\n\
-         Total spent:      ${:.4}\n\n\
+         Approx total spent: ${:.4}\n\n\
+         Cost estimates are approximate and use provider usage telemetry when available.\n\n\
         DeepSeek API Pricing:\n\
         ─────────────────────────────\n\
         Pricing details are not configured in this CLI.",
@@ -113,9 +156,16 @@ mod tests {
        let mut app = create_test_app();
        app.total_tokens = 1234;
        app.session_cost = 0.05;
+        app.last_prompt_tokens = Some(100);
+        app.last_completion_tokens = Some(25);
+        app.last_prompt_cache_hit_tokens = Some(70);
+        app.last_prompt_cache_miss_tokens = Some(30);
        app.api_messages.push(Message {
            role: "user".to_string(),
-            content: vec![],
+            content: vec![ContentBlock::Text {
+                text: "test".to_string(),
+                cache_control: None,
+            }],
        });
        app.history.push(HistoryCell::User {
            content: "test".to_string(),
@@ -125,8 +175,13 @@ mod tests {
        assert!(result.message.is_some());
        let msg = result.message.unwrap();
        assert!(msg.contains("Token Usage"));
-        assert!(msg.contains("Total tokens:"));
-        assert!(msg.contains("Session cost:"));
+        assert!(msg.contains("Active context:"));
+        assert!(msg.contains("Last API input:"));
+        assert!(msg.contains("Last API output:"));
+        assert!(msg.contains("Cache hit/miss:"));
+        assert!(msg.contains("70 hit / 30 miss"));
+        assert!(msg.contains("Cumulative tokens:"));
+        assert!(msg.contains("Approx session cost:"));
        assert!(msg.contains("API messages:"));
        assert!(msg.contains("Chat messages:"));
        assert!(msg.contains("Model:"));
@@ -140,7 +195,8 @@ mod tests {
        assert!(result.message.is_some());
        let msg = result.message.unwrap();
        assert!(msg.contains("Session Cost"));
-        assert!(msg.contains("Total spent:"));
+        assert!(msg.contains("Approx total spent:"));
+        assert!(msg.contains("approximate"));
        assert!(msg.contains("$0.1234"));
    }

@@ -445,7 +445,7 @@ pub struct ContextConfig {
    /// Verbatim window: last N turns never summarized. Default: 16.
    #[serde(default)]
    pub verbatim_window_turns: Option<usize>,
-    /// Soft seam thresholds (cumulative input+output tokens).
+    /// Soft seam thresholds based on the active request input estimate.
    #[serde(default)]
    pub l1_threshold: Option<usize>,
    #[serde(default)]
@@ -354,9 +354,9 @@ fn should_transparently_retry_stream(
 /// Max output tokens requested for normal agent turns. Generous on purpose:
 /// V4 thinking models can produce tens of thousands of reasoning tokens on
 /// hard prompts before the visible reply, and DeepSeek V4 ships with a 1M
-/// context window. 256K leaves the model effectively unconstrained on
-/// output without us imposing artificial per-turn caps that surfaced as the
-/// assistant "stopping mid-response" when reasoning consumed the budget.
+/// context window. v0.7.5 keeps this cap fixed instead of silently lowering
+/// `max_tokens` near pressure; hard-cycle/preflight checks reserve this budget
+/// plus safety headroom before sending the next request.
 const TURN_MAX_OUTPUT_TOKENS: u32 = 262_144;
 /// Keep this many most recent messages when emergency trimming is required.
 const MIN_RECENT_MESSAGES_TO_KEEP: usize = 4;
@@ -1199,6 +1199,10 @@ fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usi
        .and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
 }

+fn turn_response_headroom_tokens() -> u64 {
+    u64::from(TURN_MAX_OUTPUT_TOKENS).saturating_add(CONTEXT_HEADROOM_TOKENS as u64)
+}
+
 fn is_context_length_error_message(message: &str) -> bool {
    crate::error_taxonomy::classify_error_message(message) == ErrorCategory::InvalidInput
 }
@@ -2440,7 +2444,7 @@ impl Engine {
    /// Handle a turn using the DeepSeek API.
    #[allow(clippy::too_many_lines)]
    /// Run the pre-request layered-context checkpoint (#159). Checks whether
-    /// cumulative tokens have crossed a soft-seam threshold and, if so,
+    /// the active input estimate has crossed a soft-seam threshold and, if so,
    /// produces an `<archived_context>` block via Flash and appends it as an
    /// assistant message. Called from `handle_deepseek_turn` before each API
    /// request so the model always has the latest navigation aids.
@@ -2452,18 +2456,8 @@ impl Engine {
            return;
        }

-        // Cumulative tokens: session total (all turns so far) + current
-        // estimated input (the messages that will be sent next).
-        let cumulative_input = self
-            .session
-            .total_usage
-            .input_tokens
-            .saturating_add(self.session.total_usage.output_tokens);
-        let cumulative_estimate =
-            cumulative_input.saturating_add(self.estimated_input_tokens() as u64);
-
        let highest = seam_mgr.highest_level().await;
-        let Some(level) = seam_mgr.seam_level_for(cumulative_estimate as usize, highest) else {
+        let Some(level) = seam_mgr.seam_level_for(self.estimated_input_tokens(), highest) else {
            return;
        };

@@ -2563,8 +2557,8 @@ impl Engine {
    /// they're still running.
    async fn maybe_advance_cycle(&mut self, mode: AppMode) {
        if !should_advance_cycle(
-            self.session.total_usage.input_tokens,
-            self.session.total_usage.output_tokens,
+            self.estimated_input_tokens() as u64,
+            turn_response_headroom_tokens(),
            &self.session.model,
            &self.config.cycle,
            false,
@@ -29,9 +29,12 @@
 //!
 //! ## Trigger
 //!
-//! - Token threshold: **768K** by default (~75% of the 1M window). This is a
-//!   rare overflow safety net. Optional soft seams at 192K/384K/576K are
-//!   controlled by the opt-in layered context manager (#159).
+//! - Token threshold: **768K** active input by default (~75% of the 1M window).
+//!   This is a rare overflow safety net. The trigger is based on the next
+//!   request's live input estimate, not lifetime summed API usage, with
+//!   assistant-output and safety headroom considered against the model window.
+//!   Optional soft seams at 192K/384K/576K are controlled by the opt-in layered
+//!   context manager (#159).
 //! - Phase guard: callers only invoke `should_advance_cycle` at clean turn
 //!   boundaries (no in-flight tool, no streaming, no approval modal).
 //! - Per-model overrides: `[cycle.per_model]` in config.toml lets operators
@@ -48,7 +51,9 @@ use serde::{Deserialize, Serialize};

 use crate::client::DeepSeekClient;
 use crate::llm_client::LlmClient;
-use crate::models::{ContentBlock, Message, MessageRequest, SystemBlock, SystemPrompt};
+use crate::models::{
+    ContentBlock, Message, MessageRequest, SystemBlock, SystemPrompt, context_window_for_model,
+};
 use crate::tools::plan::{PlanSnapshot, SharedPlanState};
 use crate::tools::subagent::{SharedSubAgentManager, SubAgentResult, SubAgentStatus};
 use crate::tools::todo::{SharedTodoList, TodoListSnapshot};
@@ -151,14 +156,20 @@ pub struct CycleBriefing {

 /// Decide whether a cycle boundary should fire.
 ///
-/// `usage` is the *cumulative* session input+output tokens (both `u64` to
-/// match `SessionUsage`). `in_flight` is true when a tool is mid-execution,
-/// stream is open, or an approval modal is pending — in those cases the
-/// caller must wait until the next clean boundary.
+/// `active_input_tokens` is the estimated token count of the next request's
+/// current input, including previous assistant/tool output that is now part of
+/// the transcript. `reserved_response_headroom_tokens` is the max output budget
+/// plus any provider safety headroom reserved for that next request. Lifetime
+/// API usage is intentionally not used here because it repeatedly counts the
+/// same stable prefix across requests.
+///
+/// `in_flight` is true when a tool is mid-execution, stream is open, or an
+/// approval modal is pending — in those cases the caller must wait until the
+/// next clean boundary.
 #[must_use]
 pub fn should_advance_cycle(
-    cumulative_input_tokens: u64,
-    cumulative_output_tokens: u64,
+    active_input_tokens: u64,
+    reserved_response_headroom_tokens: u64,
    model: &str,
    cfg: &CycleConfig,
    in_flight: bool,
@@ -166,12 +177,14 @@ pub fn should_advance_cycle(
    if !cfg.enabled || in_flight {
        return false;
    }
-    let total = cumulative_input_tokens.saturating_add(cumulative_output_tokens);
    let threshold = cfg.threshold_for(model) as u64;
    if threshold == 0 {
        return false;
    }
-    total >= threshold
+    let trigger_floor = context_window_for_model(model)
+        .map(|window| u64::from(window).saturating_sub(reserved_response_headroom_tokens))
+        .map_or(threshold, |window_floor| threshold.min(window_floor));
+    active_input_tokens >= trigger_floor
 }

 /// Roll-up of state that survives a cycle boundary deterministically.
@@ -759,12 +772,60 @@ mod tests {
    }

    #[test]
-    fn should_advance_combines_input_and_output() {
+    fn should_advance_considers_output_plus_safety_headroom() {
        let cfg = CycleConfig::default();
-        // 400K + 400K = 800K > 768K threshold
+        // Below the 768K active-input threshold, but too close to the 1M
+        // model window once the next assistant response and safety headroom are
+        // included.
        assert!(should_advance_cycle(
-            400_000,
-            400_000,
+            737_000,
+            263_168,
+            "deepseek-v4-pro",
+            &cfg,
+            false
+        ));
+    }
+
+    #[test]
+    fn should_not_count_lifetime_api_usage_as_active_context() {
+        let cfg = CycleConfig::default();
+        assert!(!should_advance_cycle(
+            120_000,
+            64_000,
+            "deepseek-v4-pro",
+            &cfg,
+            false
+        ));
+    }
+
+    #[test]
+    fn should_advance_v4_calibrates_threshold_against_output_reserve() {
+        let cfg = CycleConfig::default();
+        let reserve = 263_168;
+        assert!(!should_advance_cycle(
+            700_000,
+            reserve,
+            "deepseek-v4-pro",
+            &cfg,
+            false
+        ));
+        assert!(should_advance_cycle(
+            738_000,
+            reserve,
+            "deepseek-v4-pro",
+            &cfg,
+            false
+        ));
+        assert!(should_advance_cycle(
+            768_000,
+            reserve,
+            "deepseek-v4-pro",
+            &cfg,
+            false
+        ));
+        assert!(should_advance_cycle(
+            900_000,
+            reserve,
            "deepseek-v4-pro",
            &cfg,
            false
@@ -75,7 +75,7 @@ When context is deep (past a soft seam): cache reasoning conclusions in concise

 - **Planning / tracking**: `update_plan` (high-level strategy), `task_create` / `task_list` / `task_read` / `task_cancel` (durable work objects), `checklist_write` (granular progress under the active task/thread), `checklist_add` / `checklist_update` / `checklist_list`, `todo_*` aliases (legacy compatibility), `note` (persistent memory).
 - **File I/O**: `read_file` (PDFs auto-extracted), `list_dir`, `write_file`, `edit_file`, `apply_patch`.
-- **Shell**: `task_shell_start` + `task_shell_wait` for long-running commands, diagnostics, tests, searches, and servers; `exec_shell` for bounded cancellable foreground commands; `exec_shell_wait`, `exec_shell_interact`.
+- **Shell**: `task_shell_start` + `task_shell_wait` for long-running commands, diagnostics, tests, searches, and servers; `exec_shell` for bounded cancellable foreground commands; `exec_shell_wait`, `exec_shell_interact`. If foreground `exec_shell` times out, the process was killed; rerun long work with `task_shell_start` or `exec_shell` using `background: true`, then poll/wait.
 - **Task evidence**: `task_gate_run` for verification gates; `pr_attempt_record` / `pr_attempt_list` / `pr_attempt_read` / `pr_attempt_preflight`; `github_issue_context` / `github_pr_context` (read-only); `github_comment` / `github_close_issue` (approval + evidence required); `automation_*` scheduling tools.
 - **Structured search**: `grep_files`, `file_search`, `web_search`, `fetch_url`, `web.run` (browse).
 - **Git / diag / tests**: `git_status`, `git_diff`, `git_show`, `git_log`, `git_blame`, `diagnostics`, `run_tests`, `review`.
@@ -108,6 +108,7 @@ Don't reach for `exec_shell` when:
 - You just need to read or write a file — `read_file` / `write_file` are faster and show up in the tool log.
 - The command is a single `cat`, `ls`, or `echo` — use `read_file`, `list_dir`, or just state the result.
 - You're tempted to pipe `curl` for a web lookup — `web_search` or `fetch_url` give structured results.
+- The command may run for minutes, start a server, run a full test suite, or perform a scientific/release computation — use `task_shell_start` or `exec_shell` with `background: true`, then poll with `task_shell_wait` or `exec_shell_wait`.

 ### `agent_spawn`
 Don't reach for `agent_spawn` when:
@@ -17,7 +17,7 @@
 //!
 //! ## Soft seam levels
 //!
-//! | Level | Trigger (tokens) | Covers messages    | Density        |
+//! | Level | Active input trigger | Covers messages    | Density        |
 //! |-------|------------------|--------------------|----------------|
 //! | L1    | 192K             | 0–128K             | ~2,500 tokens  |
 //! | L2    | 384K             | 0–320K             | ~1,800 tokens  |
@@ -45,7 +45,7 @@ use crate::models::{ContentBlock, Message, MessageRequest, SystemBlock, SystemPr
 /// Default seam model — Flash is cheap and fast, ideal for summarization.
 pub const DEFAULT_SEAM_MODEL: &str = "deepseek-v4-flash";

-/// Default thresholds (cumulative input+output tokens).
+/// Default thresholds based on the active request input estimate.
 pub const DEFAULT_L1_THRESHOLD: usize = 192_000;
 pub const DEFAULT_L2_THRESHOLD: usize = 384_000;
 pub const DEFAULT_L3_THRESHOLD: usize = 576_000;
@@ -66,7 +66,7 @@ pub struct SeamConfig {
    pub enabled: bool,
    /// Verbatim window: last N turns never summarized.
    pub verbatim_window_turns: usize,
-    /// Soft seam thresholds.
+    /// Soft seam thresholds based on the active request input estimate.
    pub l1_threshold: usize,
    pub l2_threshold: usize,
    pub l3_threshold: usize,
@@ -143,29 +143,14 @@ impl SeamManager {
    }

    /// Determine which seam level (if any) should fire for the given
-    /// cumulative token count. Returns `None` when no seam is due.
+    /// active request input estimate. Returns `None` when no seam is due.
    #[must_use]
    pub fn seam_level_for(
        &self,
-        cumulative_tokens: usize,
+        active_input_tokens: usize,
        highest_existing_level: Option<u8>,
    ) -> Option<u8> {
-        if !self.config.enabled {
-            return None;
-        }
-        let highest = highest_existing_level.unwrap_or(0);
-
-        // Each level fires at most once, and only in order.
-        if highest < 1 && cumulative_tokens >= self.config.l1_threshold {
-            return Some(1);
-        }
-        if highest < 2 && cumulative_tokens >= self.config.l2_threshold {
-            return Some(2);
-        }
-        if highest < 3 && cumulative_tokens >= self.config.l3_threshold {
-            return Some(3);
-        }
-        None
+        seam_level_for_active_input(&self.config, active_input_tokens, highest_existing_level)
    }

    /// Check whether the hard cycle boundary is crossed.
@@ -174,8 +159,8 @@ impl SeamManager {
    /// Kept as the canonical boundary definition for future wiring.
    #[must_use]
    #[allow(dead_code)]
-    pub fn should_cycle(&self, cumulative_tokens: usize) -> bool {
-        self.config.enabled && cumulative_tokens >= self.config.cycle_threshold
+    pub fn should_cycle(&self, active_input_tokens: usize) -> bool {
+        self.config.enabled && active_input_tokens >= self.config.cycle_threshold
    }

    /// Compute the verbatim window: the last N message indices that must
@@ -577,6 +562,30 @@ impl SeamManager {
    }
 }

+#[must_use]
+pub fn seam_level_for_active_input(
+    config: &SeamConfig,
+    active_input_tokens: usize,
+    highest_existing_level: Option<u8>,
+) -> Option<u8> {
+    if !config.enabled {
+        return None;
+    }
+    let highest = highest_existing_level.unwrap_or(0);
+
+    // Each level fires at most once, and only in order.
+    if highest < 1 && active_input_tokens >= config.l1_threshold {
+        return Some(1);
+    }
+    if highest < 2 && active_input_tokens >= config.l2_threshold {
+        return Some(2);
+    }
+    if highest < 3 && active_input_tokens >= config.l3_threshold {
+        return Some(3);
+    }
+    None
+}
+
 /// Truncate a string to max_chars, respecting Unicode boundaries.
 fn truncate_chars(text: &str, max_chars: usize) -> String {
    if max_chars == 0 {
@@ -598,15 +607,29 @@ mod tests {
        // Test the pure logic functions only.
        let config = SeamConfig::default();

-        // Test seam_level_for logic manually.
-        // Below L1
-        assert!(config.enabled && 100_000 < config.l1_threshold);
-        // At L1
-        assert!(192_000 >= config.l1_threshold);
-        // At L2
-        assert!(384_000 >= config.l2_threshold);
-        // At L3
-        assert!(576_000 >= config.l3_threshold);
+        assert_eq!(seam_level_for_active_input(&config, 100_000, None), None);
+        assert_eq!(seam_level_for_active_input(&config, 192_000, None), Some(1));
+        assert_eq!(
+            seam_level_for_active_input(&config, 384_000, Some(1)),
+            Some(2)
+        );
+        assert_eq!(
+            seam_level_for_active_input(&config, 576_000, Some(2)),
+            Some(3)
+        );
+    }
+
+    #[test]
+    fn seam_trigger_uses_active_request_size_not_lifetime_usage() {
+        let config = SeamConfig::default();
+        let lifetime_prompt_usage = 900_000usize;
+        let active_request_input = 120_000usize;
+
+        assert!(lifetime_prompt_usage >= config.l3_threshold);
+        assert_eq!(
+            seam_level_for_active_input(&config, active_request_input, None),
+            None
+        );
    }

    #[test]
@@ -1287,6 +1287,10 @@ use crate::tools::spec::{
 use async_trait::async_trait;
 use serde_json::json;

+const FOREGROUND_TIMEOUT_RECOVERY_HINT: &str = "Foreground exec_shell is for bounded commands. \
+The timed-out process was killed; rerun long work with task_shell_start or exec_shell with \
+background: true, then poll with task_shell_wait or exec_shell_wait.";
+
 async fn execute_foreground_via_background(
    context: &ToolContext,
    command: &str,
@@ -1372,7 +1376,7 @@ impl ToolSpec for ExecShellTool {
    }

    fn description(&self) -> &'static str {
-        "Execute a shell command in the workspace directory. Returns stdout, stderr, and exit code."
+        "Execute a shell command in the workspace directory. Foreground mode is for bounded commands; use background=true or task_shell_start for long-running work, then poll/wait."
    }

    fn input_schema(&self) -> serde_json::Value {
@@ -1389,7 +1393,7 @@ impl ToolSpec for ExecShellTool {
                },
                "background": {
                    "type": "boolean",
-                    "description": "Run in background and return task_id (default: false)"
+                    "description": "Run in background and return task_id (default: false). Prefer true for commands that may run for minutes; poll with exec_shell_wait or task_shell_wait."
                },
                "interactive": {
                    "type": "boolean",
@@ -1599,7 +1603,7 @@ impl ToolSpec for ExecShellTool {
                    )
                } else if result.status == ShellStatus::TimedOut {
                    format!(
-                        "Command timed out after {timeout_ms}ms; process killed.\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
+                        "Command timed out after {timeout_ms}ms; process killed.\n\n{FOREGROUND_TIMEOUT_RECOVERY_HINT}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
                        result.stdout, result.stderr
                    )
                } else {
@@ -1609,44 +1613,60 @@ impl ToolSpec for ExecShellTool {
                    )
                };

+                let mut metadata = json!({
+                    "exit_code": result.exit_code,
+                    "status": format!("{:?}", result.status),
+                    "duration_ms": result.duration_ms,
+                    "sandboxed": result.sandboxed,
+                    "sandbox_type": result.sandbox_type,
+                    "sandbox_denied": result.sandbox_denied,
+                    "task_id": result.task_id,
+                    "stdout_len": result.stdout_len,
+                    "stderr_len": result.stderr_len,
+                    "stdout_truncated": result.stdout_truncated,
+                    "stderr_truncated": result.stderr_truncated,
+                    "stdout_omitted": result.stdout_omitted,
+                    "stderr_omitted": result.stderr_omitted,
+                    "summary": summary,
+                    "stdout_summary": stdout_summary,
+                    "stderr_summary": stderr_summary,
+                    "safety_level": format!("{:?}", safety.level),
+                    "interactive": interactive,
+                    "canceled": was_cancelled,
+                    "execpolicy": execpolicy_decision.as_ref().map(|decision| match decision {
+                        ExecPolicyDecision::Allow => json!({
+                            "decision": "allow",
+                        }),
+                        ExecPolicyDecision::Deny(reason) => json!({
+                            "decision": "deny",
+                            "reason": reason,
+                        }),
+                        ExecPolicyDecision::AskUser(reason) => json!({
+                            "decision": "ask_user",
+                            "reason": reason,
+                        }),
+                    }),
+                });
+                if result.status == ShellStatus::TimedOut && !background && !interactive {
+                    metadata["foreground_timeout_recovery"] = json!({
+                        "process_killed": true,
+                        "hint": FOREGROUND_TIMEOUT_RECOVERY_HINT,
+                        "recommended_tools": [
+                            "task_shell_start",
+                            "task_shell_wait",
+                            "exec_shell",
+                            "exec_shell_wait"
+                        ],
+                        "exec_shell_background": true,
+                        "poll_with": ["task_shell_wait", "exec_shell_wait"]
+                    });
+                }
+
                Ok(ToolResult {
                    content: output,
                    success: result.status == ShellStatus::Completed
                        || result.status == ShellStatus::Running,
-                    metadata: Some(json!({
-                        "exit_code": result.exit_code,
-                        "status": format!("{:?}", result.status),
-                        "duration_ms": result.duration_ms,
-                        "sandboxed": result.sandboxed,
-                        "sandbox_type": result.sandbox_type,
-                        "sandbox_denied": result.sandbox_denied,
-                        "task_id": result.task_id,
-                        "stdout_len": result.stdout_len,
-                        "stderr_len": result.stderr_len,
-                        "stdout_truncated": result.stdout_truncated,
-                        "stderr_truncated": result.stderr_truncated,
-                        "stdout_omitted": result.stdout_omitted,
-                        "stderr_omitted": result.stderr_omitted,
-                        "summary": summary,
-                        "stdout_summary": stdout_summary,
-                        "stderr_summary": stderr_summary,
-                        "safety_level": format!("{:?}", safety.level),
-                        "interactive": interactive,
-                        "canceled": was_cancelled,
-                        "execpolicy": execpolicy_decision.as_ref().map(|decision| match decision {
-                            ExecPolicyDecision::Allow => json!({
-                                "decision": "allow",
-                            }),
-                            ExecPolicyDecision::Deny(reason) => json!({
-                                "decision": "deny",
-                                "reason": reason,
-                            }),
-                            ExecPolicyDecision::AskUser(reason) => json!({
-                                "decision": "ask_user",
-                                "reason": reason,
-                            }),
-                        }),
-                    })),
+                    metadata: Some(metadata),
                })
            }
            Err(e) => Ok(ToolResult::error(format!("Shell execution failed: {e}"))),
@@ -263,6 +263,47 @@ async fn test_exec_shell_metadata_includes_summaries() {
    assert!(meta.get("stdout_truncated").is_some());
 }

+#[tokio::test]
+async fn test_exec_shell_foreground_timeout_guides_background_rerun() {
+    let tmp = tempdir().expect("tempdir");
+    let ctx = ToolContext::new(tmp.path());
+    let tool = ExecShellTool;
+
+    let result = tool
+        .execute(
+            json!({
+                "command": sleep_command(10),
+                "timeout_ms": 1000
+            }),
+            &ctx,
+        )
+        .await
+        .expect("execute");
+
+    assert!(!result.success);
+    assert!(result.content.contains("task_shell_start"));
+    assert!(result.content.contains("background: true"));
+    assert!(result.content.contains("process killed"));
+    let meta = result.metadata.expect("metadata");
+    assert_eq!(meta.get("status").and_then(Value::as_str), Some("TimedOut"));
+    let recovery = meta
+        .get("foreground_timeout_recovery")
+        .expect("timeout recovery metadata");
+    assert_eq!(
+        recovery
+            .get("exec_shell_background")
+            .and_then(Value::as_bool),
+        Some(true)
+    );
+    assert!(
+        recovery
+            .get("hint")
+            .and_then(Value::as_str)
+            .unwrap_or_default()
+            .contains("exec_shell_wait")
+    );
+}
+
 #[tokio::test]
 async fn test_exec_shell_foreground_cancel_kills_process() {
    let tmp = tempdir().expect("tempdir");
@@ -612,13 +612,15 @@ pub struct App {
    pub runtime_turn_id: Option<String>,
    /// Current runtime turn status (if known).
    pub runtime_turn_status: Option<String>,
-    /// Last prompt token usage
+    /// Provider-reported input tokens from the last completed turn. This is
+    /// telemetry/cost data and may sum repeated stable prefixes across tool
+    /// rounds; active context pressure is estimated from `api_messages`.
    pub last_prompt_tokens: Option<u32>,
-    /// Last completion token usage
+    /// Provider-reported output tokens from the last completed turn.
    pub last_completion_tokens: Option<u32>,
-    /// DeepSeek context-cache hit tokens from the last API call.
+    /// DeepSeek context-cache hit tokens from the last API call. Telemetry only.
    pub last_prompt_cache_hit_tokens: Option<u32>,
-    /// DeepSeek context-cache miss tokens from the last API call.
+    /// DeepSeek context-cache miss tokens from the last API call. Telemetry only.
    pub last_prompt_cache_miss_tokens: Option<u32>,
    /// Approximate input tokens spent re-sending prior `reasoning_content` on
    /// the last thinking-mode tool-calling turn (V4 §5.1.1 "Interleaved
@@ -4714,7 +4714,7 @@ fn footer_context_percent_spans(app: &App) -> Vec<Span<'static>> {
        palette::TEXT_MUTED
    };
    vec![Span::styled(
-        format!("ctx {percent:.0}%"),
+        format!("active ctx {percent:.0}%"),
        Style::default().fg(color),
    )]
 }
@@ -4802,7 +4802,7 @@ fn footer_cache_spans(app: &App) -> Vec<Span<'static>> {

    let percent = (f64::from(hit_tokens) / f64::from(total) * 100.0).clamp(0.0, 100.0);
    vec![Span::styled(
-        format!("cache {:.0}%", percent),
+        format!("cache hit {:.0}%", percent),
        Style::default().fg(palette::TEXT_MUTED),
    )]
 }
@@ -575,7 +575,7 @@ fn footer_auxiliary_spans_show_cache_when_compact() {
    app.last_prompt_cache_miss_tokens = Some(12_000);
    app.session_cost = 12.34;

-    let compact = spans_text(&footer_auxiliary_spans(&app, 12));
+    let compact = spans_text(&footer_auxiliary_spans(&app, 14));
    assert!(compact.contains("cache"));
    assert!(!compact.contains('$'));
 }
@@ -589,7 +589,7 @@ fn footer_auxiliary_spans_show_cache_and_cost_when_roomy() {
    app.session_cost = 12.34;

    let roomy = spans_text(&footer_auxiliary_spans(&app, 32));
-    assert!(roomy.contains("cache 75%"));
+    assert!(roomy.contains("cache hit 75%"));
    assert!(roomy.contains("$12.34"));
    assert!(
        !roomy.contains("ctx"),
@@ -31,7 +31,8 @@ pub struct HeaderData<'a> {
    pub context_window: Option<u32>,
    /// Accumulated session cost in USD.
    pub session_cost: f64,
-    /// Input tokens from the most recent API call (current context utilization).
+    /// Active context input tokens used for context utilization. Callers should
+    /// pass a sanitized live-context estimate, not cumulative API usage.
    pub last_prompt_tokens: Option<u32>,
    /// Short label for the current reasoning-effort tier (e.g. "max", "high",
    /// "off"). Rendered as a chip when space allows.
@@ -90,12 +91,12 @@ impl<'a> HeaderData<'a> {
        total_tokens: u32,
        context_window: Option<u32>,
        session_cost: f64,
-        last_prompt_tokens: Option<u32>,
+        active_context_input_tokens: Option<u32>,
    ) -> Self {
        self.total_tokens = total_tokens;
        self.context_window = context_window;
        self.session_cost = session_cost;
-        self.last_prompt_tokens = last_prompt_tokens;
+        self.last_prompt_tokens = active_context_input_tokens;
        self
    }
 }
@@ -157,6 +157,26 @@ Readability semantics:
  `crowded`, `refreshing`, `verifying`, and `resetting`; these are derived from
  capacity and compaction events without exposing internal formulas in normal UI.

+### Token Quantities and Drivers
+
+DeepSeek V4 prefix caching makes token labels matter. These quantities are kept
+separate:
+
+| Quantity | Meaning | Allowed to drive |
+|---|---|---|
+| Active request input estimate | Conservative estimate of the next request's live system prompt and transcript payload. | Header/footer context percent, hard-cycle trigger, opt-in Flash seam trigger, and emergency overflow preflight. |
+| Reserved response headroom | The requested `max_tokens` budget plus safety headroom. v0.7.5 keeps normal turns at `262144` output tokens and adds `1024` safety tokens for context-window checks. | Hard-cycle and emergency overflow budget checks only. |
+| Cumulative API usage | Provider-reported input plus output tokens summed across completed API calls; multi-tool turns may count the same stable prefix more than once. | Session usage and approximate cost telemetry only. |
+| Prompt cache hit/miss | Provider cache telemetry for the most recent call when available. | Cache-hit display and cost estimation only; never compaction, seam, or cycle triggers. |
+| Context percent | Active request input estimate divided by the model context window. | Display only; it mirrors the active-input basis used by context safeguards. |
+| Cost estimate | Approximate spend from provider usage and configured DeepSeek rates. | Display only. |
+
+For the default V4 path, hard cycles fire when active input reaches the smaller
+of the configured cycle threshold (`768000`) and the model window minus reserved
+response headroom. Replacement compaction remains opt-in (`auto_compact = false`
+by default), the Flash seam manager remains opt-in (`[context].enabled = false`),
+and the capacity controller remains disabled unless configured.
+
 ### Command Migration Notes

 If you are upgrading from older releases:
@@ -196,7 +216,9 @@ If you are upgrading from older releases:
  - `[snapshots].enabled` (bool, default `true`)
  - `[snapshots].max_age_days` (int, default `7`)
  - snapshots live under `~/.deepseek/snapshots/<project_hash>/<worktree_hash>/.git` and never use the workspace's own `.git` directory
- `context.*` (optional): append-only Flash seam manager, currently opt-in:
+- `context.*` (optional): append-only Flash seam manager, currently opt-in.
+  Thresholds are compared against the active request input estimate, not
+  cumulative (lifetime) API usage:
  - `[context].enabled` (bool, default `false`)
  - `[context].verbatim_window_turns` (int, default `16`)
  - `[context].l1_threshold` (int, default `192000`)
@@ -15,7 +15,7 @@ chosen over the available shell equivalent. Companion to `crates/tui/src/prompts
  for the same backing operation are a model trap — the LLM will alternate
  between them and the cache hit rate suffers.

-## Final surface (v0.7.4)
+## Current surface (v0.7.5)

 ### File operations

@@ -40,19 +40,25 @@ chosen over the available shell equivalent. Companion to `crates/tui/src/prompts

 | Tool | Niche |
 |---|---|
-| `exec_shell` | Run a shell command. Foreground runs are cancellable, but use them only for bounded commands. |
+| `exec_shell` | Run a shell command. Foreground runs are cancellable, but use them only for bounded commands; a timeout kills the process and the result includes a hint to rerun the command in the background. |
 | `exec_shell_wait` | Poll a background task for incremental output. |
 | `exec_shell_interact` | Send stdin to a running background task and read incremental output. |
 | `task_shell_start` | Start a long-running command in the background and return immediately. Preferred over foreground shell for diagnostics, tests, searches, and servers that may run for minutes. |
 | `task_shell_wait` | Poll a background command. If `gate` is supplied after completion, record structured gate evidence on the active durable task. |

+When a foreground shell command times out, the process is killed rather than
+left running silently. The tool result tells the model to rerun long work with
+`task_shell_start` or `exec_shell` with `background: true`, then poll with
+`task_shell_wait` or `exec_shell_wait`.
+
 Interactive shell jobs are also visible through `/jobs`. The TUI job center is
 fed by the same shell manager as `exec_shell`/`task_shell_start`, and shows the
 command, cwd, elapsed time, status, output tail, process-local shell id, and
 linked durable task id when available. `/jobs show`, `/jobs poll`, `/jobs wait`,
 `/jobs stdin`, and `/jobs cancel` provide inspect, polling, stdin, and cancel
-controls for live jobs. Jobs are process-local; after restart, detached entries
-are marked stale rather than presented as live processes.
+controls for live jobs. Jobs are process-local; after restart, live process
+state is not reattached, and any remembered detached entries must be marked
+stale rather than presented as live processes.

 ### MCP manager and palette discovery

@@ -1,7 +1,7 @@
 {
  "name": "deepseek-tui",
-  "version": "0.7.4",
-  "deepseekBinaryVersion": "0.7.4",
+  "version": "0.7.5",
+  "deepseekBinaryVersion": "0.7.5",
  "description": "Install and run deepseek and deepseek-tui binaries from GitHub release artifacts.",
  "author": "Hmbown",
  "license": "MIT",