From 7712a372721caefce0c1a0f8146bdc50e7631258 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 12:54:59 -0500
Subject: [PATCH 1/2] feat(compaction): debug telemetry on summary calls +
 document framing fork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups from the post-#572 cache-aligned compaction review (#584):

1. `should_use_cache_aligned_summary` now carries a doc comment that
   explains why the cache-aligned and fallback summary requests are
   framed differently. Cache-aligned replays the conversation as the
   model's own history under `system: None`; fallback reformats it
   into a `User:/Assistant:` transcript under a "concise summaries"
   system prompt. The fallback's external-transcript framing is more
   conservative for the older / smaller models the cache-aligned path
   explicitly excludes, so dropping the system prompt risks regressing
   those models without a corresponding gain. Unifying the two paths
   is a research question that wants an A/B summary-quality eval, not
   a drive-by cleanup — flagged here for a future PR rather than
   resolved silently.

2. `create_summary` now emits one `tracing::debug!` event per summary
   call carrying which path was chosen, the prompt-token count, and
   the cache-hit / miss split. Filter with `RUST_LOG=compaction=debug`
   (or the full module path
   `RUST_LOG=deepseek_tui::compaction=debug`). This makes the V4
   prefix-cache win from #572 observable post-deploy without adding
   UI surface — the compaction summary call is the request we most
   expect to benefit, and previously we had no per-call signal for it.

No UI surface changes. No model-facing prompt changes. Only adds the
path-choice variable and the debug log; existing compaction tests
(56 across `compaction::*` and `models::*`) still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/compaction.rs | 79 +++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)
diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs
index cda69077..48634b56 100644
--- a/crates/tui/src/compaction.rs
+++ b/crates/tui/src/compaction.rs
@@ -820,7 +820,8 @@ async fn create_summary(
     model: &str,
 ) -> Result<String> {
     let limits = summary_input_limits_for_model(model);
-    let request = if should_use_cache_aligned_summary(model, messages) {
+    let used_cache_aligned = should_use_cache_aligned_summary(model, messages);
+    let request = if used_cache_aligned {
         build_cache_aligned_summary_request(model, messages, limits)
     } else {
         build_formatted_summary_request(model, messages, limits)
@@ -832,6 +833,13 @@ async fn create_summary(
     // matches the website (#526).
     crate::cost_status::report(&response.model, &response.usage);
 
+    // #584: emit one debug-level event per summary call so the
+    // V4 cache-aligned win is observable post-deploy without
+    // adding UI surface. Filter with `RUST_LOG=compaction=debug`
+    // (or the full module path
+    // `RUST_LOG=deepseek_tui::compaction=debug`).
+    log_summary_cache_telemetry(used_cache_aligned, &response.usage);
+
     // Extract text from response
     let summary = response
         .content
@@ -846,6 +854,75 @@ async fn create_summary(
     Ok(summary)
 }
 
+/// Emit one `tracing::debug!` event per compaction summary call so the
+/// path choice (cache-aligned vs fallback) and the resulting cache-hit
+/// rate are observable. Both raw token counts and the percentage are
+/// included; on providers that don't return cache-token fields the
+/// counts are reported as `0` and the percentage as `0.0`. (#584)
+fn log_summary_cache_telemetry(used_cache_aligned: bool, usage: &crate::models::Usage) {
+    let path = if used_cache_aligned {
+        "cache_aligned"
+    } else {
+        "fallback"
+    };
+    let cache_hit = usage.prompt_cache_hit_tokens.unwrap_or(0);
+    let cache_miss = usage.prompt_cache_miss_tokens.unwrap_or(0);
+    let cached_window = u64::from(cache_hit) + u64::from(cache_miss);
+    let cache_hit_pct = if cached_window > 0 {
+        (f64::from(cache_hit) * 100.0) / cached_window as f64
+    } else {
+        0.0
+    };
+    tracing::debug!(
+        target: "compaction",
+        "compaction summary call: path={} prompt_tokens={} cache_hit_tokens={} cache_miss_tokens={} cache_hit_pct={:.1}",
+        path,
+        usage.input_tokens,
+        cache_hit,
+        cache_miss,
+        cache_hit_pct,
+    );
+}
+
+/// Decide whether to use the cache-aligned summary path
+/// ([`build_cache_aligned_summary_request`]) or the fallback
+/// ([`build_formatted_summary_request`]). Returns `true` when both
+/// gates hold:
+///
+/// 1. The model has a known large context window
+///    (≥ `LARGE_CONTEXT_WINDOW_TOKENS`, currently V4-scale).
+/// 2. Replaying the message prefix plus a ~512-token instruction
+///    still fits within `CACHE_ALIGNED_SUMMARY_CONTEXT_BUDGET_PERCENT`
+///    of that budget.
+///
+/// ## Why the two paths produce slightly different prompts (#584)
+///
+/// The two summary requests are *intentionally* framed differently:
+///
+/// - **Cache-aligned** replays the original `messages` verbatim
+///   with `system: None` and appends the summary instruction as
+///   the final `user` turn. The model sees the conversation as if
+///   it were its own history. This is what lets the V4 prefix cache
+///   hit on the bulk of the request (#572).
+/// - **Fallback** reformats the conversation into a flat
+///   `User:/Assistant:` transcript inside a single `user` message
+///   and adds a "You are a helpful assistant that creates concise
+///   conversation summaries." system prompt. The model sees a
+///   transcript of someone else's conversation.
+///
+/// The empirical bar is that V4 produces equivalent summaries
+/// either way; the post-#572 review noted this fork is worth
+/// documenting but not yet worth unifying. The fallback's
+/// external-transcript framing is also more conservative for the
+/// older / smaller models the cache-aligned path explicitly
+/// excludes, so dropping the system prompt would risk regressing
+/// those models without a corresponding gain. If we ever want to
+/// unify, land it in a separate PR backed by an A/B summary-quality
+/// evaluation rather than as a drive-by cleanup.
+///
+/// `create_summary` emits a `tracing::debug!` event under
+/// `target = "compaction"` after each call so the path choice and
+/// cache-hit rate are observable post-deploy without UI surface.
 fn should_use_cache_aligned_summary(model: &str, messages: &[Message]) -> bool {
     let Some(window) = context_window_for_model(model) else {
         return false;

From dcadb5d388b4e30272f7aaf8ebb4713a41e1db20 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Mon, 4 May 2026 13:35:05 -0500
Subject: [PATCH 2/2] fix(compaction): cache-hit % denominator + correct
 RUST_LOG filter docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The post-merge review on #584 surfaced two real bugs in the new
summary-call telemetry:

1. The cache-hit percentage used `cache_hit + cache_miss` as the
   denominator. Providers that populate `prompt_cache_hit_tokens` but
   leave `prompt_cache_miss_tokens` as `None` (the rest of the codebase
   already infers misses from `input_tokens` for cost reporting and
   `/cache`) were silently reported as a flat 100% hit rate, masking the
   actual ratio. Switch the denominator to `usage.input_tokens` so the
   ratio matches how the rest of the project reasons about cache spread.
   Extract the calc into a small `summary_cache_hit_percent` helper so
   the invariant is unit-testable.

2. The doc comment on the emit site advertised that
   `RUST_LOG=deepseek_tui::compaction=debug` would also work as a
   filter. It does not — `EnvFilter` matches the explicit target string
   when one is set, so only `RUST_LOG=compaction=debug` activates the
   event. Drop the misleading parenthetical and call out the filter
   semantics explicitly.

The new unit test pins the partial-telemetry guard so a future regress
to `(hit + miss)` denominator would be caught immediately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/compaction.rs | 56 ++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs
index 48634b56..59024426 100644
--- a/crates/tui/src/compaction.rs
+++ b/crates/tui/src/compaction.rs
@@ -835,9 +835,11 @@ async fn create_summary(
 
     // #584: emit one debug-level event per summary call so the
     // V4 cache-aligned win is observable post-deploy without
-    // adding UI surface. Filter with `RUST_LOG=compaction=debug`
-    // (or the full module path
-    // `RUST_LOG=deepseek_tui::compaction=debug`).
+    // adding UI surface. The event is emitted with
+    // `target = "compaction"`, so the filter is
+    // `RUST_LOG=compaction=debug` (the module-path form
+    // `deepseek_tui::compaction=debug` does NOT match — `EnvFilter`
+    // matches the explicit target string when one is set).
     log_summary_cache_telemetry(used_cache_aligned, &response.usage);
 
     // Extract text from response
@@ -854,6 +856,23 @@ async fn create_summary(
     Ok(summary)
 }
 
+/// Cache-hit percentage for a compaction summary call.
+///
+/// Denominator is `input_tokens` (the total prompt size), not
+/// `cache_hit + cache_miss`. Some providers populate
+/// `prompt_cache_hit_tokens` but not `prompt_cache_miss_tokens` — using
+/// the sum as the denominator there reports an inflated 100% even when
+/// most of the prompt was uncached. Anchoring on `input_tokens` matches
+/// how the rest of the codebase (cost reporting, `/cache`) infers
+/// missing miss counts. (#584)
+fn summary_cache_hit_percent(cache_hit: u32, input_tokens: u32) -> f64 {
+    if input_tokens > 0 {
+        (f64::from(cache_hit) * 100.0) / f64::from(input_tokens)
+    } else {
+        0.0
+    }
+}
+
 /// Emit one `tracing::debug!` event per compaction summary call so the
 /// path choice (cache-aligned vs fallback) and the resulting cache-hit
 /// rate are observable. Both raw token counts and the percentage are
@@ -867,12 +886,7 @@ fn log_summary_cache_telemetry(used_cache_aligned: bool, usage: &crate::models::
     };
     let cache_hit = usage.prompt_cache_hit_tokens.unwrap_or(0);
     let cache_miss = usage.prompt_cache_miss_tokens.unwrap_or(0);
-    let cached_window = u64::from(cache_hit) + u64::from(cache_miss);
-    let cache_hit_pct = if cached_window > 0 {
-        (f64::from(cache_hit) * 100.0) / cached_window as f64
-    } else {
-        0.0
-    };
+    let cache_hit_pct = summary_cache_hit_percent(cache_hit, usage.input_tokens);
     tracing::debug!(
         target: "compaction",
         "compaction summary call: path={} prompt_tokens={} cache_hit_tokens={} cache_miss_tokens={} cache_hit_pct={:.1}",
@@ -1268,6 +1282,30 @@ mod tests {
         ));
     }
 
+    /// #584: the summary cache-hit percentage must be computed against
+    /// `input_tokens`, not `cache_hit + cache_miss`. Providers that
+    /// only populate `prompt_cache_hit_tokens` (and leave the miss
+    /// field at `None`) would otherwise be reported as a flat 100%
+    /// hit rate even when most of the prompt was uncached.
+    #[test]
+    fn summary_cache_hit_percent_uses_input_tokens_as_denominator() {
+        // Both fields populated and consistent.
+        assert!((summary_cache_hit_percent(800, 1000) - 80.0).abs() < f64::EPSILON);
+        // No cache hit at all.
+        assert!((summary_cache_hit_percent(0, 1000) - 0.0).abs() < f64::EPSILON);
+        // Full cache hit.
+        assert!((summary_cache_hit_percent(1000, 1000) - 100.0).abs() < f64::EPSILON);
+        // Partial-telemetry guard: provider reports `cache_hit` only,
+        // miss is unknown (treated as 0 by the caller). Naive
+        // `hit / (hit + miss)` would have reported 100%; against
+        // `input_tokens` the answer is the real share.
+        assert!((summary_cache_hit_percent(200, 1000) - 20.0).abs() < f64::EPSILON);
+        // Defensive: zero `input_tokens` short-circuits without a
+        // divide-by-zero.
+        assert!((summary_cache_hit_percent(0, 0) - 0.0).abs() < f64::EPSILON);
+        assert!((summary_cache_hit_percent(50, 0) - 0.0).abs() < f64::EPSILON);
+    }
+
     #[test]
     fn cache_aligned_summary_request_preserves_message_prefix() {
         let messages = vec![