From 7712a372721caefce0c1a0f8146bdc50e7631258 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 12:54:59 -0500 Subject: [PATCH 1/2] feat(compaction): debug telemetry on summary calls + document framing fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups from the post-#572 cache-aligned compaction review (#584): 1. `should_use_cache_aligned_summary` now carries a doc comment that explains why the cache-aligned and fallback summary requests are framed differently. Cache-aligned replays the conversation as the model's own history under `system: None`; fallback reformats it into a `User:/Assistant:` transcript under a "concise summaries" system prompt. The fallback's external-transcript framing is more conservative for the older / smaller models the cache-aligned path explicitly excludes, so dropping the system prompt risks regressing those models without a corresponding gain. Unifying the two paths is a research question that wants an A/B summary-quality eval, not a drive-by cleanup — flagged here for a future PR rather than resolved silently. 2. `create_summary` now emits one `tracing::debug!` event per summary call carrying which path was chosen, the prompt-token count, and the cache-hit / miss split. Filter with `RUST_LOG=compaction=debug` (or the full module path `RUST_LOG=deepseek_tui::compaction=debug`). This makes the V4 prefix-cache win from #572 observable post-deploy without adding UI surface — the compaction summary call is the request we most expect to benefit, and previously we had no per-call signal for it. No UI surface changes. No model-facing prompt changes. Only adds the path-choice variable and the debug log; existing compaction tests (56 across `compaction::*` and `models::*`) still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/compaction.rs | 79 +++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs index cda69077..48634b56 100644 --- a/crates/tui/src/compaction.rs +++ b/crates/tui/src/compaction.rs @@ -820,7 +820,8 @@ async fn create_summary( model: &str, ) -> Result { let limits = summary_input_limits_for_model(model); - let request = if should_use_cache_aligned_summary(model, messages) { + let used_cache_aligned = should_use_cache_aligned_summary(model, messages); + let request = if used_cache_aligned { build_cache_aligned_summary_request(model, messages, limits) } else { build_formatted_summary_request(model, messages, limits) @@ -832,6 +833,13 @@ async fn create_summary( // matches the website (#526). crate::cost_status::report(&response.model, &response.usage); + // #584: emit one debug-level event per summary call so the + // V4 cache-aligned win is observable post-deploy without + // adding UI surface. Filter with `RUST_LOG=compaction=debug` + // (or the full module path + // `RUST_LOG=deepseek_tui::compaction=debug`). + log_summary_cache_telemetry(used_cache_aligned, &response.usage); + // Extract text from response let summary = response .content @@ -846,6 +854,75 @@ async fn create_summary( Ok(summary) } +/// Emit one `tracing::debug!` event per compaction summary call so the +/// path choice (cache-aligned vs fallback) and the resulting cache-hit +/// rate are observable. Both raw token counts and the percentage are +/// included; on providers that don't return cache-token fields the +/// counts are reported as `0` and the percentage as `0.0`. (#584) +fn log_summary_cache_telemetry(used_cache_aligned: bool, usage: &crate::models::Usage) { + let path = if used_cache_aligned { + "cache_aligned" + } else { + "fallback" + }; + let cache_hit = usage.prompt_cache_hit_tokens.unwrap_or(0); + let cache_miss = usage.prompt_cache_miss_tokens.unwrap_or(0); + let cached_window = u64::from(cache_hit) + u64::from(cache_miss); + let cache_hit_pct = if cached_window > 0 { + (f64::from(cache_hit) * 100.0) / cached_window as f64 + } else { + 0.0 + }; + tracing::debug!( + target: "compaction", + "compaction summary call: path={} prompt_tokens={} cache_hit_tokens={} cache_miss_tokens={} cache_hit_pct={:.1}", + path, + usage.input_tokens, + cache_hit, + cache_miss, + cache_hit_pct, + ); +} + +/// Decide whether to use the cache-aligned summary path +/// ([`build_cache_aligned_summary_request`]) or the fallback +/// ([`build_formatted_summary_request`]). Returns `true` when both +/// gates hold: +/// +/// 1. The model has a known large context window +/// (≥ `LARGE_CONTEXT_WINDOW_TOKENS`, currently V4-scale). +/// 2. Replaying the message prefix plus a ~512-token instruction +/// still fits within `CACHE_ALIGNED_SUMMARY_CONTEXT_BUDGET_PERCENT` +/// of that budget. +/// +/// ## Why the two paths produce slightly different prompts (#584) +/// +/// The two summary requests are *intentionally* framed differently: +/// +/// - **Cache-aligned** replays the original `messages` verbatim +/// with `system: None` and appends the summary instruction as +/// the final `user` turn. The model sees the conversation as if +/// it were its own history. This is what lets the V4 prefix cache +/// hit on the bulk of the request (#572). +/// - **Fallback** reformats the conversation into a flat +/// `User:/Assistant:` transcript inside a single `user` message +/// and adds a "You are a helpful assistant that creates concise +/// conversation summaries." system prompt. The model sees a +/// transcript of someone else's conversation. +/// +/// The empirical bar is that V4 produces equivalent summaries +/// either way; the post-#572 review noted this fork is worth +/// documenting but not yet worth unifying. The fallback's +/// external-transcript framing is also more conservative for the +/// older / smaller models the cache-aligned path explicitly +/// excludes, so dropping the system prompt would risk regressing +/// those models without a corresponding gain. If we ever want to +/// unify, land it in a separate PR backed by an A/B summary-quality +/// evaluation rather than as a drive-by cleanup. +/// +/// `create_summary` emits a `tracing::debug!` event under +/// `target = "compaction"` after each call so the path choice and +/// cache-hit rate are observable post-deploy without UI surface. fn should_use_cache_aligned_summary(model: &str, messages: &[Message]) -> bool { let Some(window) = context_window_for_model(model) else { return false; From dcadb5d388b4e30272f7aaf8ebb4713a41e1db20 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Mon, 4 May 2026 13:35:05 -0500 Subject: [PATCH 2/2] fix(compaction): cache-hit % denominator + correct RUST_LOG filter docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-merge review on #584 surfaced two real bugs in the new summary-call telemetry: 1. The cache-hit percentage used `cache_hit + cache_miss` as the denominator. Providers that populate `prompt_cache_hit_tokens` but leave `prompt_cache_miss_tokens` as `None` (the rest of the codebase already infers misses from `input_tokens` for cost reporting and `/cache`) were silently reported as a flat 100% hit rate, masking the actual ratio. Switch the denominator to `usage.input_tokens` so the ratio matches how the rest of the project reasons about cache spread. Extract the calc into a small `summary_cache_hit_percent` helper so the invariant is unit-testable. 2. The doc comment on the emit site advertised that `RUST_LOG=deepseek_tui::compaction=debug` would also work as a filter. It does not — `EnvFilter` matches the explicit target string when one is set, so only `RUST_LOG=compaction=debug` activates the event. Drop the misleading parenthetical and call out the filter semantics explicitly. The new unit test pins the partial-telemetry guard so a future regress to `(hit + miss)` denominator would be caught immediately. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/compaction.rs | 56 ++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/crates/tui/src/compaction.rs b/crates/tui/src/compaction.rs index 48634b56..59024426 100644 --- a/crates/tui/src/compaction.rs +++ b/crates/tui/src/compaction.rs @@ -835,9 +835,11 @@ async fn create_summary( // #584: emit one debug-level event per summary call so the // V4 cache-aligned win is observable post-deploy without - // adding UI surface. Filter with `RUST_LOG=compaction=debug` - // (or the full module path - // `RUST_LOG=deepseek_tui::compaction=debug`). + // adding UI surface. The event is emitted with + // `target = "compaction"`, so the filter is + // `RUST_LOG=compaction=debug` (the module-path form + // `deepseek_tui::compaction=debug` does NOT match — `EnvFilter` + // matches the explicit target string when one is set). log_summary_cache_telemetry(used_cache_aligned, &response.usage); // Extract text from response @@ -854,6 +856,23 @@ async fn create_summary( Ok(summary) } +/// Cache-hit percentage for a compaction summary call. +/// +/// Denominator is `input_tokens` (the total prompt size), not +/// `cache_hit + cache_miss`. Some providers populate +/// `prompt_cache_hit_tokens` but not `prompt_cache_miss_tokens` — using +/// the sum as the denominator there reports an inflated 100% even when +/// most of the prompt was uncached. Anchoring on `input_tokens` matches +/// how the rest of the codebase (cost reporting, `/cache`) infers +/// missing miss counts. (#584) +fn summary_cache_hit_percent(cache_hit: u32, input_tokens: u32) -> f64 { + if input_tokens > 0 { + (f64::from(cache_hit) * 100.0) / f64::from(input_tokens) + } else { + 0.0 + } +} + /// Emit one `tracing::debug!` event per compaction summary call so the /// path choice (cache-aligned vs fallback) and the resulting cache-hit /// rate are observable. Both raw token counts and the percentage are @@ -867,12 +886,7 @@ fn log_summary_cache_telemetry(used_cache_aligned: bool, usage: &crate::models:: }; let cache_hit = usage.prompt_cache_hit_tokens.unwrap_or(0); let cache_miss = usage.prompt_cache_miss_tokens.unwrap_or(0); - let cached_window = u64::from(cache_hit) + u64::from(cache_miss); - let cache_hit_pct = if cached_window > 0 { - (f64::from(cache_hit) * 100.0) / cached_window as f64 - } else { - 0.0 - }; + let cache_hit_pct = summary_cache_hit_percent(cache_hit, usage.input_tokens); tracing::debug!( target: "compaction", "compaction summary call: path={} prompt_tokens={} cache_hit_tokens={} cache_miss_tokens={} cache_hit_pct={:.1}", @@ -1268,6 +1282,30 @@ mod tests { )); } + /// #584: the summary cache-hit percentage must be computed against + /// `input_tokens`, not `cache_hit + cache_miss`. Providers that + /// only populate `prompt_cache_hit_tokens` (and leave the miss + /// field at `None`) would otherwise be reported as a flat 100% + /// hit rate even when most of the prompt was uncached. + #[test] + fn summary_cache_hit_percent_uses_input_tokens_as_denominator() { + // Both fields populated and consistent. + assert!((summary_cache_hit_percent(800, 1000) - 80.0).abs() < f64::EPSILON); + // No cache hit at all. + assert!((summary_cache_hit_percent(0, 1000) - 0.0).abs() < f64::EPSILON); + // Full cache hit. + assert!((summary_cache_hit_percent(1000, 1000) - 100.0).abs() < f64::EPSILON); + // Partial-telemetry guard: provider reports `cache_hit` only, + // miss is unknown (treated as 0 by the caller). Naive + // `hit / (hit + miss)` would have reported 100%; against + // `input_tokens` the answer is the real share. + assert!((summary_cache_hit_percent(200, 1000) - 20.0).abs() < f64::EPSILON); + // Defensive: zero `input_tokens` short-circuits without a + // divide-by-zero. + assert!((summary_cache_hit_percent(0, 0) - 0.0).abs() < f64::EPSILON); + assert!((summary_cache_hit_percent(50, 0) - 0.0).abs() < f64::EPSILON); + } + #[test] fn cache_aligned_summary_request_preserves_message_prefix() { let messages = vec![