fix(engine): keep auto-compaction working on sub-500K self-hosted windows

Harvested from PR #2060 by @h3c-hexin.

Co-authored-by: hexin <he.xin@h3c.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 07:25:58 +08:00
parent aa468c3078
commit 9c8e482607
6 changed files with 70 additions and 59 deletions
@@ -1287,15 +1287,8 @@ impl Engine {
        removed
    }

-    async fn recover_context_overflow(
-        &mut self,
-        client: &DeepSeekClient,
-        reason: &str,
-        requested_output_tokens: u32,
-    ) -> bool {
-        let Some(target_budget) =
-            context_input_budget(&self.session.model, requested_output_tokens)
-        else {
+    async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool {
+        let Some(target_budget) = context_input_budget(&self.session.model) else {
            return false;
        };

@@ -1971,9 +1964,9 @@ mod handle;
 pub(crate) use context::compact_tool_result_for_context;
 use context::{
    COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
-    TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens,
-    estimate_input_tokens_conservative, extract_compaction_summary_prompt,
-    is_context_length_error_message, summarize_text, turn_response_headroom_tokens,
+    context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
+    extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
+    turn_response_headroom_tokens,
 };
 mod dispatch;
 mod loop_guard;
@@ -435,7 +435,7 @@ impl Engine {
        }

        if !refreshed {
-            let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
+            let target_budget = context_input_budget(&self.session.model)
                .unwrap_or(self.config.compaction.token_threshold.max(1));
            if self.estimated_input_tokens() > target_budget {
                let trimmed = self.trim_oldest_messages_to_budget(target_budget);
@@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative(
        .saturating_add(framing_overhead)
 }

-pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usize> {
-    let window = usize::try_from(context_window_for_model(model)?).ok()?;
-    let output = usize::try_from(requested_output_tokens).ok()?;
+/// Context windows at or above this size reserve the full
+/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
+/// leaving room for V4-class interleaved thinking. Below it, the reservation
+/// falls back to [`effective_max_output_tokens`] so that subtracting it from a
+/// smaller self-hosted window does not underflow and yield `None` (no budget).
+const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;
+
+/// Internal input-side token budget for a model: `window - reserved_output -
+/// headroom`. Used by the preflight check, emergency recovery, and capacity
+/// trimming to decide when to compact.
+///
+/// The reserved-output term is window-dependent:
+///   * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
+///     (262K). Preserves the "leave room for interleaved thinking" contract.
+///   * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
+///     -> [`effective_max_output_tokens`], i.e. what the API actually caps
+///     output at. Reserving the full 262K here would compute
+///     `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
+///     *silently disables every preflight and emergency recovery path* — the
+///     session then runs until the provider hard-rejects on context length.
+pub(super) fn context_input_budget(model: &str) -> Option<usize> {
+    let window_tokens = context_window_for_model(model)?;
+    let window = usize::try_from(window_tokens).ok()?;
+    let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
+        TURN_MAX_OUTPUT_TOKENS
+    } else {
+        effective_max_output_tokens(model)
+    };
+    let output = usize::try_from(reserved_output).ok()?;
    window
        .checked_sub(output)
        .and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
@@ -1,5 +1,6 @@
 use super::*;

+use super::context::TURN_MAX_OUTPUT_TOKENS;
 use crate::models::SystemBlock;
 use crate::test_support::lock_test_env;
 use crate::tools::spec::ToolCapability;
@@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() {
 fn context_budget_reserves_output_and_headroom() {
    // V4 has a 1M context window — the only family that comfortably hosts
    // the full 262K `TURN_MAX_OUTPUT_TOKENS` reservation without saturating
    // the input budget to 0.
-    let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
+    let budget = context_input_budget("deepseek-v4-pro")
        .expect("deepseek-v4-pro should have a known context window");
    let v4_window: usize = 1_000_000;
    let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
@@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
 }

 #[test]
-fn internal_context_budget_unaffected_by_api_request_cap() {
-    // The internal context budget (used for compaction/preflight/recovery)
-    // must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the
-    // smaller API request cap. This ensures long-context V4 sessions don't
-    // compact prematurely.
-    let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
-        .expect("V4 should have a known context window");
-    let api_cap_budget = context_input_budget(
-        "deepseek-v4-pro",
-        effective_max_output_tokens("deepseek-v4-pro"),
-    )
-    .expect("V4 should have a known context window");
-
-    // Internal budget reserves 262K for output; API-cap budget would only
-    // reserve 64K. Internal budget must be smaller (more conservative).
-    assert!(
-        internal_budget < api_cap_budget,
-        "Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \
-         because it reserves more headroom for output"
-    );
-
-    // Verify the internal budget is what the compaction logic actually uses.
+fn internal_context_budget_tiers_reserved_output_by_window() {
+    // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
+    // headroom so long V4 sessions don't compact prematurely.
+    let internal_budget =
+        context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window");
    let v4_window: usize = 1_000_000;
    let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
    assert_eq!(internal_budget, expected_internal);
+
+    // Sub-500K windows cross into the effective-cap branch: a 256K self-hosted
+    // deployment must yield a usable positive budget rather than None. The
+    // previous formula reserved the full 262K and computed 256K - 262K - 1K,
+    // which underflowed to None and silently disabled preflight/recovery.
+    let small_window_budget = context_input_budget("qwen3-32b-256k")
+        .expect("a 256K-suffix model must yield Some budget via the effective-cap branch");
+    let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize;
+    let expected_small = 256_000 - effective_output - 1_024;
+    assert_eq!(small_window_budget, expected_small);
 }

 #[test]
@@ -173,9 +173,7 @@ impl Engine {
                continue;
            }

-            if let Some(input_budget) =
-                context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
-            {
+            if let Some(input_budget) = context_input_budget(&self.session.model) {
                let estimated_input = self.estimated_input_tokens();
                if estimated_input > input_budget {
                    if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS {
@@ -192,11 +190,7 @@ impl Engine {
                    }

                    if self
-                        .recover_context_overflow(
-                            &client,
-                            "preflight token budget",
-                            TURN_MAX_OUTPUT_TOKENS,
-                        )
+                        .recover_context_overflow(&client, "preflight token budget")
                        .await
                    {
                        context_recovery_attempts = context_recovery_attempts.saturating_add(1);
@@ -326,11 +320,7 @@ impl Engine {
                    if is_context_length_error_message(&message)
                        && context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS
                        && self
-                            .recover_context_overflow(
-                                &client,
-                                "provider context-length rejection",
-                                TURN_MAX_OUTPUT_TOKENS,
-                            )
+                            .recover_context_overflow(&client, "provider context-length rejection")
                            .await
                    {
                        context_recovery_attempts = context_recovery_attempts.saturating_add(1);
@@ -208,16 +208,22 @@ pub struct Usage {
 }

 /// Map known models to their approximate context window sizes.
+///
+/// Lookup order:
+/// 1. An explicit `Nk` suffix in the model name (e.g. the `-256k` in
+///    `qwen3-32b-256k`), for **any** vendor. This lets self-hosted deployments
+///    advertise their window through the served model name (e.g. a vLLM
+///    `--served-model-name qwen3-32b-256k`), which is the only signal we have
+///    for non-DeepSeek/Claude models. `N` is read as `N * 1000` tokens (not
+///    `N * 1024`); that approximation is fine for compaction-threshold math.
+/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K).
+/// 3. Claude -> 200K.
 #[must_use]
 pub fn context_window_for_model(model: &str) -> Option<u32> {
    let lower = model.to_lowercase();
-    // Unknown legacy DeepSeek model IDs default to 128K unless an explicit
-    // *k suffix is present. DeepSeek-V4 family and current compatibility
-    // aliases ship with a 1M context window.
+    if let Some(explicit_window) = explicit_context_window_hint(&lower) {
+        return Some(explicit_window);
+    }
    if lower.contains("deepseek") {
-        if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
-            return Some(explicit_window);
-        }
        if lower.contains("v4") {
            return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
        }
@@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
    None
 }

-fn deepseek_context_window_hint(model_lower: &str) -> Option<u32> {
+/// Parse an explicit `Nk` context-window hint (e.g. the `-256k` in
+/// `qwen3-32b-256k`) from a lowercased model name, vendor-agnostic. Returns
+/// the window in tokens for `N` in `8..=1024`.
+fn explicit_context_window_hint(model_lower: &str) -> Option<u32> {
    let bytes = model_lower.as_bytes();
    let mut i = 0usize;
    while i < bytes.len() {