fix(engine): keep auto-compaction working on sub-500K self-hosted windows

Harvested from PR #2060 by @h3c-hexin.

Co-authored-by: hexin <he.xin@h3c.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hexin
2026-05-26 07:25:58 +08:00
committed by GitHub
parent aa468c3078
commit 9c8e482607
6 changed files with 70 additions and 59 deletions
+5 -12
View File
@@ -1287,15 +1287,8 @@ impl Engine {
removed
}
async fn recover_context_overflow(
&mut self,
client: &DeepSeekClient,
reason: &str,
requested_output_tokens: u32,
) -> bool {
let Some(target_budget) =
context_input_budget(&self.session.model, requested_output_tokens)
else {
async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool {
let Some(target_budget) = context_input_budget(&self.session.model) else {
return false;
};
@@ -1971,9 +1964,9 @@ mod handle;
pub(crate) use context::compact_tool_result_for_context;
use context::{
COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens,
estimate_input_tokens_conservative, extract_compaction_summary_prompt,
is_context_length_error_message, summarize_text, turn_response_headroom_tokens,
context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
turn_response_headroom_tokens,
};
mod dispatch;
mod loop_guard;
+1 -1
View File
@@ -435,7 +435,7 @@ impl Engine {
}
if !refreshed {
let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
let target_budget = context_input_budget(&self.session.model)
.unwrap_or(self.config.compaction.token_threshold.max(1));
if self.estimated_input_tokens() > target_budget {
let trimmed = self.trim_oldest_messages_to_budget(target_budget);
+29 -3
View File
@@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative(
.saturating_add(framing_overhead)
}
pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usize> {
let window = usize::try_from(context_window_for_model(model)?).ok()?;
let output = usize::try_from(requested_output_tokens).ok()?;
/// Context-window size, in tokens, at which [`context_input_budget`] switches
/// to the large-window output reservation (the comparison is `>=`).
///
/// Context windows at or above this size reserve the full
/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
/// leaving room for V4-class interleaved thinking. Below it, the reservation
/// falls back to [`effective_max_output_tokens`] so that a smaller self-hosted
/// window does not make the `checked_sub` chain fail and return `None` (there
/// is no negative budget; the budget simply becomes unavailable).
const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;
/// Internal input-side token budget for a model: `window - reserved_output -
/// headroom`. Used by the preflight check, emergency recovery, and capacity
/// trimming to decide when to compact.
///
/// The reserved-output term is window-dependent:
/// * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
/// (262K). Preserves the "leave room for interleaved thinking" contract.
/// * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
/// -> [`effective_max_output_tokens`], i.e. what the API actually caps
/// output at. Reserving the full 262K here would compute
/// `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
/// *silently disables every preflight and emergency recovery path* — the
/// session then runs until the provider hard-rejects on context length.
pub(super) fn context_input_budget(model: &str) -> Option<usize> {
let window_tokens = context_window_for_model(model)?;
let window = usize::try_from(window_tokens).ok()?;
let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
TURN_MAX_OUTPUT_TOKENS
} else {
effective_max_output_tokens(model)
};
let output = usize::try_from(reserved_output).ok()?;
window
.checked_sub(output)
.and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
+17 -23
View File
@@ -1,5 +1,6 @@
use super::*;
use super::context::TURN_MAX_OUTPUT_TOKENS;
use crate::models::SystemBlock;
use crate::test_support::lock_test_env;
use crate::tools::spec::ToolCapability;
@@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() {
fn context_budget_reserves_output_and_headroom() {
// V4 has a 1M context window — the only family that comfortably hosts
// a 262K (`TURN_MAX_OUTPUT_TOKENS`) output reservation without saturating
// the input budget to 0.
let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
let budget = context_input_budget("deepseek-v4-pro")
.expect("deepseek-v4-pro should have a known context window");
let v4_window: usize = 1_000_000;
let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
@@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
}
#[test]
fn internal_context_budget_unaffected_by_api_request_cap() {
// The internal context budget (used for compaction/preflight/recovery)
// must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the
// smaller API request cap. This ensures long-context V4 sessions don't
// compact prematurely.
let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
.expect("V4 should have a known context window");
let api_cap_budget = context_input_budget(
"deepseek-v4-pro",
effective_max_output_tokens("deepseek-v4-pro"),
)
.expect("V4 should have a known context window");
// Internal budget reserves 262K for output; API-cap budget would only
// reserve 64K. Internal budget must be smaller (more conservative).
assert!(
internal_budget < api_cap_budget,
"Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \
because it reserves more headroom for output"
);
// Verify the internal budget is what the compaction logic actually uses.
// Checks both tiers of `context_input_budget`: a large-window model and a
// sub-500K model named with an explicit `256k` hint.
fn internal_context_budget_tiers_reserved_output_by_window() {
// Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
// headroom so long V4 sessions don't compact prematurely.
let internal_budget =
context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window");
let v4_window: usize = 1_000_000;
// 1_024 mirrors the headroom term that `context_input_budget` subtracts
// (presumably `CONTEXT_HEADROOM_TOKENS` — confirm if that constant changes).
let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
assert_eq!(internal_budget, expected_internal);
// Sub-500K windows cross into the effective-cap branch: a 256K self-hosted
// deployment must yield a usable positive budget rather than None. The
// previous formula reserved the full 262K and computed 256K - 262K - 1K,
// which underflowed to None and silently disabled preflight/recovery.
let small_window_budget = context_input_budget("qwen3-32b-256k")
.expect("a 256K-suffix model must yield Some budget via the effective-cap branch");
let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize;
// The `256k` name hint is expected to resolve to 256 * 1000 tokens, hence
// the 256_000 literal rather than 262_144.
let expected_small = 256_000 - effective_output - 1_024;
assert_eq!(small_window_budget, expected_small);
}
#[test]
+3 -13
View File
@@ -173,9 +173,7 @@ impl Engine {
continue;
}
if let Some(input_budget) =
context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
{
if let Some(input_budget) = context_input_budget(&self.session.model) {
let estimated_input = self.estimated_input_tokens();
if estimated_input > input_budget {
if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS {
@@ -192,11 +190,7 @@ impl Engine {
}
if self
.recover_context_overflow(
&client,
"preflight token budget",
TURN_MAX_OUTPUT_TOKENS,
)
.recover_context_overflow(&client, "preflight token budget")
.await
{
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
@@ -326,11 +320,7 @@ impl Engine {
if is_context_length_error_message(&message)
&& context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS
&& self
.recover_context_overflow(
&client,
"provider context-length rejection",
TURN_MAX_OUTPUT_TOKENS,
)
.recover_context_overflow(&client, "provider context-length rejection")
.await
{
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
+15 -7
View File
@@ -208,16 +208,22 @@ pub struct Usage {
}
/// Map known models to their approximate context window sizes.
///
/// Lookup order:
/// 1. An explicit `Nk` size suffix in the model name (such as `-256k`), for
///    **any** vendor. This lets self-hosted deployments advertise their window
///    through the served model name (e.g. a vLLM
///    `--served-model-name qwen3-32b-256k`), which is the only signal we have
///    for non-DeepSeek/Claude models. The hint is read as `N * 1000` tokens
///    (not `N * 1024`); that approximation is fine for compaction-threshold
///    math.
/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K).
/// 3. Claude -> 200K.
#[must_use]
pub fn context_window_for_model(model: &str) -> Option<u32> {
let lower = model.to_lowercase();
// Unknown legacy DeepSeek model IDs default to 128K unless an explicit
// *k suffix is present. DeepSeek-V4 family and current compatibility
// aliases ship with a 1M context window.
if let Some(explicit_window) = explicit_context_window_hint(&lower) {
return Some(explicit_window);
}
if lower.contains("deepseek") {
if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
return Some(explicit_window);
}
if lower.contains("v4") {
return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
}
@@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
None
}
fn deepseek_context_window_hint(model_lower: &str) -> Option<u32> {
/// Parse an explicit `Nk` context-window hint (e.g. the `256k` in
/// `qwen3-32b-256k`) from a lowercased model name, regardless of vendor.
/// Returns the window in tokens for `N` in `8..=1024`.
fn explicit_context_window_hint(model_lower: &str) -> Option<u32> {
let bytes = model_lower.as_bytes();
let mut i = 0usize;
while i < bytes.len() {