fix(engine): keep auto-compaction working on sub-500K self-hosted windows
Harvested from PR #2060 by @h3c-hexin. Co-authored-by: hexin <he.xin@h3c.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1287,15 +1287,8 @@ impl Engine {
|
||||
removed
|
||||
}
|
||||
|
||||
async fn recover_context_overflow(
|
||||
&mut self,
|
||||
client: &DeepSeekClient,
|
||||
reason: &str,
|
||||
requested_output_tokens: u32,
|
||||
) -> bool {
|
||||
let Some(target_budget) =
|
||||
context_input_budget(&self.session.model, requested_output_tokens)
|
||||
else {
|
||||
async fn recover_context_overflow(&mut self, client: &DeepSeekClient, reason: &str) -> bool {
|
||||
let Some(target_budget) = context_input_budget(&self.session.model) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
@@ -1971,9 +1964,9 @@ mod handle;
|
||||
pub(crate) use context::compact_tool_result_for_context;
|
||||
use context::{
|
||||
COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
|
||||
TURN_MAX_OUTPUT_TOKENS, context_input_budget, effective_max_output_tokens,
|
||||
estimate_input_tokens_conservative, extract_compaction_summary_prompt,
|
||||
is_context_length_error_message, summarize_text, turn_response_headroom_tokens,
|
||||
context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
|
||||
extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
|
||||
turn_response_headroom_tokens,
|
||||
};
|
||||
mod dispatch;
|
||||
mod loop_guard;
|
||||
|
||||
@@ -435,7 +435,7 @@ impl Engine {
|
||||
}
|
||||
|
||||
if !refreshed {
|
||||
let target_budget = context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
|
||||
let target_budget = context_input_budget(&self.session.model)
|
||||
.unwrap_or(self.config.compaction.token_threshold.max(1));
|
||||
if self.estimated_input_tokens() > target_budget {
|
||||
let trimmed = self.trim_oldest_messages_to_budget(target_budget);
|
||||
|
||||
@@ -354,9 +354,35 @@ pub(super) fn estimate_input_tokens_conservative(
|
||||
.saturating_add(framing_overhead)
|
||||
}
|
||||
|
||||
pub(super) fn context_input_budget(model: &str, requested_output_tokens: u32) -> Option<usize> {
|
||||
let window = usize::try_from(context_window_for_model(model)?).ok()?;
|
||||
let output = usize::try_from(requested_output_tokens).ok()?;
|
||||
/// Context windows at or above this size reserve the full
|
||||
/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
|
||||
/// leaving room for V4-class interleaved thinking. Below it, the reservation
|
||||
/// falls back to [`effective_max_output_tokens`] so a smaller self-hosted
|
||||
/// window does not underflow `checked_sub` and yield `None` (no budget at all).
|
||||
const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;
|
||||
|
||||
/// Internal input-side token budget for a model: `window - reserved_output -
|
||||
/// headroom`. Used by the preflight check, emergency recovery, and capacity
|
||||
/// trimming to decide when to compact.
|
||||
///
|
||||
/// The reserved-output term is window-dependent:
|
||||
/// * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
|
||||
/// (262K). Preserves the "leave room for interleaved thinking" contract.
|
||||
/// * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
|
||||
/// -> [`effective_max_output_tokens`], i.e. what the API actually caps
|
||||
/// output at. Reserving the full 262K here would compute
|
||||
/// `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
|
||||
/// *silently disables every preflight and emergency recovery path* — the
|
||||
/// session then runs until the provider hard-rejects on context length.
|
||||
pub(super) fn context_input_budget(model: &str) -> Option<usize> {
|
||||
let window_tokens = context_window_for_model(model)?;
|
||||
let window = usize::try_from(window_tokens).ok()?;
|
||||
let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
|
||||
TURN_MAX_OUTPUT_TOKENS
|
||||
} else {
|
||||
effective_max_output_tokens(model)
|
||||
};
|
||||
let output = usize::try_from(reserved_output).ok()?;
|
||||
window
|
||||
.checked_sub(output)
|
||||
.and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use super::*;
|
||||
|
||||
use super::context::TURN_MAX_OUTPUT_TOKENS;
|
||||
use crate::models::SystemBlock;
|
||||
use crate::test_support::lock_test_env;
|
||||
use crate::tools::spec::ToolCapability;
|
||||
@@ -916,7 +917,7 @@ fn detects_context_length_errors_from_provider_payloads() {
|
||||
fn context_budget_reserves_output_and_headroom() {
|
||||
// V4 has a 1M context window — the only family that comfortably hosts
|
||||
// a 256K output reservation without saturating the input budget to 0.
|
||||
let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
|
||||
let budget = context_input_budget("deepseek-v4-pro")
|
||||
.expect("deepseek-v4-pro should have a known context window");
|
||||
let v4_window: usize = 1_000_000;
|
||||
let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
|
||||
@@ -943,31 +944,24 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn internal_context_budget_unaffected_by_api_request_cap() {
|
||||
// The internal context budget (used for compaction/preflight/recovery)
|
||||
// must still use the full TURN_MAX_OUTPUT_TOKENS headroom, NOT the
|
||||
// smaller API request cap. This ensures long-context V4 sessions don't
|
||||
// compact prematurely.
|
||||
let internal_budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS)
|
||||
.expect("V4 should have a known context window");
|
||||
let api_cap_budget = context_input_budget(
|
||||
"deepseek-v4-pro",
|
||||
effective_max_output_tokens("deepseek-v4-pro"),
|
||||
)
|
||||
.expect("V4 should have a known context window");
|
||||
|
||||
// Internal budget reserves 262K for output; API-cap budget would only
|
||||
// reserve 64K. Internal budget must be smaller (more conservative).
|
||||
assert!(
|
||||
internal_budget < api_cap_budget,
|
||||
"Internal budget ({internal_budget}) should be smaller than API-cap budget ({api_cap_budget}) \
|
||||
because it reserves more headroom for output"
|
||||
);
|
||||
|
||||
// Verify the internal budget is what the compaction logic actually uses.
|
||||
fn internal_context_budget_tiers_reserved_output_by_window() {
|
||||
// Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
|
||||
// headroom, leaving room for V4-class interleaved thinking output.
|
||||
let internal_budget =
|
||||
context_input_budget("deepseek-v4-pro").expect("V4 should have a known context window");
|
||||
let v4_window: usize = 1_000_000;
|
||||
let expected_internal = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
|
||||
assert_eq!(internal_budget, expected_internal);
|
||||
|
||||
// Sub-500K windows cross into the effective-cap branch: a 256K self-hosted
|
||||
// deployment must yield a usable positive budget rather than None. The
|
||||
// previous formula reserved the full 262K and computed 256K - 262K - 1K,
|
||||
// which underflowed to None and silently disabled preflight/recovery.
|
||||
let small_window_budget = context_input_budget("qwen3-32b-256k")
|
||||
.expect("a 256K-suffix model must yield Some budget via the effective-cap branch");
|
||||
let effective_output = effective_max_output_tokens("qwen3-32b-256k") as usize;
|
||||
let expected_small = 256_000 - effective_output - 1_024;
|
||||
assert_eq!(small_window_budget, expected_small);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -173,9 +173,7 @@ impl Engine {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(input_budget) =
|
||||
context_input_budget(&self.session.model, TURN_MAX_OUTPUT_TOKENS)
|
||||
{
|
||||
if let Some(input_budget) = context_input_budget(&self.session.model) {
|
||||
let estimated_input = self.estimated_input_tokens();
|
||||
if estimated_input > input_budget {
|
||||
if context_recovery_attempts >= MAX_CONTEXT_RECOVERY_ATTEMPTS {
|
||||
@@ -192,11 +190,7 @@ impl Engine {
|
||||
}
|
||||
|
||||
if self
|
||||
.recover_context_overflow(
|
||||
&client,
|
||||
"preflight token budget",
|
||||
TURN_MAX_OUTPUT_TOKENS,
|
||||
)
|
||||
.recover_context_overflow(&client, "preflight token budget")
|
||||
.await
|
||||
{
|
||||
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
|
||||
@@ -326,11 +320,7 @@ impl Engine {
|
||||
if is_context_length_error_message(&message)
|
||||
&& context_recovery_attempts < MAX_CONTEXT_RECOVERY_ATTEMPTS
|
||||
&& self
|
||||
.recover_context_overflow(
|
||||
&client,
|
||||
"provider context-length rejection",
|
||||
TURN_MAX_OUTPUT_TOKENS,
|
||||
)
|
||||
.recover_context_overflow(&client, "provider context-length rejection")
|
||||
.await
|
||||
{
|
||||
context_recovery_attempts = context_recovery_attempts.saturating_add(1);
|
||||
|
||||
@@ -208,16 +208,22 @@ pub struct Usage {
|
||||
}
|
||||
|
||||
/// Map known models to their approximate context window sizes.
|
||||
///
|
||||
/// Lookup order:
|
||||
/// 1. An explicit `Nk` suffix (e.g. `-256k`) in the model name, for **any** vendor. This
|
||||
/// lets self-hosted deployments advertise their window through the served
|
||||
/// model name (e.g. a vLLM `--served-model-name qwen3-32b-256k`), which is
|
||||
/// the only signal we have for non-DeepSeek/Claude models. The 1000-token
|
||||
/// approximation (`Nk` = N * 1000, not N * 1024) is fine for compaction-threshold math.
|
||||
/// 2. DeepSeek vendor heuristics (V4 family -> 1M, legacy -> 128K).
|
||||
/// 3. Claude -> 200K.
|
||||
#[must_use]
|
||||
pub fn context_window_for_model(model: &str) -> Option<u32> {
|
||||
let lower = model.to_lowercase();
|
||||
// Unknown legacy DeepSeek model IDs default to 128K unless an explicit
|
||||
// *k suffix is present. DeepSeek-V4 family and current compatibility
|
||||
// aliases ship with a 1M context window.
|
||||
if let Some(explicit_window) = explicit_context_window_hint(&lower) {
|
||||
return Some(explicit_window);
|
||||
}
|
||||
if lower.contains("deepseek") {
|
||||
if let Some(explicit_window) = deepseek_context_window_hint(&lower) {
|
||||
return Some(explicit_window);
|
||||
}
|
||||
if lower.contains("v4") {
|
||||
return Some(DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS);
|
||||
}
|
||||
@@ -229,7 +235,9 @@ pub fn context_window_for_model(model: &str) -> Option<u32> {
|
||||
None
|
||||
}
|
||||
|
||||
fn deepseek_context_window_hint(model_lower: &str) -> Option<u32> {
|
||||
/// Parse an explicit `_Nk` context-window hint from a model name (vendor
|
||||
/// agnostic). Returns the window in tokens for `N` in `8..=1024`.
|
||||
fn explicit_context_window_hint(model_lower: &str) -> Option<u32> {
|
||||
let bytes = model_lower.as_bytes();
|
||||
let mut i = 0usize;
|
||||
while i < bytes.len() {
|
||||
|
||||
Reference in New Issue
Block a user