feat(engine): allow DEEPSEEK_MAX_OUTPUT_TOKENS env override for tight-context providers (#2147)

The `effective_max_output_tokens` heuristic defaults to 64K for any model
not in the known-context-window table. This is fine for DeepSeek's hosted
API (1M context) but causes immediate HTTP 400s on self-hosted providers
with tight `max-model-len`.

Example: vLLM serving Qwen3.6 with `--max-model-len 65536` rejects
requests because 64000 (output) + ~1500 (input) exceeds the limit by 1
token.

This change lets the operator set `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` (or
whatever fits their deployment) to override the heuristic. The env var
takes precedence over the model-table lookup when set to a positive
integer; otherwise the existing behavior is preserved.

No new config struct field — env-only override keeps the public API
unchanged. Useful for embedded users (e.g. pinvou3) who need to control
output budget without forking the engine config schema.

Co-authored-by: hexin <he.xin@h3c.com>
This commit is contained in:
hexin
2026-05-26 23:31:26 +08:00
committed by GitHub
parent 60a3069705
commit 4f3a0c3cfc
2 changed files with 96 additions and 0 deletions
+14
View File
@@ -28,7 +28,21 @@ const API_MAX_OUTPUT_TOKENS: u32 = 65_536;
/// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
/// limits (128K+ total). For non-V4 models with smaller context windows, caps
/// at half the context window.
///
/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
/// integer, this function returns that value directly. Use this for self-hosted
/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
/// model-table heuristic above would over-allocate. Example: vLLM serving
/// Qwen3.6 with `--max-model-len 65536` should set
/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
/// provider's hard limit.
pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
&& let Ok(n) = raw.trim().parse::<u32>()
&& n > 0
{
return n;
}
let window = context_window_for_model(model).unwrap_or(128_000);
if window >= 500_000 {
// V4-class models on large-context providers: use 64K which is safe
+82
View File
@@ -1047,6 +1047,9 @@ fn detects_context_length_errors_from_provider_payloads() {
#[test]
fn context_budget_reserves_output_and_headroom() {
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
// the internal effective_max_output_tokens() call sees a stable env.
let _lock = lock_test_env();
// V4 has a 1M context window — the only family that comfortably hosts
// a 256K output reservation without saturating the input budget to 0.
let budget = context_input_budget("deepseek-v4-pro")
@@ -1058,6 +1061,9 @@ fn context_budget_reserves_output_and_headroom() {
#[test]
fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
// v4_cap and flash_cap below see the same env state.
let _lock = lock_test_env();
// V4 models have a 1M context window but the API request cap must stay
// well below common provider limits (e.g., 131K total on self-hosted
// vLLM/SGLang). The cap should never exceed 65K.
@@ -1075,8 +1081,84 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
assert_eq!(v4_cap, flash_cap);
}
/// Test-only guard around the `DEEPSEEK_MAX_OUTPUT_TOKENS` environment
/// variable.
///
/// Constructing the guard (via `set` or `unset`) snapshots whatever the
/// variable currently holds and then mutates it; dropping the guard writes the
/// snapshot back, or removes the variable if it was absent at construction.
struct ScopedDeepSeekMaxOutputTokens {
    /// Value observed at construction time; `None` means the variable was not
    /// present in the environment.
    previous: Option<OsString>,
}

impl ScopedDeepSeekMaxOutputTokens {
    /// Overrides the variable with `value` until the returned guard is dropped.
    fn set(value: &str) -> Self {
        Self::swap_in(Some(value))
    }

    /// Removes the variable until the returned guard is dropped.
    fn unset() -> Self {
        Self::swap_in(None)
    }

    /// Shared constructor: records the current value first, then either
    /// installs `next` or clears the variable when `next` is `None`.
    fn swap_in(next: Option<&str>) -> Self {
        let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS");
        // SAFETY: tests using this helper serialize with lock_test_env() and
        // the original value is restored in Drop.
        unsafe {
            match next {
                Some(value) => std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", value),
                None => std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS"),
            }
        }
        Self { previous }
    }
}

impl Drop for ScopedDeepSeekMaxOutputTokens {
    /// Puts the environment back the way the constructor found it.
    fn drop(&mut self) {
        let snapshot = self.previous.take();
        // SAFETY: tests using this helper serialize with lock_test_env().
        unsafe {
            match snapshot {
                Some(original) => std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", original),
                None => std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS"),
            }
        }
    }
}
/// With `DEEPSEEK_MAX_OUTPUT_TOKENS` set to a positive integer, the override
/// is returned as-is for every model id checked here.
#[test]
fn effective_max_output_tokens_env_override_returns_positive_value() {
    // Hold the shared env lock for the whole test; the override guard is
    // declared second so it is dropped (and the env restored) first.
    let _env_lock = lock_test_env();
    let _override = ScopedDeepSeekMaxOutputTokens::set("16384");

    // The model id must not influence the result once the override is active.
    let models = ["deepseek-v4-pro", "deepseek-v4-flash", "qwen3-32b-256k"];
    for model in models {
        assert_eq!(effective_max_output_tokens(model), 16_384);
    }
}
/// Values of `DEEPSEEK_MAX_OUTPUT_TOKENS` that are not a positive integer must
/// leave the result identical to what is returned with the variable unset.
#[test]
fn effective_max_output_tokens_env_override_rejects_zero_and_invalid() {
    let _env_lock = lock_test_env();

    // Measure the reference value with the variable removed, then restore the
    // environment before exercising the rejected inputs.
    let cleared = ScopedDeepSeekMaxOutputTokens::unset();
    let heuristic_cap = effective_max_output_tokens("deepseek-v4-pro");
    drop(cleared);
    assert!(heuristic_cap > 0);

    // Zero, non-numeric text, empty, whitespace-only, and negative inputs are
    // all expected to be ignored rather than used as the cap.
    let rejected_inputs = ["0", "abc", "", " ", "-1"];
    for raw in rejected_inputs {
        // The guard lives for exactly one iteration, so each input is applied
        // and then rolled back before the next one.
        let _override = ScopedDeepSeekMaxOutputTokens::set(raw);
        let observed = effective_max_output_tokens("deepseek-v4-pro");
        assert_eq!(
            observed, heuristic_cap,
            "env={raw:?} should fall through to heuristic"
        );
    }
}
#[test]
fn internal_context_budget_tiers_reserved_output_by_window() {
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
// both branches below see a stable env.
let _lock = lock_test_env();
// Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
// headroom so long V4 sessions don't compact prematurely.
let internal_budget =