feat(engine): allow DEEPSEEK_MAX_OUTPUT_TOKENS env override for tight-context providers (#2147)
The `effective_max_output_tokens` heuristic defaults to 64K for any model not in the known-context-window table. This is fine for DeepSeek's hosted API (1M context) but causes immediate HTTP 400s on self-hosted providers with a tight `max-model-len`. Example: vLLM serving Qwen3.6 with `--max-model-len 65536` rejects requests because 64000 (output) + ~1500 (input) exceeds the limit by 1 token.

This change lets the operator set `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` (or whatever fits their deployment) to override the heuristic. The env var takes precedence over the model-table lookup when it is set to a positive integer; otherwise the existing behavior is preserved.

No new config struct field is added — an env-only override keeps the public API unchanged. This is useful for embedded users (e.g. pinvou3) who need to control the output budget without forking the engine config schema.

Co-authored-by: hexin <he.xin@h3c.com>
This commit is contained in:
@@ -28,7 +28,21 @@ const API_MAX_OUTPUT_TOKENS: u32 = 65_536;
|
||||
/// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
|
||||
/// limits (128K+ total). For non-V4 models with smaller context windows, caps
|
||||
/// at half the context window.
|
||||
///
|
||||
/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
|
||||
/// integer, this function returns that value directly. Use this for self-hosted
|
||||
/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
|
||||
/// model-table heuristic above would over-allocate. Example: vLLM serving
|
||||
/// Qwen3.6 with `--max-model-len 65536` should set
|
||||
/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
|
||||
/// provider's hard limit.
|
||||
pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
|
||||
if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
|
||||
&& let Ok(n) = raw.trim().parse::<u32>()
|
||||
&& n > 0
|
||||
{
|
||||
return n;
|
||||
}
|
||||
let window = context_window_for_model(model).unwrap_or(128_000);
|
||||
if window >= 500_000 {
|
||||
// V4-class models on large-context providers: use 64K which is safe
|
||||
|
||||
@@ -1047,6 +1047,9 @@ fn detects_context_length_errors_from_provider_payloads() {
|
||||
|
||||
#[test]
|
||||
fn context_budget_reserves_output_and_headroom() {
|
||||
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
|
||||
// the internal effective_max_output_tokens() call sees a stable env.
|
||||
let _lock = lock_test_env();
|
||||
// V4 has a 1M context window — the only family that comfortably hosts
|
||||
// a 256K output reservation without saturating the input budget to 0.
|
||||
let budget = context_input_budget("deepseek-v4-pro")
|
||||
@@ -1058,6 +1061,9 @@ fn context_budget_reserves_output_and_headroom() {
|
||||
|
||||
#[test]
|
||||
fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
|
||||
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
|
||||
// v4_cap and flash_cap below see the same env state.
|
||||
let _lock = lock_test_env();
|
||||
// V4 models have a 1M context window but the API request cap must stay
|
||||
// well below common provider limits (e.g., 131K total on self-hosted
|
||||
// vLLM/SGLang). The cap should never exceed 65K.
|
||||
@@ -1075,8 +1081,84 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
|
||||
assert_eq!(v4_cap, flash_cap);
|
||||
}
|
||||
|
||||
/// Test-only guard that overrides the `DEEPSEEK_MAX_OUTPUT_TOKENS` environment
/// variable for the lifetime of the value and restores the prior state on drop.
///
/// The guard records whatever value (or absence of a value) was present when
/// it was created. Dropping it writes that value back, or removes the variable
/// if none was present.
///
/// The process environment is global, so callers must hold the
/// `lock_test_env()` guard for as long as this value is alive.
#[must_use = "the override is reverted as soon as the guard is dropped; bind it to a named variable"]
struct ScopedDeepSeekMaxOutputTokens {
    /// Value of the variable observed just before the override was applied;
    /// `None` when the variable was not set.
    previous: Option<OsString>,
}

impl ScopedDeepSeekMaxOutputTokens {
    /// Name of the environment variable managed by this guard.
    const VAR: &'static str = "DEEPSEEK_MAX_OUTPUT_TOKENS";

    /// Sets the variable to `value` and returns a guard that restores the
    /// previous state when dropped.
    fn set(value: &str) -> Self {
        Self::apply(Some(value))
    }

    /// Removes the variable and returns a guard that restores the previous
    /// state when dropped.
    fn unset() -> Self {
        Self::apply(None)
    }

    /// Snapshots the current value, then sets (`Some`) or removes (`None`) the
    /// variable. Shared by [`Self::set`] and [`Self::unset`] so the
    /// snapshot-then-mutate order lives in one place.
    fn apply(value: Option<&str>) -> Self {
        // Capture the old value first; it is what Drop writes back.
        let previous = std::env::var_os(Self::VAR);
        // SAFETY: tests using this helper serialize with lock_test_env() and
        // the original value is restored in Drop.
        unsafe {
            match value {
                Some(value) => std::env::set_var(Self::VAR, value),
                None => std::env::remove_var(Self::VAR),
            }
        }
        Self { previous }
    }
}

impl Drop for ScopedDeepSeekMaxOutputTokens {
    /// Restores the variable to the state captured at construction.
    fn drop(&mut self) {
        // SAFETY: tests using this helper serialize with lock_test_env().
        unsafe {
            match self.previous.take() {
                Some(previous) => std::env::set_var(Self::VAR, previous),
                None => std::env::remove_var(Self::VAR),
            }
        }
    }
}
|
||||
|
||||
/// A positive integer in `DEEPSEEK_MAX_OUTPUT_TOKENS` is returned verbatim by
/// `effective_max_output_tokens` for every model name checked here.
#[test]
fn effective_max_output_tokens_env_override_returns_positive_value() {
    // Hold the env lock for the whole test; the guard below mutates process
    // state shared with other tests.
    let _lock = lock_test_env();
    let _guard = ScopedDeepSeekMaxOutputTokens::set("16384");

    // Override applies regardless of model — V4 hosted, V4 flash, sub-500K
    // self-hosted all return the env value verbatim.
    for model in ["deepseek-v4-pro", "deepseek-v4-flash", "qwen3-32b-256k"] {
        let cap = effective_max_output_tokens(model);
        assert_eq!(cap, 16_384);
    }
}
|
||||
|
||||
/// Values of `DEEPSEEK_MAX_OUTPUT_TOKENS` that are zero, negative, blank, or
/// non-numeric must be ignored, leaving the heuristic result unchanged.
#[test]
fn effective_max_output_tokens_env_override_rejects_zero_and_invalid() {
    let _lock = lock_test_env();

    // Establish the heuristic baseline with the env unset, then let the guard
    // restore whatever was there before the loop installs its own overrides.
    let unset_guard = ScopedDeepSeekMaxOutputTokens::unset();
    let baseline = effective_max_output_tokens("deepseek-v4-pro");
    drop(unset_guard);
    assert!(baseline > 0);

    // 0, non-numeric, empty, whitespace-only, and negative values must all
    // fall through to the heuristic rather than producing a zero/garbage cap
    // that would silently break request budgeting.
    let rejected_values = ["0", "abc", "", " ", "-1"];
    for raw in rejected_values {
        // Each iteration gets its own guard, dropped at the end of the body.
        let _guard = ScopedDeepSeekMaxOutputTokens::set(raw);
        let observed = effective_max_output_tokens("deepseek-v4-pro");
        assert_eq!(
            observed, baseline,
            "env={raw:?} should fall through to heuristic"
        );
    }
}
|
||||
|
||||
#[test]
|
||||
fn internal_context_budget_tiers_reserved_output_by_window() {
|
||||
// Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
|
||||
// both branches below see a stable env.
|
||||
let _lock = lock_test_env();
|
||||
// Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
|
||||
// headroom so long V4 sessions don't compact prematurely.
|
||||
let internal_budget =
|
||||
|
||||
Reference in New Issue
Block a user