feat(engine): allow DEEPSEEK_MAX_OUTPUT_TOKENS env override for tight-context providers (#2147)

The `effective_max_output_tokens` heuristic defaults to 64K (64,000 tokens — half of the 128,000-token fallback window) for any model not in the known-context-window table. This is fine for DeepSeek's hosted API (1M context) but causes immediate HTTP 400s on self-hosted providers with a tight `max-model-len`. Example: vLLM serving Qwen3.6 with `--max-model-len 65536` rejects requests because 64000 (output) + ~1500 (input) exceeds the limit by 1 token. This change lets the operator set `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` (or whatever fits their deployment) to override the heuristic. The env var takes precedence over the model-table lookup when set to a positive integer; otherwise (unset, empty, zero, negative, or non-numeric) the existing behavior is preserved. No new config struct field — the env-only override keeps the public API unchanged. Useful for embedded users (e.g. pinvou3) who need to control the output budget without forking the engine config schema. Co-authored-by: hexin <he.xin@h3c.com>
2026-05-26 23:31:26 +08:00
parent 60a3069705
commit 4f3a0c3cfc
2 changed files with 96 additions and 0 deletions
@@ -28,7 +28,21 @@ const API_MAX_OUTPUT_TOKENS: u32 = 65_536;
 /// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
 /// limits (128K+ total). For non-V4 models with smaller context windows, caps
 /// at half the context window.
+///
+/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
+/// integer, this function returns that value directly. Use this for self-hosted
+/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
+/// model-table heuristic above would over-allocate. Example: vLLM serving
+/// Qwen3.6 with `--max-model-len 65536` should set
+/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
+/// provider's hard limit.
 pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
+    if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
+        && let Ok(n) = raw.trim().parse::<u32>()
+        && n > 0
+    {
+        return n;
+    }
    let window = context_window_for_model(model).unwrap_or(128_000);
    if window >= 500_000 {
        // V4-class models on large-context providers: use 64K which is safe
@@ -1047,6 +1047,9 @@ fn detects_context_length_errors_from_provider_payloads() {

 #[test]
 fn context_budget_reserves_output_and_headroom() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // the internal effective_max_output_tokens() call sees a stable env.
+    let _lock = lock_test_env();
    // V4 has a 1M context window — the only family that comfortably hosts
    // a 256K output reservation without saturating the input budget to 0.
    let budget = context_input_budget("deepseek-v4-pro")
@@ -1058,6 +1061,9 @@ fn context_budget_reserves_output_and_headroom() {

 #[test]
 fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // v4_cap and flash_cap below see the same env state.
+    let _lock = lock_test_env();
    // V4 models have a 1M context window but the API request cap must stay
    // well below common provider limits (e.g., 131K total on self-hosted
    // vLLM/SGLang). The cap should never exceed 65K.
@@ -1075,8 +1081,84 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
    assert_eq!(v4_cap, flash_cap);
 }

+struct ScopedDeepSeekMaxOutputTokens {
+    previous: Option<OsString>, // pre-override value of the env var; None if it was unset
+}
+
+impl ScopedDeepSeekMaxOutputTokens {
+    fn set(value: &str) -> Self {
+        let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        // SAFETY: env mutation races with concurrent env access; tests using this helper
+        // hold lock_test_env() first, and Drop restores the value captured above.
+        unsafe {
+            std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", value);
+        }
+        Self { previous }
+    }
+
+    fn unset() -> Self {
+        let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        // SAFETY: same argument as set(): callers are expected to hold lock_test_env().
+        unsafe {
+            std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        }
+        Self { previous }
+    }
+}
+
+impl Drop for ScopedDeepSeekMaxOutputTokens {
+    fn drop(&mut self) {
+        // SAFETY: callers are expected to still hold lock_test_env() when the guard drops.
+        unsafe {
+            if let Some(previous) = self.previous.take() {
+                std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", previous);
+            } else {
+                std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS");
+            }
+        }
+    }
+}
+
+#[test]
+fn effective_max_output_tokens_env_override_returns_positive_value() {
+    let _lock = lock_test_env();
+    let _guard = ScopedDeepSeekMaxOutputTokens::set("16384");
+
+    // The override is model-independent: both V4 names and a non-V4 model name
+    // must return the env value verbatim, bypassing the window heuristic.
+    assert_eq!(effective_max_output_tokens("deepseek-v4-pro"), 16_384);
+    assert_eq!(effective_max_output_tokens("deepseek-v4-flash"), 16_384);
+    assert_eq!(effective_max_output_tokens("qwen3-32b-256k"), 16_384);
+}
+
+#[test]
+fn effective_max_output_tokens_env_override_rejects_zero_and_invalid() {
+    let _lock = lock_test_env();
+    // Establish the heuristic baseline with the env var removed; the scoped guard
+    // restores the prior value as soon as this block ends.
+    let baseline = {
+        let _guard = ScopedDeepSeekMaxOutputTokens::unset();
+        effective_max_output_tokens("deepseek-v4-pro")
+    };
+    assert!(baseline > 0);
+
+    // Zero, non-numeric, empty, whitespace-only, and negative values must all
+    // fall through to the heuristic rather than producing a zero/garbage cap
+    // that would silently break request budgeting.
+    for raw in ["0", "abc", "", "  ", "-1"] {
+        let _guard = ScopedDeepSeekMaxOutputTokens::set(raw);
+        assert_eq!(
+            effective_max_output_tokens("deepseek-v4-pro"),
+            baseline,
+            "env={raw:?} should fall through to heuristic"
+        );
+    }
+}
+
 #[test]
 fn internal_context_budget_tiers_reserved_output_by_window() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // both branches below see a stable env.
+    let _lock = lock_test_env();
    // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
    // headroom so long V4 sessions don't compact prematurely.
    let internal_budget =