feat(api): default DeepSeek to beta endpoint

Closes #941.

Refs #938, #939, #940.
2026-05-06 21:24:59 -05:00
parent 0ee298bd77
commit c7ed05a07c
10 changed files with 111 additions and 49 deletions
@@ -329,7 +329,9 @@ UI locale is separate from model language — set `locale` in `settings.toml`, u
 | `deepseek-v4-pro` | 1M | $0.003625 / 1M* | $0.435 / 1M* | $0.87 / 1M* |
 | `deepseek-v4-flash` | 1M | $0.0028 / 1M | $0.14 / 1M | $0.28 / 1M |

-Legacy aliases `deepseek-chat` / `deepseek-reasoner` map to `deepseek-v4-flash`. NVIDIA NIM variants use your NVIDIA account terms.
+DeepSeek Platform defaults to `https://api.deepseek.com/beta` as of v0.8.16 so beta-gated API features can be tested without extra setup. Set `base_url = "https://api.deepseek.com"` to opt out.
+
+Legacy aliases `deepseek-chat` / `deepseek-reasoner` map to `deepseek-v4-flash` and retire after July 24, 2026. NVIDIA NIM variants use your NVIDIA account terms.

 *DeepSeek Pro rates currently reflect a limited-time 75% discount, which remains valid until 15:59 UTC on 31 May 2026. After that time, the TUI cost estimator will revert to the base Pro rates.*

@@ -17,9 +17,9 @@
 # defaults when `[providers.deepseek]` is absent (backward compatibility).
 provider = "deepseek" # deepseek | nvidia-nim | openrouter | novita | fireworks | sglang | vllm | ollama
 api_key = "YOUR_DEEPSEEK_API_KEY" # must be non-empty
-base_url = "https://api.deepseek.com"
+base_url = "https://api.deepseek.com/beta"
 # base_url = "https://api.deepseeki.com"         # China users
-# base_url = "https://api.deepseek.com/beta"     # DeepSeek beta features such as strict tool mode
+# base_url = "https://api.deepseek.com"          # opt out of DeepSeek beta features
 # Optional custom model request headers for OpenAI-compatible gateways.
 # Authorization and Content-Type are managed by the client and cannot be overridden here.
 # http_headers = { "X-Model-Provider-Id" = "your-model-provider" }
@@ -163,7 +163,7 @@ max_subagents = 10 # optional (1-20)
 # DeepSeek Platform (https://platform.deepseek.com)
 [providers.deepseek]
 # api_key = "YOUR_DEEPSEEK_API_KEY"
-# base_url = "https://api.deepseek.com"
+# base_url = "https://api.deepseek.com/beta"
 # model = "deepseek-v4-pro"
 # http_headers = { "X-Model-Provider-Id" = "your-model-provider" } # optional custom request headers

@@ -296,8 +296,8 @@ verbatim_window_turns = 16
 l1_threshold = 192000
 l2_threshold = 384000
 l3_threshold = 576000
-# Hard cycle also reserves the normal 262144-token output budget plus 1024
-# safety tokens against the model window.
+# Hard cycle reserves the normal 262144-token internal turn budget plus 1024
+# safety tokens, separate from V4's official 384000 max-output metadata.
 cycle_threshold = 768000
 seam_model = "deepseek-v4-flash"

@@ -347,7 +347,7 @@ fallback_default_prior = 3.8
 # Select a profile with `deepseek --profile <name>` or `DEEPSEEK_PROFILE=<name>`.
 [profiles.work]
 api_key = "WORK_DEEPSEEK_API_KEY"
-base_url = "https://api.deepseek.com"
+base_url = "https://api.deepseek.com/beta"

 [profiles.dev]
 api_key = "DEV_DEEPSEEK_API_KEY"
@@ -18,7 +18,7 @@ const DEFAULT_DEEPSEEK_MODEL: &str = "deepseek-v4-pro";
 const DEFAULT_NVIDIA_NIM_MODEL: &str = "deepseek-ai/deepseek-v4-pro";
 const DEFAULT_NVIDIA_NIM_FLASH_MODEL: &str = "deepseek-ai/deepseek-v4-flash";
 const DEFAULT_OPENAI_MODEL: &str = "gpt-4.1";
-const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com";
+const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com/beta";
 const DEFAULT_NVIDIA_NIM_BASE_URL: &str = "https://integrate.api.nvidia.com/v1";
 const DEFAULT_OPENAI_BASE_URL: &str = "https://api.openai.com/v1";
 const DEFAULT_OPENROUTER_MODEL: &str = "deepseek/deepseek-v4-pro";
@@ -1492,6 +1492,19 @@ mod tests {
        assert_eq!(resolved.model, "deepseek-v4-pro");
    }

+    #[test]
+    fn deepseek_runtime_defaults_to_beta_endpoint() {
+        let _lock = env_lock();
+        let _env = EnvGuard::without_deepseek_runtime_overrides();
+        let config = ConfigToml::default();
+
+        let resolved = config.resolve_runtime_options(&CliRuntimeOverrides::default());
+
+        assert_eq!(resolved.provider, ProviderKind::Deepseek);
+        assert_eq!(resolved.base_url, DEFAULT_DEEPSEEK_BASE_URL);
+        assert_eq!(resolved.model, DEFAULT_DEEPSEEK_MODEL);
+    }
+
    #[test]
    fn provider_specific_deepseek_fields_override_tui_compat_fields() {
        let _lock = env_lock();
@@ -367,11 +367,24 @@ pub(super) fn versioned_base_url(base_url: &str) -> String {
    }
 }

+fn unversioned_base_url(base_url: &str) -> String {
+    let trimmed = base_url.trim_end_matches('/');
+    trimmed
+        .strip_suffix("/v1")
+        .or_else(|| trimmed.strip_suffix("/beta"))
+        .unwrap_or(trimmed)
+        .to_string()
+}
+
 pub(super) fn api_url(base_url: &str, path: &str) -> String {
+    let path = path.trim_start_matches('/');
+    if path.starts_with("beta/") {
+        return format!("{}/{}", unversioned_base_url(base_url), path);
+    }
    format!(
        "{}/{}",
        versioned_base_url(base_url).trim_end_matches('/'),
-        path.trim_start_matches('/')
+        path
    )
 }

@@ -1021,6 +1034,22 @@ mod tests {
        );
    }

+    #[test]
+    fn api_url_routes_beta_paths_from_any_deepseek_base() {
+        assert_eq!(
+            api_url("https://api.deepseek.com", "beta/completions"),
+            "https://api.deepseek.com/beta/completions"
+        );
+        assert_eq!(
+            api_url("https://api.deepseek.com/v1", "beta/completions"),
+            "https://api.deepseek.com/beta/completions"
+        );
+        assert_eq!(
+            api_url("https://api.deepseek.com/beta", "beta/completions"),
+            "https://api.deepseek.com/beta/completions"
+        );
+    }
+
    #[test]
    fn default_headers_include_custom_headers_when_configured() {
        let mut extra = HashMap::new();
@@ -20,6 +20,7 @@ use crate::hooks::HooksConfig;
 pub const DEFAULT_MAX_SUBAGENTS: usize = 10;
 pub const MAX_SUBAGENTS: usize = 20;
 pub const DEFAULT_TEXT_MODEL: &str = "deepseek-v4-pro";
+pub const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com/beta";
 pub const DEFAULT_NVIDIA_NIM_MODEL: &str = "deepseek-ai/deepseek-v4-pro";
 pub const DEFAULT_NVIDIA_NIM_FLASH_MODEL: &str = "deepseek-ai/deepseek-v4-flash";
 pub const DEFAULT_NVIDIA_NIM_BASE_URL: &str = "https://integrate.api.nvidia.com/v1";
@@ -148,7 +149,10 @@ pub struct ProviderCapability {
    pub resolved_model: String,
    /// Context window in tokens (the maximum input the model can accept).
    pub context_window: u32,
-    /// Recommended maximum output tokens (`max_tokens`) for this combo.
+    /// Official maximum output tokens for this combo.
+    ///
+    /// This is model metadata for diagnostics and CI policy. Normal turns use
+    /// a separate, more conservative request cap in the engine.
    pub max_output: u32,
    /// Whether the provider+model supports thinking/reasoning mode.
    pub thinking_supported: bool,
@@ -199,9 +203,10 @@ pub fn provider_capability(provider: ApiProvider, resolved_model: &str) -> Provi
            .unwrap_or(crate::models::LEGACY_DEEPSEEK_CONTEXT_WINDOW_TOKENS)
    };

-    // Max output tokens: DeepSeek V4 models allow 262K; others get 4096.
+    // Max output tokens: official DeepSeek V4 API metadata lists 384K;
+    // runtime request caps remain separate and more conservative.
    let max_output = if is_v4_pro || is_v4_flash {
-        262_144
+        384_000
    } else {
        4096
    };
@@ -1231,7 +1236,7 @@ impl Config {
        };
        let base = provider_base.or(root_base).unwrap_or_else(|| {
            match provider {
-                ApiProvider::Deepseek => "https://api.deepseek.com",
+                ApiProvider::Deepseek => DEFAULT_DEEPSEEK_BASE_URL,
                ApiProvider::DeepseekCN => DEFAULT_DEEPSEEKCN_BASE_URL,
                ApiProvider::NvidiaNim => DEFAULT_NVIDIA_NIM_BASE_URL,
                ApiProvider::Openrouter => DEFAULT_OPENROUTER_BASE_URL,
@@ -1704,8 +1709,9 @@ pub fn ensure_config_file_exists(path: Option<PathBuf>) -> Result<Option<PathBuf
 # Get your API key from https://platform.deepseek.com
 # Save it with: deepseek auth set --provider deepseek

-# Base URL (default: https://api.deepseek.com)
-# base_url = "https://api.deepseek.com"
+# Base URL (default: https://api.deepseek.com/beta)
+# Set https://api.deepseek.com to opt out of beta features.
+# base_url = "https://api.deepseek.com/beta"

 # Default model
 default_text_model = "{default_model}"
@@ -2651,8 +2657,9 @@ fn save_api_key_to_config_file(api_key: &str) -> Result<PathBuf> {

 api_key = "{key_to_write}"

-# Base URL (default: https://api.deepseek.com)
-# base_url = "https://api.deepseek.com"
+# Base URL (default: https://api.deepseek.com/beta)
+# Set https://api.deepseek.com to opt out of beta features.
+# base_url = "https://api.deepseek.com/beta"

 # Default model
 default_text_model = "{default_model}"
@@ -3916,6 +3923,25 @@ api_key = "old-openrouter-key"
        Ok(())
    }

+    #[test]
+    fn deepseek_provider_defaults_to_beta_endpoint() {
+        let config = Config::default();
+
+        assert_eq!(config.api_provider(), ApiProvider::Deepseek);
+        assert_eq!(config.deepseek_base_url(), DEFAULT_DEEPSEEK_BASE_URL);
+    }
+
+    #[test]
+    fn explicit_deepseek_base_url_overrides_beta_default() {
+        let config = Config {
+            base_url: Some("https://api.deepseek.com".to_string()),
+            ..Default::default()
+        };
+
+        assert_eq!(config.api_provider(), ApiProvider::Deepseek);
+        assert_eq!(config.deepseek_base_url(), "https://api.deepseek.com");
+    }
+
    #[test]
    fn deepseek_model_env_overrides_default_text_model() -> Result<()> {
        let _lock = lock_test_env();
@@ -4752,7 +4778,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(cap.cache_telemetry_supported);
        assert_eq!(
@@ -4768,7 +4794,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(cap.cache_telemetry_supported);
    }
@@ -4780,7 +4806,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(cap.cache_telemetry_supported);
        assert_eq!(
@@ -4796,7 +4822,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(cap.cache_telemetry_supported);
    }
@@ -4808,7 +4834,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        // OpenRouter does not return DeepSeek prompt-cache telemetry.
        assert!(!cap.cache_telemetry_supported);
@@ -4825,7 +4851,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(!cap.cache_telemetry_supported);
    }
@@ -4837,7 +4863,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(!cap.cache_telemetry_supported);
    }
@@ -4849,7 +4875,7 @@ model = "deepseek-v4-pro"
            cap.context_window,
            crate::models::DEEPSEEK_V4_CONTEXT_WINDOW_TOKENS
        );
-        assert_eq!(cap.max_output, 262_144);
+        assert_eq!(cap.max_output, 384_000);
        assert!(cap.thinking_supported);
        assert!(!cap.cache_telemetry_supported);
    }
@@ -1379,13 +1379,7 @@ fn run_setup_status(config: &Config, workspace: &Path) -> Result<()> {
            );
        }
    }
-    println!(
-        "  · base_url: {}",
-        config
-            .base_url
-            .as_deref()
-            .unwrap_or("https://api.deepseek.com")
-    );
+    println!("  · base_url: {}", config.deepseek_base_url());
    let model = config
        .default_text_model
        .clone()
@@ -4278,7 +4272,7 @@ mod doctor_endpoint_tests {
        let target = doctor_api_target(&config);

        assert_eq!(target.provider, "deepseek");
-        assert_eq!(target.base_url, "https://api.deepseek.com");
+        assert_eq!(target.base_url, crate::config::DEFAULT_DEEPSEEK_BASE_URL);
        assert_eq!(target.model, crate::config::DEFAULT_TEXT_MODEL);
    }

@@ -109,11 +109,11 @@ Current boundary note (v0.8.6):
 #### DeepSeek API Endpoints

 DeepSeek exposes OpenAI-compatible endpoints. The CLI uses:
- `https://api.deepseek.com/v1/chat/completions` - normal and streaming model turns
- `https://api.deepseek.com/v1/models` - live model discovery and health checks
+- `https://api.deepseek.com/beta/chat/completions` - model turns (default endpoint as of v0.8.16)
+- `https://api.deepseek.com/beta/models` - live model discovery and health checks (default endpoint as of v0.8.16)

 `https://api.deepseek.com/v1` is accepted for OpenAI SDK compatibility, and
-`https://api.deepseek.com/beta` can be configured for beta-only features such as
+either it or the bare `https://api.deepseek.com` can be set explicitly to opt out of beta-only features such as
 strict tool mode, chat prefix completion, and FIM completion. The public
 DeepSeek docs do not document a Responses API path for this workflow; the engine
 drives turns through Chat Completions.
@@ -94,7 +94,7 @@ default_text_model = "deepseek-v4-pro"

 [profiles.work]
 api_key = "WORK_KEY"
-base_url = "https://api.deepseek.com"
+base_url = "https://api.deepseek.com/beta"

 [profiles.nvidia-nim]
 provider = "nvidia-nim"
@@ -296,7 +296,7 @@ separate:
 | Quantity | Meaning | Allowed to drive |
 |---|---|---|
 | Active request input estimate | Conservative estimate of the next request's live system prompt and transcript payload. | Header/footer context percent, hard-cycle trigger, opt-in Flash seam trigger, and emergency overflow preflight. |
-| Reserved response headroom | The requested `max_tokens` budget plus safety headroom. v0.7.5 keeps normal turns at `262144` output tokens and adds `1024` safety tokens for context-window checks. | Hard-cycle and emergency overflow budget checks only. |
+| Reserved response headroom | The internal turn budget plus safety headroom. v0.8.16 keeps normal turns at `262144` reserved output tokens and adds `1024` safety tokens for context-window checks, even though V4 capability metadata reports the official `384000` max output. | Hard-cycle and emergency overflow budget checks only. |
 | Cumulative API usage | Provider-reported input plus output tokens summed across completed API calls; multi-tool turns may count the same stable prefix more than once. | Session usage and approximate cost telemetry only. |
 | Prompt cache hit/miss | Provider cache telemetry for the most recent call when available. | Cache-hit display and cost estimation only; never compaction, seam, or cycle triggers. |
 | Context percent | Active request input estimate divided by the model context window. | Display only; it mirrors the active-input basis used by context safeguards. |
@@ -327,8 +327,8 @@ If you are upgrading from older releases:

 - `provider` (string, optional): `deepseek` (default), `deepseek-cn`, `nvidia-nim`, `openrouter`, `novita`, `fireworks`, `sglang`, `vllm`, or `ollama`. `deepseek-cn` uses DeepSeek's mainland China endpoint (`https://api.deepseeki.com`); `nvidia-nim` targets NVIDIA's NIM-hosted DeepSeek endpoints through `https://integrate.api.nvidia.com/v1`; `fireworks` targets `https://api.fireworks.ai/inference/v1`; `sglang` targets a self-hosted OpenAI-compatible endpoint, defaulting to `http://localhost:30000/v1`; `vllm` targets a self-hosted vLLM OpenAI-compatible endpoint, defaulting to `http://localhost:8000/v1`; `ollama` targets Ollama's OpenAI-compatible endpoint, defaulting to `http://localhost:11434/v1`.
 - `api_key` (string, required for hosted providers): must be non-empty for DeepSeek/hosted providers (or set the provider API key env var). Self-hosted SGLang, vLLM, and Ollama can omit it.
- `base_url` (string, optional): defaults to `https://api.deepseek.com` for DeepSeek's OpenAI-compatible Chat Completions API, `https://api.deepseeki.com` for `provider = "deepseek-cn"`, or the provider-specific endpoint for hosted/self-hosted providers. `https://api.deepseek.com/v1` is also accepted for SDK compatibility; use `https://api.deepseek.com/beta` only for DeepSeek beta features such as strict tool mode, chat prefix completion, and FIM completion.
- `default_text_model` (string, optional): defaults to `deepseek-v4-pro` for DeepSeek, `deepseek-ai/deepseek-v4-pro` for NVIDIA NIM, `accounts/fireworks/models/deepseek-v4-pro` for Fireworks, `deepseek-ai/DeepSeek-V4-Pro` for SGLang/vLLM, and `deepseek-coder:1.3b` for Ollama. Current public DeepSeek IDs are `deepseek-v4-pro` and `deepseek-v4-flash`, both with 1M context windows and thinking mode enabled by default. Legacy `deepseek-chat` and `deepseek-reasoner` remain compatibility aliases for `deepseek-v4-flash`. Provider-specific mappings translate `deepseek-v4-pro` / `deepseek-v4-flash` to each provider's model ID where supported. Ollama model tags are passed through unchanged. Use `/models` or `deepseek models` to discover live IDs from your configured endpoint. `DEEPSEEK_MODEL` overrides this for a single process.
+- `base_url` (string, optional): defaults to `https://api.deepseek.com/beta` for DeepSeek's OpenAI-compatible Chat Completions API in v0.8.16, `https://api.deepseeki.com` for `provider = "deepseek-cn"`, or the provider-specific endpoint for hosted/self-hosted providers. Set `https://api.deepseek.com` or `https://api.deepseek.com/v1` explicitly to opt out of DeepSeek beta features.
+- `default_text_model` (string, optional): defaults to `deepseek-v4-pro` for DeepSeek, `deepseek-ai/deepseek-v4-pro` for NVIDIA NIM, `accounts/fireworks/models/deepseek-v4-pro` for Fireworks, `deepseek-ai/DeepSeek-V4-Pro` for SGLang/vLLM, and `deepseek-coder:1.3b` for Ollama. Current public DeepSeek IDs are `deepseek-v4-pro` and `deepseek-v4-flash`, both with 1M context windows, 384K max output, and thinking mode enabled by default. Legacy `deepseek-chat` and `deepseek-reasoner` remain compatibility aliases for `deepseek-v4-flash` until July 24, 2026. Provider-specific mappings translate `deepseek-v4-pro` / `deepseek-v4-flash` to each provider's model ID where supported. Ollama model tags are passed through unchanged. Use `/models` or `deepseek models` to discover live IDs from your configured endpoint. `DEEPSEEK_MODEL` overrides this for a single process.
 - `reasoning_effort` (string, optional): `off`, `low`, `medium`, `high`, or `max`; defaults to the configured UI tier. DeepSeek Platform receives top-level `thinking` / `reasoning_effort` fields. NVIDIA NIM receives equivalent settings through `chat_template_kwargs`.
 - `allow_shell` (bool, optional): defaults to `true` (sandboxed).
 - `approval_policy` (string, optional): `on-request`, `untrusted`, or `never`. Runtime `approval_mode` editing in `/config` also accepts `on-request` and `untrusted` aliases.
@@ -550,14 +550,12 @@ The `capability` key contains per-provider capability info derived from
 static knowledge (release docs, API guides) rather than live API probes.
 Top-level sub-keys: `resolved_provider`, `resolved_model`, `context_window`,
 `max_output`, `thinking_supported`, `cache_telemetry_supported`,
-`request_payload_mode`, and `deprecation`. When the resolved model is a known
-legacy alias (e.g. `deepseek-chat`, `deepseek-reasoner`), the `deprecation`
-sub-object carries `alias`, `replacement`, and `notice` fields.
+and `request_payload_mode`.

-Use `capability.context_window` and `capability.max_output` for context-window
-budgeting in CI scripts. Use `capability.thinking_supported` to decide whether
-to configure reasoning effort. Use `capability.deprecation` to warn users about
-legacy model aliases.
+Use `capability.context_window` and `capability.max_output` for model-limit
+checks in CI scripts; do not treat `capability.max_output` as the per-turn
+request budget. Use `capability.thinking_supported` to decide whether to
+configure reasoning effort.

 ## Setup status, clean, and extension dirs

@@ -24,7 +24,7 @@ Symptoms:
 Checks:
 1. Inspect retry/health logs (`deepseek_cli::client`)
 2. Verify endpoint connectivity:
-   - `curl -sS https://api.deepseek.com/v1/models -H "Authorization: Bearer $DEEPSEEK_API_KEY"`
+   - `curl -sS https://api.deepseek.com/beta/models -H "Authorization: Bearer $DEEPSEEK_API_KEY"`
 3. Confirm no local sandbox/permission deadlock in tool output

 Actions:
@@ -92,7 +92,7 @@ deepseek doctor --json
  "api_key": {
    "source": "env"
  },
-  "base_url": "https://api.deepseek.com",
+  "base_url": "https://api.deepseek.com/beta",
  "default_text_model": "deepseek-v4-pro",
  "memory": {
    "enabled": false,