feat(subagents): make step API timeout configurable

2026-05-21 00:45:04 +08:00
parent 2c642ec375
commit ad122fd6f8
9 changed files with 146 additions and 8 deletions
@@ -146,6 +146,7 @@ max_subagents = 10 # optional (1-20)
 # Optional sub-agent tuning. max_concurrent overrides top-level max_subagents.
 # [subagents]
 # max_concurrent = 10
+# api_timeout_secs = 120 # per-step API timeout, clamped to 1..=1800

 # Optional managed policy paths (defaults to /etc/deepseek/*.toml on unix):
 # managed_config_path = "/etc/deepseek/managed_config.toml"
@@ -19,6 +19,18 @@ use crate::hooks::HooksConfig;

 pub const DEFAULT_MAX_SUBAGENTS: usize = 10;
 pub const MAX_SUBAGENTS: usize = 20;
+/// Default per-step DeepSeek API timeout for sub-agent requests, in seconds.
+/// Matches the legacy hardcoded value so existing configs keep their old
+/// behavior when `[subagents] api_timeout_secs` is unset (#1806, #1808).
+pub const DEFAULT_SUBAGENT_API_TIMEOUT_SECS: u64 = 120;
+/// Minimum accepted `[subagents] api_timeout_secs`. Anything lower (including
+/// `0`, which would otherwise produce an immediate timeout footgun) clamps
+/// up to this value before the runtime sees it.
+pub const MIN_SUBAGENT_API_TIMEOUT_SECS: u64 = 1;
+/// Maximum accepted `[subagents] api_timeout_secs` (30 minutes). The cap
+/// keeps a misconfigured per-step timeout from masking real model/network
+/// hangs forever.
+pub const MAX_SUBAGENT_API_TIMEOUT_SECS: u64 = 1800;
 pub const DEFAULT_TEXT_MODEL: &str = "deepseek-v4-pro";
 pub const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com/beta";
 pub const DEFAULT_NVIDIA_NIM_MODEL: &str = "deepseek-ai/deepseek-v4-pro";
@@ -879,6 +891,14 @@ pub struct SubagentsConfig {
    /// setting. Clamped to [1, MAX_SUBAGENTS].
    #[serde(default)]
    pub max_concurrent: Option<usize>,
+    /// Per-step DeepSeek API timeout for sub-agent requests, in seconds. The
+    /// timeout wraps `client.create_message` so a stuck single step cannot
+    /// pin the parent's parent-completion wakeup channel indefinitely.
+    /// Defaults to `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120) and is clamped
+    /// to `MIN_SUBAGENT_API_TIMEOUT_SECS..=MAX_SUBAGENT_API_TIMEOUT_SECS`
+    /// (1..=1800). Zero or unset uses the legacy 120s default (#1806, #1808).
+    #[serde(default)]
+    pub api_timeout_secs: Option<u64>,
 }

 /// `[auto]` table — knobs for the `--model auto` / `/model auto` router.
@@ -1815,6 +1835,27 @@ impl Config {
            .clamp(1, MAX_SUBAGENTS)
    }

+    /// Resolved per-step DeepSeek API timeout for sub-agents, in seconds.
+    ///
+    /// Reads `[subagents] api_timeout_secs` and clamps to
+    /// `[MIN_SUBAGENT_API_TIMEOUT_SECS, MAX_SUBAGENT_API_TIMEOUT_SECS]`
+    /// (1..=1800). `None` or `0` resolve to the legacy
+    /// `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120) so existing configs keep
+    /// their old behavior; explicit `1` is honored, useful only in fast
+    /// fail-fast tests, not production (#1806, #1808).
+    #[must_use]
+    pub fn subagent_api_timeout_secs(&self) -> u64 {
+        let raw = self
+            .subagents
+            .as_ref()
+            .and_then(|cfg| cfg.api_timeout_secs)
+            .unwrap_or(DEFAULT_SUBAGENT_API_TIMEOUT_SECS);
+        if raw == 0 {
+            return DEFAULT_SUBAGENT_API_TIMEOUT_SECS;
+        }
+        raw.clamp(MIN_SUBAGENT_API_TIMEOUT_SECS, MAX_SUBAGENT_API_TIMEOUT_SECS)
+    }
+
    /// Raw sub-agent model override map. Values are validated at spawn time
    /// so an invalid role/type model fails before any partial agent spawn.
    #[must_use]
@@ -3970,6 +4011,47 @@ mod tests {
        assert_eq!(high.max_subagents(), MAX_SUBAGENTS);
    }

+    #[test]
+    fn subagent_api_timeout_defaults_and_clamps() {
+        assert_eq!(
+            Config::default().subagent_api_timeout_secs(),
+            DEFAULT_SUBAGENT_API_TIMEOUT_SECS
+        );
+
+        let zero = Config {
+            subagents: Some(SubagentsConfig {
+                api_timeout_secs: Some(0),
+                ..SubagentsConfig::default()
+            }),
+            ..Config::default()
+        };
+        assert_eq!(
+            zero.subagent_api_timeout_secs(),
+            DEFAULT_SUBAGENT_API_TIMEOUT_SECS
+        );
+
+        let explicit_min = Config {
+            subagents: Some(SubagentsConfig {
+                api_timeout_secs: Some(MIN_SUBAGENT_API_TIMEOUT_SECS),
+                ..SubagentsConfig::default()
+            }),
+            ..Config::default()
+        };
+        assert_eq!(explicit_min.subagent_api_timeout_secs(), 1);
+
+        let high = Config {
+            subagents: Some(SubagentsConfig {
+                api_timeout_secs: Some(MAX_SUBAGENT_API_TIMEOUT_SECS + 60),
+                ..SubagentsConfig::default()
+            }),
+            ..Config::default()
+        };
+        assert_eq!(
+            high.subagent_api_timeout_secs(),
+            MAX_SUBAGENT_API_TIMEOUT_SECS
+        );
+    }
+
    #[test]
    fn save_api_key_writes_config_file_under_cfg_test() -> Result<()> {
        // `save_api_key` writes to the shared user config file. This
@@ -166,6 +166,11 @@ pub struct EngineConfig {
    pub search_provider: crate::config::SearchProvider,
    /// API key for Tavily or Bocha. `None` for Bing or DuckDuckGo.
    pub search_api_key: Option<String>,
+    /// Per-step DeepSeek API timeout for sub-agent `create_message` requests.
+    /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800)
+    /// once at engine construction, then threaded onto every
+    /// `SubAgentRuntime` the engine builds (#1806, #1808).
+    pub subagent_api_timeout: Duration,
 }

 impl Default for EngineConfig {
@@ -206,6 +211,9 @@ impl Default for EngineConfig {
            workshop: None,
            search_provider: crate::config::SearchProvider::default(),
            search_api_key: None,
+            subagent_api_timeout: Duration::from_secs(
+                crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS,
+            ),
        }
    }
 }
@@ -656,6 +664,7 @@ impl Engine {
                        self.session.reasoning_effort_auto,
                    )
                    .with_max_spawn_depth(self.config.max_spawn_depth)
+                    .with_step_api_timeout(self.config.subagent_api_timeout)
                    .background_runtime();
                    let route = resolve_subagent_assignment_route(&runtime, None, &prompt).await;
                    runtime.model = route.model;
@@ -1063,6 +1072,7 @@ impl Engine {
                            self.session.reasoning_effort_auto,
                        )
                        .with_max_spawn_depth(self.config.max_spawn_depth)
+                        .with_step_api_timeout(self.config.subagent_api_timeout)
                        .with_parent_completion_tx(self.tx_subagent_completion.clone());
                        if let Some(context) = fork_context_for_runtime.clone() {
                            rt = rt.with_fork_context(context);
@@ -4679,6 +4679,7 @@ async fn run_exec_agent(
        lsp_config,
        runtime_services: crate::tools::spec::RuntimeToolServices::default(),
        subagent_model_overrides: config.subagent_model_overrides(),
+        subagent_api_timeout: std::time::Duration::from_secs(config.subagent_api_timeout_secs()),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
        vision_config: config.vision_model_config(),
@@ -1964,6 +1964,9 @@ impl RuntimeThreadManager {
                rlm_sessions: crate::rlm::session::new_shared_rlm_session_store(),
            },
            subagent_model_overrides: self.config.subagent_model_overrides(),
+            subagent_api_timeout: std::time::Duration::from_secs(
+                self.config.subagent_api_timeout_secs(),
+            ),
            memory_enabled: self.config.memory_enabled(),
            memory_path: self.config.memory_path(),
            vision_config: self.config.vision_model_config(),
@@ -67,7 +67,13 @@ const TOOL_TIMEOUT: Duration = Duration::from_secs(30);
 /// Per-step LLM API call timeout. Each `create_message` request must complete
 /// within this window or the step is treated as timed out. Prevents a single
 /// stuck API call from blocking the sub-agent indefinitely.
-const STEP_API_TIMEOUT: Duration = Duration::from_secs(120);
+/// Legacy fallback for the per-step DeepSeek API timeout. The active timeout
+/// now travels on `SubAgentRuntime::step_api_timeout` so users can override
+/// it via `[subagents] api_timeout_secs` in `~/.deepseek/config.toml`. The
+/// constant only exists for tests/stub runtimes that need a hard-coded
+/// default; production runtimes set the field explicitly (#1806, #1808).
+const DEFAULT_STEP_API_TIMEOUT: Duration =
+    Duration::from_secs(crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS);
 const RESULT_POLL_INTERVAL: Duration = Duration::from_millis(250);
 const DEFAULT_RESULT_TIMEOUT_MS: u64 = 30_000;
 #[allow(dead_code)] // Legacy agent_wait clamp; new agent_eval uses DEFAULT/MAX.
@@ -643,6 +649,12 @@ pub struct SubAgentRuntime {
    pub parent_completion_tx: Option<mpsc::UnboundedSender<SubAgentCompletion>>,
    /// Snapshot of the request prefix visible to an opt-in forked child.
    pub fork_context: Option<SubAgentForkContext>,
+    /// Per-step DeepSeek API timeout for the child's `create_message` call.
+    /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) at
+    /// engine construction so a slow but legitimate model turn does not
+    /// false-timeout the child mid-thinking. `child_runtime()` and
+    /// `background_runtime()` preserve the parent's value (#1806, #1808).
+    pub step_api_timeout: Duration,
 }

 impl SubAgentRuntime {
@@ -676,9 +688,20 @@ impl SubAgentRuntime {
            mailbox: None,
            parent_completion_tx: None,
            fork_context: None,
+            step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
        }
    }

+    /// Override the per-step DeepSeek API timeout (default
+    /// `DEFAULT_STEP_API_TIMEOUT`). Called by the engine after reading
+    /// `[subagents] api_timeout_secs`. Tests may use this to fail fast
+    /// without waiting the legacy 120 seconds (#1806, #1808).
+    #[must_use]
+    pub fn with_step_api_timeout(mut self, timeout: Duration) -> Self {
+        self.step_api_timeout = timeout;
+        self
+    }
+
    /// Attach the wakeup channel so the engine's parent turn loop can resume
    /// when this runtime's direct children finish (issue #756). The channel
    /// is propagated to descendants via clone, but only `spawn_depth == 1`
@@ -799,6 +822,7 @@ impl SubAgentRuntime {
            mailbox: self.mailbox.clone(),
            parent_completion_tx: self.parent_completion_tx.clone(),
            fork_context: self.fork_context.clone(),
+            step_api_timeout: self.step_api_timeout,
        }
    }

@@ -3534,8 +3558,8 @@ async fn run_subagent(
                    from_prior_session: false,
                });
            }
-            api = tokio::time::timeout(STEP_API_TIMEOUT, runtime.client.create_message(request)) => {
-                api.map_err(|_| anyhow!("API call timed out after {}s", STEP_API_TIMEOUT.as_secs()))??
+            api = tokio::time::timeout(runtime.step_api_timeout, runtime.client.create_message(request)) => {
+                api.map_err(|_| anyhow!("API call timed out after {}s", runtime.step_api_timeout.as_secs()))??
            }
        };

@@ -1164,6 +1164,7 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() {
    parent.context.auto_approve = false; // parent in suggest mode
    let child = parent.child_runtime();
    assert_eq!(child.spawn_depth, 2, "child depth = parent + 1");
+    assert_eq!(child.step_api_timeout, DEFAULT_STEP_API_TIMEOUT);
    assert!(
        !child.context.auto_approve,
        "child must inherit parent approval state"
@@ -1178,6 +1179,18 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() {
    );
 }

+#[test]
+fn child_and_background_runtimes_preserve_step_api_timeout() {
+    let timeout = Duration::from_secs(7);
+    let parent = stub_runtime().with_step_api_timeout(timeout);
+
+    let child = parent.child_runtime();
+    assert_eq!(child.step_api_timeout, timeout);
+
+    let background = parent.background_runtime();
+    assert_eq!(background.step_api_timeout, timeout);
+}
+
 #[tokio::test]
 async fn subagent_registry_blocks_approval_tools_without_parent_auto_approve() {
    let mut runtime = stub_runtime();
@@ -1434,6 +1447,7 @@ fn stub_runtime() -> SubAgentRuntime {
        mailbox: None,
        parent_completion_tx: None,
        fork_context: None,
+        step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
    }
 }

@@ -707,6 +707,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
            .map(crate::config::LspConfigToml::into_runtime),
        runtime_services: app.runtime_services.clone(),
        subagent_model_overrides: config.subagent_model_overrides(),
+        subagent_api_timeout: Duration::from_secs(config.subagent_api_timeout_secs()),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
        vision_config: config.vision_model_config(),
@@ -447,12 +447,14 @@ If you are upgrading from older releases:
  related persistent sub-agent sessions. Explicit tool `model` values win, then role/type
  overrides, then the parent runtime model. Supported convenience keys are
  `default_model`, `worker_model`, `explorer_model`, `awaiter_model`,
-  `review_model`, `custom_model`, and `max_concurrent`. The
+  `review_model`, `custom_model`, `max_concurrent`, and `api_timeout_secs`. The
  `[subagents] max_concurrent` value overrides top-level `max_subagents` and is
-  also clamped to `1..=20`. `[subagents.models]` accepts lower-case role or type
-  keys such as `worker`, `explorer`, `general`, `explore`, `plan`, and
-  `review`. Values must normalize to a supported DeepSeek model id before an
-  agent is spawned.
+  also clamped to `1..=20`; `[subagents] api_timeout_secs` controls the
+  per-step API timeout for sub-agent model calls and is clamped to `1..=1800`,
+  with `0` or unset preserving the legacy 120 second default.
+  `[subagents.models]` accepts lower-case role or type keys such as `worker`,
+  `explorer`, `general`, `explore`, `plan`, and `review`. Values must normalize
+  to a supported DeepSeek model id before an agent is spawned.
 - `skills_dir` (string, optional): defaults to `~/.deepseek/skills` (each skill is
  a directory containing `SKILL.md`). Workspace-local `.agents/skills` or
  `./skills` are preferred when present; the runtime also discovers global