From ad122fd6f8afe0ec17605496492ffe94cbd23a10 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Thu, 21 May 2026 00:45:04 +0800 Subject: [PATCH] feat(subagents): make step API timeout configurable --- config.example.toml | 1 + crates/tui/src/config.rs | 82 ++++++++++++++++++++++++++ crates/tui/src/core/engine.rs | 10 ++++ crates/tui/src/main.rs | 1 + crates/tui/src/runtime_threads.rs | 3 + crates/tui/src/tools/subagent/mod.rs | 30 +++++++++- crates/tui/src/tools/subagent/tests.rs | 14 +++++ crates/tui/src/tui/ui.rs | 1 + docs/CONFIGURATION.md | 12 ++-- 9 files changed, 146 insertions(+), 8 deletions(-) diff --git a/config.example.toml b/config.example.toml index 5e0762ed..a8b25e06 100644 --- a/config.example.toml +++ b/config.example.toml @@ -146,6 +146,7 @@ max_subagents = 10 # optional (1-20) # Optional sub-agent tuning. max_concurrent overrides top-level max_subagents. # [subagents] # max_concurrent = 10 +# api_timeout_secs = 120 # per-step API timeout, clamped to 1..=1800 # Optional managed policy paths (defaults to /etc/deepseek/*.toml on unix): # managed_config_path = "/etc/deepseek/managed_config.toml" diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index d366888a..04bc18e8 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -19,6 +19,18 @@ use crate::hooks::HooksConfig; pub const DEFAULT_MAX_SUBAGENTS: usize = 10; pub const MAX_SUBAGENTS: usize = 20; +/// Default per-step DeepSeek API timeout for sub-agent requests, in seconds. +/// Matches the legacy hardcoded value so existing configs keep their old +/// behavior when `[subagents] api_timeout_secs` is unset (#1806, #1808). +pub const DEFAULT_SUBAGENT_API_TIMEOUT_SECS: u64 = 120; +/// Minimum accepted `[subagents] api_timeout_secs`. Anything lower (including +/// `0`, which would otherwise produce an immediate timeout footgun) clamps +/// up to this value before the runtime sees it. +pub const MIN_SUBAGENT_API_TIMEOUT_SECS: u64 = 1; +/// Maximum accepted `[subagents] api_timeout_secs` (30 minutes). The cap +/// keeps a misconfigured per-step timeout from masking real model/network +/// hangs forever. +pub const MAX_SUBAGENT_API_TIMEOUT_SECS: u64 = 1800; pub const DEFAULT_TEXT_MODEL: &str = "deepseek-v4-pro"; pub const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com/beta"; pub const DEFAULT_NVIDIA_NIM_MODEL: &str = "deepseek-ai/deepseek-v4-pro"; @@ -879,6 +891,14 @@ pub struct SubagentsConfig { /// setting. Clamped to [1, MAX_SUBAGENTS]. #[serde(default)] pub max_concurrent: Option, + /// Per-step DeepSeek API timeout for sub-agent requests, in seconds. The + /// timeout wraps `client.create_message` so a stuck single step cannot + /// pin the parent's parent-completion wakeup channel indefinitely. + /// Defaults to `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120) and is clamped + /// to `MIN_SUBAGENT_API_TIMEOUT_SECS..=MAX_SUBAGENT_API_TIMEOUT_SECS` + /// (1..=1800). Zero or unset uses the legacy 120s default (#1806, #1808). + #[serde(default)] + pub api_timeout_secs: Option, } /// `[auto]` table — knobs for the `--model auto` / `/model auto` router. @@ -1815,6 +1835,27 @@ impl Config { .clamp(1, MAX_SUBAGENTS) } + /// Resolved per-step DeepSeek API timeout for sub-agents, in seconds. + /// + /// Reads `[subagents] api_timeout_secs` and clamps to + /// `[MIN_SUBAGENT_API_TIMEOUT_SECS, MAX_SUBAGENT_API_TIMEOUT_SECS]` + /// (1..=1800). `None` or `0` resolve to the legacy + /// `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120) so existing configs keep + /// their old behavior; explicit `1` is honored, useful only in fast + /// fail-fast tests, not production (#1806, #1808). + #[must_use] + pub fn subagent_api_timeout_secs(&self) -> u64 { + let raw = self + .subagents + .as_ref() + .and_then(|cfg| cfg.api_timeout_secs) + .unwrap_or(DEFAULT_SUBAGENT_API_TIMEOUT_SECS); + if raw == 0 { + return DEFAULT_SUBAGENT_API_TIMEOUT_SECS; + } + raw.clamp(MIN_SUBAGENT_API_TIMEOUT_SECS, MAX_SUBAGENT_API_TIMEOUT_SECS) + } + /// Raw sub-agent model override map. Values are validated at spawn time /// so an invalid role/type model fails before any partial agent spawn. #[must_use] @@ -3970,6 +4011,47 @@ mod tests { assert_eq!(high.max_subagents(), MAX_SUBAGENTS); } + #[test] + fn subagent_api_timeout_defaults_and_clamps() { + assert_eq!( + Config::default().subagent_api_timeout_secs(), + DEFAULT_SUBAGENT_API_TIMEOUT_SECS + ); + + let zero = Config { + subagents: Some(SubagentsConfig { + api_timeout_secs: Some(0), + ..SubagentsConfig::default() + }), + ..Config::default() + }; + assert_eq!( + zero.subagent_api_timeout_secs(), + DEFAULT_SUBAGENT_API_TIMEOUT_SECS + ); + + let explicit_min = Config { + subagents: Some(SubagentsConfig { + api_timeout_secs: Some(MIN_SUBAGENT_API_TIMEOUT_SECS), + ..SubagentsConfig::default() + }), + ..Config::default() + }; + assert_eq!(explicit_min.subagent_api_timeout_secs(), 1); + + let high = Config { + subagents: Some(SubagentsConfig { + api_timeout_secs: Some(MAX_SUBAGENT_API_TIMEOUT_SECS + 60), + ..SubagentsConfig::default() + }), + ..Config::default() + }; + assert_eq!( + high.subagent_api_timeout_secs(), + MAX_SUBAGENT_API_TIMEOUT_SECS + ); + } + #[test] fn save_api_key_writes_config_file_under_cfg_test() -> Result<()> { // `save_api_key` writes to the shared user config file. This diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 407e6346..1fe60032 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -166,6 +166,11 @@ pub struct EngineConfig { pub search_provider: crate::config::SearchProvider, /// API key for Tavily or Bocha. `None` for Bing or DuckDuckGo. pub search_api_key: Option, + /// Per-step DeepSeek API timeout for sub-agent `create_message` requests. + /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) + /// once at engine construction, then threaded onto every + /// `SubAgentRuntime` the engine builds (#1806, #1808). + pub subagent_api_timeout: Duration, } impl Default for EngineConfig { @@ -206,6 +211,9 @@ impl Default for EngineConfig { workshop: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + subagent_api_timeout: Duration::from_secs( + crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS, + ), } } } @@ -656,6 +664,7 @@ impl Engine { self.session.reasoning_effort_auto, ) .with_max_spawn_depth(self.config.max_spawn_depth) + .with_step_api_timeout(self.config.subagent_api_timeout) .background_runtime(); let route = resolve_subagent_assignment_route(&runtime, None, &prompt).await; runtime.model = route.model; @@ -1063,6 +1072,7 @@ impl Engine { self.session.reasoning_effort_auto, ) .with_max_spawn_depth(self.config.max_spawn_depth) + .with_step_api_timeout(self.config.subagent_api_timeout) .with_parent_completion_tx(self.tx_subagent_completion.clone()); if let Some(context) = fork_context_for_runtime.clone() { rt = rt.with_fork_context(context); diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index a261e02e..315fcd00 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -4679,6 +4679,7 @@ async fn run_exec_agent( lsp_config, runtime_services: crate::tools::spec::RuntimeToolServices::default(), subagent_model_overrides: config.subagent_model_overrides(), + subagent_api_timeout: std::time::Duration::from_secs(config.subagent_api_timeout_secs()), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), vision_config: config.vision_model_config(), diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 5ec3c8f6..138bd54f 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -1964,6 +1964,9 @@ impl RuntimeThreadManager { rlm_sessions: crate::rlm::session::new_shared_rlm_session_store(), }, subagent_model_overrides: self.config.subagent_model_overrides(), + subagent_api_timeout: std::time::Duration::from_secs( + self.config.subagent_api_timeout_secs(), + ), memory_enabled: self.config.memory_enabled(), memory_path: self.config.memory_path(), vision_config: self.config.vision_model_config(), diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs index b62d252b..8de4c327 100644 --- a/crates/tui/src/tools/subagent/mod.rs +++ b/crates/tui/src/tools/subagent/mod.rs @@ -67,7 +67,13 @@ const TOOL_TIMEOUT: Duration = Duration::from_secs(30); /// Per-step LLM API call timeout. Each `create_message` request must complete /// within this window or the step is treated as timed out. Prevents a single /// stuck API call from blocking the sub-agent indefinitely. -const STEP_API_TIMEOUT: Duration = Duration::from_secs(120); +/// Legacy fallback for the per-step DeepSeek API timeout. The active timeout +/// now travels on `SubAgentRuntime::step_api_timeout` so users can override +/// it via `[subagents] api_timeout_secs` in `~/.deepseek/config.toml`. The +/// constant only exists for tests/stub runtimes that need a hard-coded +/// default; production runtimes set the field explicitly (#1806, #1808). +const DEFAULT_STEP_API_TIMEOUT: Duration = + Duration::from_secs(crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS); const RESULT_POLL_INTERVAL: Duration = Duration::from_millis(250); const DEFAULT_RESULT_TIMEOUT_MS: u64 = 30_000; #[allow(dead_code)] // Legacy agent_wait clamp; new agent_eval uses DEFAULT/MAX. @@ -643,6 +649,12 @@ pub struct SubAgentRuntime { pub parent_completion_tx: Option>, /// Snapshot of the request prefix visible to an opt-in forked child. pub fork_context: Option, + /// Per-step DeepSeek API timeout for the child's `create_message` call. + /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) at + /// engine construction so a slow but legitimate model turn does not + /// false-timeout the child mid-thinking. `child_runtime()` and + /// `background_runtime()` preserve the parent's value (#1806, #1808). + pub step_api_timeout: Duration, } impl SubAgentRuntime { @@ -676,9 +688,20 @@ impl SubAgentRuntime { mailbox: None, parent_completion_tx: None, fork_context: None, + step_api_timeout: DEFAULT_STEP_API_TIMEOUT, } } + /// Override the per-step DeepSeek API timeout (default + /// `DEFAULT_STEP_API_TIMEOUT`). Called by the engine after reading + /// `[subagents] api_timeout_secs`. Tests may use this to fail fast + /// without waiting the legacy 120 seconds (#1806, #1808). + #[must_use] + pub fn with_step_api_timeout(mut self, timeout: Duration) -> Self { + self.step_api_timeout = timeout; + self + } + /// Attach the wakeup channel so the engine's parent turn loop can resume /// when this runtime's direct children finish (issue #756). The channel /// is propagated to descendants via clone, but only `spawn_depth == 1` @@ -799,6 +822,7 @@ impl SubAgentRuntime { mailbox: self.mailbox.clone(), parent_completion_tx: self.parent_completion_tx.clone(), fork_context: self.fork_context.clone(), + step_api_timeout: self.step_api_timeout, } } @@ -3534,8 +3558,8 @@ async fn run_subagent( from_prior_session: false, }); } - api = tokio::time::timeout(STEP_API_TIMEOUT, runtime.client.create_message(request)) => { - api.map_err(|_| anyhow!("API call timed out after {}s", STEP_API_TIMEOUT.as_secs()))?? + api = tokio::time::timeout(runtime.step_api_timeout, runtime.client.create_message(request)) => { + api.map_err(|_| anyhow!("API call timed out after {}s", runtime.step_api_timeout.as_secs()))?? } }; diff --git a/crates/tui/src/tools/subagent/tests.rs b/crates/tui/src/tools/subagent/tests.rs index 7e228f41..3f4f13e3 100644 --- a/crates/tui/src/tools/subagent/tests.rs +++ b/crates/tui/src/tools/subagent/tests.rs @@ -1164,6 +1164,7 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() { parent.context.auto_approve = false; // parent in suggest mode let child = parent.child_runtime(); assert_eq!(child.spawn_depth, 2, "child depth = parent + 1"); + assert_eq!(child.step_api_timeout, DEFAULT_STEP_API_TIMEOUT); assert!( !child.context.auto_approve, "child must inherit parent approval state" @@ -1178,6 +1179,18 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() { ); } +#[test] +fn child_and_background_runtimes_preserve_step_api_timeout() { + let timeout = Duration::from_secs(7); + let parent = stub_runtime().with_step_api_timeout(timeout); + + let child = parent.child_runtime(); + assert_eq!(child.step_api_timeout, timeout); + + let background = parent.background_runtime(); + assert_eq!(background.step_api_timeout, timeout); +} + #[tokio::test] async fn subagent_registry_blocks_approval_tools_without_parent_auto_approve() { let mut runtime = stub_runtime(); @@ -1434,6 +1447,7 @@ fn stub_runtime() -> SubAgentRuntime { mailbox: None, parent_completion_tx: None, fork_context: None, + step_api_timeout: DEFAULT_STEP_API_TIMEOUT, } } diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index 203073c0..58066cb0 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -707,6 +707,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig { .map(crate::config::LspConfigToml::into_runtime), runtime_services: app.runtime_services.clone(), subagent_model_overrides: config.subagent_model_overrides(), + subagent_api_timeout: Duration::from_secs(config.subagent_api_timeout_secs()), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), vision_config: config.vision_model_config(), diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index a0052c89..498a20be 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -447,12 +447,14 @@ If you are upgrading from older releases: related persistent sub-agent sessions. Explicit tool `model` values win, then role/type overrides, then the parent runtime model. Supported convenience keys are `default_model`, `worker_model`, `explorer_model`, `awaiter_model`, - `review_model`, `custom_model`, and `max_concurrent`. The + `review_model`, `custom_model`, `max_concurrent`, and `api_timeout_secs`. The `[subagents] max_concurrent` value overrides top-level `max_subagents` and is - also clamped to `1..=20`. `[subagents.models]` accepts lower-case role or type - keys such as `worker`, `explorer`, `general`, `explore`, `plan`, and - `review`. Values must normalize to a supported DeepSeek model id before an - agent is spawned. + also clamped to `1..=20`; `[subagents] api_timeout_secs` controls the + per-step API timeout for sub-agent model calls and is clamped to `1..=1800`, + with `0` or unset preserving the legacy 120 second default. + `[subagents.models]` accepts lower-case role or type keys such as `worker`, + `explorer`, `general`, `explore`, `plan`, and `review`. Values must normalize + to a supported DeepSeek model id before an agent is spawned. - `skills_dir` (string, optional): defaults to `~/.deepseek/skills` (each skill is a directory containing `SKILL.md`). Workspace-local `.agents/skills` or `./skills` are preferred when present; the runtime also discovers global