feat(subagents): make step API timeout configurable

This commit is contained in:
Hunter Bown
2026-05-21 00:45:04 +08:00
parent 2c642ec375
commit ad122fd6f8
9 changed files with 146 additions and 8 deletions
+1
View File
@@ -146,6 +146,7 @@ max_subagents = 10 # optional (1-20)
# Optional sub-agent tuning. max_concurrent overrides top-level max_subagents.
# [subagents]
# max_concurrent = 10
# api_timeout_secs = 120 # per-step API timeout, clamped to 1..=1800
# Optional managed policy paths (defaults to /etc/deepseek/*.toml on unix):
# managed_config_path = "/etc/deepseek/managed_config.toml"
+82
View File
@@ -19,6 +19,18 @@ use crate::hooks::HooksConfig;
pub const DEFAULT_MAX_SUBAGENTS: usize = 10;
pub const MAX_SUBAGENTS: usize = 20;
/// Default per-step DeepSeek API timeout for sub-agent requests, in seconds.
/// Matches the legacy hardcoded value so existing configs keep their old
/// behavior when `[subagents] api_timeout_secs` is unset or `0`
/// (#1806, #1808).
pub const DEFAULT_SUBAGENT_API_TIMEOUT_SECS: u64 = 120;
/// Minimum accepted `[subagents] api_timeout_secs`; the lower bound of the
/// clamp applied by `Config::subagent_api_timeout_secs`. Note that `0` is
/// NOT clamped up to this value: it is treated like an unset key and
/// resolves to `DEFAULT_SUBAGENT_API_TIMEOUT_SECS`, which avoids the
/// immediate-timeout footgun a literal zero would otherwise produce.
pub const MIN_SUBAGENT_API_TIMEOUT_SECS: u64 = 1;
/// Maximum accepted `[subagents] api_timeout_secs` (30 minutes). The cap
/// keeps a misconfigured per-step timeout from masking real model/network
/// hangs forever.
pub const MAX_SUBAGENT_API_TIMEOUT_SECS: u64 = 1800;
pub const DEFAULT_TEXT_MODEL: &str = "deepseek-v4-pro";
pub const DEFAULT_DEEPSEEK_BASE_URL: &str = "https://api.deepseek.com/beta";
pub const DEFAULT_NVIDIA_NIM_MODEL: &str = "deepseek-ai/deepseek-v4-pro";
@@ -879,6 +891,14 @@ pub struct SubagentsConfig {
/// setting. Clamped to [1, MAX_SUBAGENTS].
#[serde(default)]
pub max_concurrent: Option<usize>,
/// Per-step DeepSeek API timeout for sub-agent requests, in seconds. The
/// timeout wraps `client.create_message` so a stuck single step cannot
/// pin the parent's parent-completion wakeup channel indefinitely.
/// Defaults to `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120) and is clamped
/// to `MIN_SUBAGENT_API_TIMEOUT_SECS..=MAX_SUBAGENT_API_TIMEOUT_SECS`
/// (1..=1800). Zero or unset uses the legacy 120s default (#1806, #1808).
#[serde(default)]
pub api_timeout_secs: Option<u64>,
}
/// `[auto]` table — knobs for the `--model auto` / `/model auto` router.
@@ -1815,6 +1835,27 @@ impl Config {
.clamp(1, MAX_SUBAGENTS)
}
/// Effective per-step DeepSeek API timeout for sub-agents, in seconds.
///
/// Looks up `[subagents] api_timeout_secs`. A missing `[subagents]` table,
/// a missing key, or an explicit `0` all yield the legacy
/// `DEFAULT_SUBAGENT_API_TIMEOUT_SECS` (120), so configs written before the
/// knob existed behave exactly as they used to. Every other value is
/// clamped into
/// `MIN_SUBAGENT_API_TIMEOUT_SECS..=MAX_SUBAGENT_API_TIMEOUT_SECS`
/// (1..=1800). An explicit `1` is honored as-is; that is only sensible for
/// fail-fast tests, not production (#1806, #1808).
#[must_use]
pub fn subagent_api_timeout_secs(&self) -> u64 {
    let configured = self.subagents.as_ref().and_then(|cfg| cfg.api_timeout_secs);
    match configured {
        // Unset and zero both mean "use the legacy default".
        None | Some(0) => DEFAULT_SUBAGENT_API_TIMEOUT_SECS,
        Some(secs) => secs.clamp(
            MIN_SUBAGENT_API_TIMEOUT_SECS,
            MAX_SUBAGENT_API_TIMEOUT_SECS,
        ),
    }
}
/// Raw sub-agent model override map. Values are validated at spawn time
/// so an invalid role/type model fails before any partial agent spawn.
#[must_use]
@@ -3970,6 +4011,47 @@ mod tests {
assert_eq!(high.max_subagents(), MAX_SUBAGENTS);
}
#[test]
fn subagent_api_timeout_defaults_and_clamps() {
    // Builds a config whose `[subagents]` table sets only the API timeout;
    // every other field keeps its default.
    let config_with_timeout = |secs: u64| Config {
        subagents: Some(SubagentsConfig {
            api_timeout_secs: Some(secs),
            ..SubagentsConfig::default()
        }),
        ..Config::default()
    };

    // No `[subagents]` override at all: legacy default.
    assert_eq!(
        Config::default().subagent_api_timeout_secs(),
        DEFAULT_SUBAGENT_API_TIMEOUT_SECS
    );

    // Explicit zero is treated like "unset", not as an instant timeout.
    assert_eq!(
        config_with_timeout(0).subagent_api_timeout_secs(),
        DEFAULT_SUBAGENT_API_TIMEOUT_SECS
    );

    // The smallest accepted value passes through untouched.
    assert_eq!(
        config_with_timeout(MIN_SUBAGENT_API_TIMEOUT_SECS).subagent_api_timeout_secs(),
        1
    );

    // Anything above the cap is pulled back down to the cap.
    assert_eq!(
        config_with_timeout(MAX_SUBAGENT_API_TIMEOUT_SECS + 60).subagent_api_timeout_secs(),
        MAX_SUBAGENT_API_TIMEOUT_SECS
    );
}
#[test]
fn save_api_key_writes_config_file_under_cfg_test() -> Result<()> {
// `save_api_key` writes to the shared user config file. This
+10
View File
@@ -166,6 +166,11 @@ pub struct EngineConfig {
pub search_provider: crate::config::SearchProvider,
/// API key for Tavily or Bocha. `None` for Bing or DuckDuckGo.
pub search_api_key: Option<String>,
/// Per-step DeepSeek API timeout for sub-agent `create_message` requests.
/// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800)
/// once at engine construction, then threaded onto every
/// `SubAgentRuntime` the engine builds (#1806, #1808).
pub subagent_api_timeout: Duration,
}
impl Default for EngineConfig {
@@ -206,6 +211,9 @@ impl Default for EngineConfig {
workshop: None,
search_provider: crate::config::SearchProvider::default(),
search_api_key: None,
subagent_api_timeout: Duration::from_secs(
crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS,
),
}
}
}
@@ -656,6 +664,7 @@ impl Engine {
self.session.reasoning_effort_auto,
)
.with_max_spawn_depth(self.config.max_spawn_depth)
.with_step_api_timeout(self.config.subagent_api_timeout)
.background_runtime();
let route = resolve_subagent_assignment_route(&runtime, None, &prompt).await;
runtime.model = route.model;
@@ -1063,6 +1072,7 @@ impl Engine {
self.session.reasoning_effort_auto,
)
.with_max_spawn_depth(self.config.max_spawn_depth)
.with_step_api_timeout(self.config.subagent_api_timeout)
.with_parent_completion_tx(self.tx_subagent_completion.clone());
if let Some(context) = fork_context_for_runtime.clone() {
rt = rt.with_fork_context(context);
+1
View File
@@ -4679,6 +4679,7 @@ async fn run_exec_agent(
lsp_config,
runtime_services: crate::tools::spec::RuntimeToolServices::default(),
subagent_model_overrides: config.subagent_model_overrides(),
subagent_api_timeout: std::time::Duration::from_secs(config.subagent_api_timeout_secs()),
memory_enabled: config.memory_enabled(),
memory_path: config.memory_path(),
vision_config: config.vision_model_config(),
+3
View File
@@ -1964,6 +1964,9 @@ impl RuntimeThreadManager {
rlm_sessions: crate::rlm::session::new_shared_rlm_session_store(),
},
subagent_model_overrides: self.config.subagent_model_overrides(),
subagent_api_timeout: std::time::Duration::from_secs(
self.config.subagent_api_timeout_secs(),
),
memory_enabled: self.config.memory_enabled(),
memory_path: self.config.memory_path(),
vision_config: self.config.vision_model_config(),
+27 -3
View File
@@ -67,7 +67,13 @@ const TOOL_TIMEOUT: Duration = Duration::from_secs(30);
/// Per-step LLM API call timeout. Each `create_message` request must complete
/// within this window or the step is treated as timed out. Prevents a single
/// stuck API call from blocking the sub-agent indefinitely.
const STEP_API_TIMEOUT: Duration = Duration::from_secs(120);
/// Legacy fallback for the per-step DeepSeek API timeout. The active timeout
/// now travels on `SubAgentRuntime::step_api_timeout` so users can override
/// it via `[subagents] api_timeout_secs` in `~/.deepseek/config.toml`. The
/// constant only exists for tests/stub runtimes that need a hard-coded
/// default; production runtimes set the field explicitly (#1806, #1808).
const DEFAULT_STEP_API_TIMEOUT: Duration =
Duration::from_secs(crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS);
const RESULT_POLL_INTERVAL: Duration = Duration::from_millis(250);
const DEFAULT_RESULT_TIMEOUT_MS: u64 = 30_000;
#[allow(dead_code)] // Legacy agent_wait clamp; new agent_eval uses DEFAULT/MAX.
@@ -643,6 +649,12 @@ pub struct SubAgentRuntime {
pub parent_completion_tx: Option<mpsc::UnboundedSender<SubAgentCompletion>>,
/// Snapshot of the request prefix visible to an opt-in forked child.
pub fork_context: Option<SubAgentForkContext>,
/// Per-step DeepSeek API timeout for the child's `create_message` call.
/// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) at
/// engine construction so a slow but legitimate model turn does not
/// false-timeout the child mid-thinking. `child_runtime()` and
/// `background_runtime()` preserve the parent's value (#1806, #1808).
pub step_api_timeout: Duration,
}
impl SubAgentRuntime {
@@ -676,9 +688,20 @@ impl SubAgentRuntime {
mailbox: None,
parent_completion_tx: None,
fork_context: None,
step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
}
}
/// Builder-style setter for the per-step DeepSeek API timeout.
///
/// A freshly constructed runtime starts at `DEFAULT_STEP_API_TIMEOUT`; the
/// engine calls this with the value resolved from
/// `[subagents] api_timeout_secs`. Tests can pass a short duration to fail
/// fast instead of waiting out the legacy 120 seconds (#1806, #1808).
#[must_use]
pub fn with_step_api_timeout(self, timeout: Duration) -> Self {
    let mut runtime = self;
    runtime.step_api_timeout = timeout;
    runtime
}
/// Attach the wakeup channel so the engine's parent turn loop can resume
/// when this runtime's direct children finish (issue #756). The channel
/// is propagated to descendants via clone, but only `spawn_depth == 1`
@@ -799,6 +822,7 @@ impl SubAgentRuntime {
mailbox: self.mailbox.clone(),
parent_completion_tx: self.parent_completion_tx.clone(),
fork_context: self.fork_context.clone(),
step_api_timeout: self.step_api_timeout,
}
}
@@ -3534,8 +3558,8 @@ async fn run_subagent(
from_prior_session: false,
});
}
api = tokio::time::timeout(STEP_API_TIMEOUT, runtime.client.create_message(request)) => {
api.map_err(|_| anyhow!("API call timed out after {}s", STEP_API_TIMEOUT.as_secs()))??
api = tokio::time::timeout(runtime.step_api_timeout, runtime.client.create_message(request)) => {
api.map_err(|_| anyhow!("API call timed out after {}s", runtime.step_api_timeout.as_secs()))??
}
};
+14
View File
@@ -1164,6 +1164,7 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() {
parent.context.auto_approve = false; // parent in suggest mode
let child = parent.child_runtime();
assert_eq!(child.spawn_depth, 2, "child depth = parent + 1");
assert_eq!(child.step_api_timeout, DEFAULT_STEP_API_TIMEOUT);
assert!(
!child.context.auto_approve,
"child must inherit parent approval state"
@@ -1178,6 +1179,18 @@ fn child_runtime_increments_depth_and_preserves_auto_approve() {
);
}
#[test]
fn child_and_background_runtimes_preserve_step_api_timeout() {
    // Use a value that differs from the default so inheritance is observable.
    let custom_timeout = Duration::from_secs(7);
    let parent = stub_runtime().with_step_api_timeout(custom_timeout);

    assert_eq!(
        parent.child_runtime().step_api_timeout,
        custom_timeout,
        "child runtime must inherit the parent's step API timeout"
    );
    assert_eq!(
        parent.background_runtime().step_api_timeout,
        custom_timeout,
        "background runtime must inherit the parent's step API timeout"
    );
}
#[tokio::test]
async fn subagent_registry_blocks_approval_tools_without_parent_auto_approve() {
let mut runtime = stub_runtime();
@@ -1434,6 +1447,7 @@ fn stub_runtime() -> SubAgentRuntime {
mailbox: None,
parent_completion_tx: None,
fork_context: None,
step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
}
}
+1
View File
@@ -707,6 +707,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
.map(crate::config::LspConfigToml::into_runtime),
runtime_services: app.runtime_services.clone(),
subagent_model_overrides: config.subagent_model_overrides(),
subagent_api_timeout: Duration::from_secs(config.subagent_api_timeout_secs()),
memory_enabled: config.memory_enabled(),
memory_path: config.memory_path(),
vision_config: config.vision_model_config(),
+7 -5
View File
@@ -447,12 +447,14 @@ If you are upgrading from older releases:
related persistent sub-agent sessions. Explicit tool `model` values win, then role/type
overrides, then the parent runtime model. Supported convenience keys are
`default_model`, `worker_model`, `explorer_model`, `awaiter_model`,
`review_model`, `custom_model`, and `max_concurrent`. The
`review_model`, `custom_model`, `max_concurrent`, and `api_timeout_secs`. The
`[subagents] max_concurrent` value overrides top-level `max_subagents` and is
also clamped to `1..=20`. `[subagents.models]` accepts lower-case role or type
keys such as `worker`, `explorer`, `general`, `explore`, `plan`, and
`review`. Values must normalize to a supported DeepSeek model id before an
agent is spawned.
also clamped to `1..=20`; `[subagents] api_timeout_secs` controls the
per-step API timeout for sub-agent model calls and is clamped to `1..=1800`,
with `0` or unset preserving the legacy 120 second default.
`[subagents.models]` accepts lower-case role or type keys such as `worker`,
`explorer`, `general`, `explore`, `plan`, and `review`. Values must normalize
to a supported DeepSeek model id before an agent is spawned.
- `skills_dir` (string, optional): defaults to `~/.deepseek/skills` (each skill is
a directory containing `SKILL.md`). Workspace-local `.agents/skills` or
`./skills` are preferred when present; the runtime also discovers global