fix: gate task_shell_start behind allow_shell like exec_shell

task_shell_start delegates to ExecShellTool, providing the same shell execution capability as exec_shell. Previously, task_shell_start was registered unconditionally in with_runtime_task_tools while exec_shell was gated behind allow_shell, creating an inconsistent security gate. This caused the model to try exec_shell first, fail, then fall back to task_shell_start — wasting tokens and bypassing the intended security boundary. Split TaskShellStartTool and TaskShellWaitTool out of with_runtime_task_tools into a new with_runtime_task_shell_tools method, and gate both behind the allow_shell check in with_agent_tools. Closes #2303
2026-05-31 11:51:14 +08:00
parent fc9a32be1a
commit ed81d13782
3 changed files with 47 additions and 6 deletions
@@ -85,7 +85,7 @@ impl Engine {
            && self.config.features.enabled(Feature::ShellTool)
            && self.session.allow_shell
        {
-            builder = builder.with_shell_tools();
+            builder = builder.with_shell_tools().with_runtime_task_shell_tools();
        }

        // Register the `remember` tool only when the user has opted in to
@@ -542,6 +542,10 @@ impl ToolRegistryBuilder {
    }

    /// Include durable task, gate, PR-attempt, GitHub, and automation tools.
+    ///
+    /// Shell-related task tools (`task_shell_start`, `task_shell_wait`) are
+    /// *not* included here — use [`with_runtime_task_shell_tools`] to register
+    /// them when `allow_shell` is true.
    #[must_use]
    pub fn with_runtime_task_tools(self) -> Self {
        use super::automation::{
@@ -555,7 +559,6 @@ impl ToolRegistryBuilder {
        use super::tasks::{
            PrAttemptListTool, PrAttemptPreflightTool, PrAttemptReadTool, PrAttemptRecordTool,
            TaskCancelTool, TaskCreateTool, TaskGateRunTool, TaskListTool, TaskReadTool,
-            TaskShellStartTool, TaskShellWaitTool,
        };

        self.with_tool(Arc::new(TaskCreateTool))
@@ -563,8 +566,6 @@ impl ToolRegistryBuilder {
            .with_tool(Arc::new(TaskReadTool))
            .with_tool(Arc::new(TaskCancelTool))
            .with_tool(Arc::new(TaskGateRunTool))
-            .with_tool(Arc::new(TaskShellStartTool))
-            .with_tool(Arc::new(TaskShellWaitTool))
            .with_tool(Arc::new(GithubIssueContextTool))
            .with_tool(Arc::new(GithubPrContextTool))
            .with_tool(Arc::new(PrAttemptRecordTool))
@@ -584,6 +585,18 @@ impl ToolRegistryBuilder {
            .with_tool(Arc::new(GithubClosePrTool))
    }

+    /// Include shell-related task tools (`task_shell_start`, `task_shell_wait`).
+    ///
+    /// These are gated behind `allow_shell` because `task_shell_start`
+    /// delegates directly to `ExecShellTool`, providing the same shell
+    /// execution capability as `exec_shell`.
+    #[must_use]
+    pub fn with_runtime_task_shell_tools(self) -> Self {
+        use super::tasks::{TaskShellStartTool, TaskShellWaitTool};
+        self.with_tool(Arc::new(TaskShellStartTool))
+            .with_tool(Arc::new(TaskShellWaitTool))
+    }
+
    /// Include only read-only durable task, PR-attempt, GitHub, and automation
    /// inspection tools. Plan mode uses this surface so it can observe state
    /// without starting work, changing remotes, or mutating automation config.
@@ -786,7 +799,7 @@ impl ToolRegistryBuilder {
            .with_image_ocr_tools();

        if allow_shell {
-            builder.with_shell_tools()
+            builder.with_shell_tools().with_runtime_task_shell_tools()
        } else {
            builder
        }
@@ -1379,4 +1392,32 @@ mod tests {

        assert!(registry.contains("finance"));
    }
+
+    #[test]
+    fn agent_tools_with_allow_shell_false_excludes_shell_tools() {
+        let tmp = tempdir().expect("tempdir");
+        let ctx = ToolContext::new(tmp.path().to_path_buf());
+
+        let registry = ToolRegistryBuilder::new()
+            .with_agent_tools(false)
+            .build(ctx);
+
+        assert!(!registry.contains("exec_shell"), "exec_shell should be excluded when allow_shell is false");
+        assert!(!registry.contains("task_shell_start"), "task_shell_start should be excluded when allow_shell is false");
+        assert!(!registry.contains("task_shell_wait"), "task_shell_wait should be excluded when allow_shell is false");
+    }
+
+    #[test]
+    fn agent_tools_with_allow_shell_true_includes_shell_tools() {
+        let tmp = tempdir().expect("tempdir");
+        let ctx = ToolContext::new(tmp.path().to_path_buf());
+
+        let registry = ToolRegistryBuilder::new()
+            .with_agent_tools(true)
+            .build(ctx);
+
+        assert!(registry.contains("exec_shell"), "exec_shell should be included when allow_shell is true");
+        assert!(registry.contains("task_shell_start"), "task_shell_start should be included when allow_shell is true");
+        assert!(registry.contains("task_shell_wait"), "task_shell_wait should be included when allow_shell is true");
+    }
 }
@@ -19,7 +19,7 @@ Run `/mode` to open the mode picker, or switch directly with `/mode agent`,
 `/mode plan`, `/mode yolo`, `/mode 1`, `/mode 2`, or `/mode 3`.

 - **Plan**: design-first prompting. Read-only investigation tools stay available; shell and patch execution stay off. Use this when you want to think out loud and produce a plan to hand to a human (yourself later, or a reviewer).
- **Agent**: multi-step tool use. Approvals for shell and paid tools (file writes are allowed without a prompt).
+- **Agent**: multi-step tool use. Shell execution (`exec_shell`, `task_shell_start`) requires `allow_shell = true` in config; approval prompts gate each call. File writes are allowed without a prompt.
 - **YOLO**: enables shell + trust mode and auto-approves all tools. Use only in trusted repos.

 All action-capable modes have access to persistent RLM sessions through `rlm_open`, `rlm_eval`, `rlm_configure`, and `rlm_close`. Inside an RLM Python REPL, `sub_query_batch` fans out 1-16 cheap parallel child calls pinned to `deepseek-v4-flash`. The model reaches for it when work is too large or repetitive for the parent transcript.