From b023d54c5d253a0e47e64857b34f7d42dc99a734 Mon Sep 17 00:00:00 2001
From: LinQ <linzhiqin2003@users.noreply.github.com>
Date: Sun, 10 May 2026 18:24:10 +0100
Subject: [PATCH] feat(prompts): XML-tagged execution discipline + tool-use
 enforcement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two structured sections at the end of base.md, leaving every
existing byte ahead of them untouched so DeepSeek's automatic prefix
cache stays stable:

1. Execution discipline — five XML-tagged blocks:
   <tool_persistence>, <mandatory_tool_use>, <act_dont_ask>,
   <verification>, <missing_context>. They turn the existing
   "Preamble Rhythm" guidance into concrete, tool-aware rules:
   when V4 must reach for a tool instead of answering from memory,
   when to keep iterating, when to verify, when to name a missing
   gap. The ## Language and ## Output formatting sections still come
   first, so prefix-cache hits for already-running sessions are
   preserved.

2. Tool-use enforcement — a binary acceptance test ("every response
   = tool calls OR final result"). Closes the failure mode where V4
   narrates "I will run the tests" without actually firing the
   exec_shell call.

Also drops the five legacy `*_system_prompt()` facade functions
(`base_system_prompt`, `normal_system_prompt`, `agent_system_prompt`,
`yolo_system_prompt`, `plan_system_prompt`) — repo-wide grep confirms
they have no callers outside their own definition. The companion
.txt template files and `*_PROMPT` constants are out of scope here
and are being retired in #1379 / #1382 separately so the diffs stay
review-friendly.

Two new tests pin the contract:
- `base_prompt_carries_execution_discipline_block` — every required
  XML tag and the enforcement clause are present in BASE_PROMPT
- `execution_discipline_is_at_the_end_for_cache_stability` — the
  block sits after `## Language` so the cached prefix doesn't shift

Refs #718
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/prompts.rs      | 60 +++++++++++++++++++++-------------
 crates/tui/src/prompts/base.md | 38 +++++++++++++++++++++
 2 files changed, 76 insertions(+), 22 deletions(-)
diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs
index aa653b73..ede76c31 100644
--- a/crates/tui/src/prompts.rs
+++ b/crates/tui/src/prompts.rs
@@ -493,28 +493,6 @@ pub fn build_system_prompt(base: &str, project_context: Option<&ProjectContext>)
     SystemPrompt::Text(full_prompt)
 }
 
-// ── Legacy functions for backwards compatibility ──────────────────────
-
-pub fn base_system_prompt() -> SystemPrompt {
-    SystemPrompt::Text(BASE_PROMPT.trim().to_string())
-}
-
-pub fn normal_system_prompt() -> SystemPrompt {
-    system_prompt_for_mode(AppMode::Agent)
-}
-
-pub fn agent_system_prompt() -> SystemPrompt {
-    system_prompt_for_mode(AppMode::Agent)
-}
-
-pub fn yolo_system_prompt() -> SystemPrompt {
-    system_prompt_for_mode(AppMode::Yolo)
-}
-
-pub fn plan_system_prompt() -> SystemPrompt {
-    system_prompt_for_mode(AppMode::Plan)
-}
-
 #[cfg(test)]
 mod tests {
     // Don't assert on prose. If you wouldn't fail a code review for
@@ -526,6 +504,44 @@ mod tests {
     /// agent prompt's own discussion of the convention).
     const HANDOFF_BLOCK_MARKER: &str = "left a handoff at `.deepseek/handoff.md`";
 
+    #[test]
+    fn base_prompt_carries_execution_discipline_block() {
+        // The XML-tagged execution-discipline block is the contract —
+        // verify each section is present and closed so reviewers can't quietly
+        // strip or truncate the rules that herd V4 toward acting, not narrating.
+        for tag in [
+            "tool_persistence",
+            "mandatory_tool_use",
+            "act_dont_ask",
+            "verification",
+            "missing_context",
+        ] {
+            let open = BASE_PROMPT.find(&format!("<{tag}>"));
+            let close = BASE_PROMPT.find(&format!("</{tag}>"));
+            // `Some(_) < None` is false, so a missing closing tag fails here too.
+            assert!(open.is_some() && open < close, "BASE_PROMPT missing or unclosed <{tag}>");
+        }
+        assert!(
+            BASE_PROMPT.contains("Tool-use enforcement"),
+            "BASE_PROMPT missing the tool-use enforcement clause"
+        );
+    }
+
+    #[test]
+    fn execution_discipline_is_at_the_end_for_cache_stability() {
+        // DeepSeek's prefix cache keys on a leading byte-stable run, so the new sections
+        // must be appended. NOTE(review): this only asserts they follow `## Language`.
+        let body = BASE_PROMPT;
+        let persistence_at = body
+            .find("<tool_persistence>")
+            .expect("tool_persistence anchor present");
+        let language_at = body.find("## Language").expect("Language anchor present");
+        assert!(
+            language_at < persistence_at,
+            "execution-discipline block must come after the early sections"
+        );
+    }
+
     #[test]
     fn render_environment_block_lists_supplied_locale_and_workspace() {
         let tmp = tempdir().expect("tempdir");
diff --git a/crates/tui/src/prompts/base.md b/crates/tui/src/prompts/base.md
index 6f248541..d6ec80fb 100644
--- a/crates/tui/src/prompts/base.md
+++ b/crates/tui/src/prompts/base.md
@@ -201,3 +201,41 @@ You're rendering into a terminal, not a browser. Markdown tables almost never re
 - **Definition-style lists** (`- **Label**: value`) when the user asked for a comparison or summary.
 
 If you genuinely need column-aligned data (e.g. the user asked for a table or for `/cost` style output), keep columns narrow, ASCII-only, and limit to 2–3 columns. Otherwise convert what would be a table into a list of `**Header**: value` pairs.
+
+## Execution discipline
+
+<tool_persistence>
+- Use tools whenever they improve correctness, completeness, or grounding.
+- Do not stop early when another tool call would materially improve the result.
+- If a tool returns empty or partial results, retry with a different query or strategy before giving up.
+- Keep calling tools until: (1) the task is complete, AND (2) you have verified the result.
+</tool_persistence>
+
+<mandatory_tool_use>
+NEVER answer these from memory or mental computation — ALWAYS use a tool:
+- Arithmetic, math, calculations → `exec_shell` (e.g. `python3 -c '…'`)
+- Hashes, encodings, checksums → `exec_shell` (e.g. `sha256sum`, `base64`)
+- Current time, date, timezone → `exec_shell` (e.g. `date`)
+- System state: OS, CPU, memory, disk, ports, processes → `exec_shell`
+- File contents, sizes, line counts → `read_file` or `grep_files`
+- Symbol or pattern search across the workspace → `grep_files`
+- Filename search → `file_search`
+</mandatory_tool_use>
+
+<act_dont_ask>
+When a request has an obvious default interpretation, act on it immediately instead of asking for clarification. Save clarification for genuinely ambiguous requests.
+</act_dont_ask>
+
+<verification>
+After making changes, verify them: read back the file you wrote, run the test you fixed, fetch the URL you posted to. Don't claim success on faith.
+</verification>
+
+<missing_context>
+If you need context (a file you haven't read, a variable's current value, an external URL), name the gap and fetch it before proceeding.
+</missing_context>
+
+## Tool-use enforcement
+
+You MUST use your tools to take action — do not describe what you would do or plan to do without actually doing it. When you say you will perform an action ("I will run the tests", "Let me check the file", "I will create the project"), you MUST immediately make the corresponding tool call in the same response. Never end your turn with a promise of future action — execute it now.
+
+Every response must either (a) contain tool calls that make progress, or (b) deliver a final result to the user — or, for a genuinely ambiguous request, the clarifying question it needs. Responses that only describe intentions without acting are not acceptable.