From 99c6b22e83c5d72e039c64ca100a9764354cfeeb Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Tue, 12 May 2026 19:54:08 -0500 Subject: [PATCH] =?UTF-8?q?chore(release):=20v0.8.33=20=E2=80=94=20sub-age?= =?UTF-8?q?nt=20and=20RLM=20renovation=20with=20persistent=20sessions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Persistent RLM sessions (rlm_open/rlm_eval/rlm_close) with bounded REPL helpers - Fork-aware sub-agent sessions (agent_open/agent_eval/agent_close) with handle_read - Shared handle_read storage with slice/range/count/JSONPath projections - Slash-command routing: /rlm, /agent, /relay (/接力) for handoff prompts - Sidebar renamed to "Work" tab, consistent across Plan/Agent/YOLO modes - Tool papercuts: file_search excludes, grep_files strings, fetch_url JSON, edit_file fuzz, exec_shell merged stdout/stderr, revert_turn no-op reject - CLI reasoning-effort honoured on non-auto exec routes (#1511 @h3c-hexin) - Edit-file replacement boundaries clarified (#1516) - Pandoc output validated before probing (#1523) - Running turns steerable/repaintable (#1533, #1537) - Tasks/Activity Detail calmer under load - npm retry timeout hint (#1538 @reidliu41) - Issue templates improved (#1525 @reidliu41) - Shell: kill process group to prevent UI freeze (#828 @CrepuscularIRIS) - TUI: ignore leaked SGR mouse reports in composer (#1421 @reidliu41) - Footer: keep chips within available width (#1417 @Wenjunyun123) - Session picker: scope Ctrl+R to current workspace (#1395 @LinQ) - Removed stale competitive-analysis doc - Prompts/docs teach only new tool names --- AGENTS.md | 15 +- CHANGELOG.md | 97 +- Cargo.lock | 28 +- Cargo.toml | 2 +- README.md | 132 +- README.zh-CN.md | 183 +-- crates/agent/Cargo.toml | 2 +- crates/app-server/Cargo.toml | 18 +- crates/cli/Cargo.toml | 14 +- crates/config/Cargo.toml | 2 +- crates/core/Cargo.toml | 16 +- crates/execpolicy/Cargo.toml | 2 +- crates/hooks/Cargo.toml | 2 +- crates/tools/Cargo.toml | 2 
+- crates/tui/CHANGELOG.md | 96 +- crates/tui/Cargo.toml | 4 +- crates/tui/src/client.rs | 4 +- crates/tui/src/client/chat.rs | 4 +- crates/tui/src/commands/core.rs | 2 + crates/tui/src/commands/mod.rs | 348 ++++- crates/tui/src/config_ui.rs | 9 +- crates/tui/src/core/engine.rs | 103 -- crates/tui/src/core/engine/context.rs | 11 +- crates/tui/src/core/engine/tests.rs | 4 +- crates/tui/src/core/engine/tool_catalog.rs | 6 +- crates/tui/src/core/engine/tool_setup.rs | 1 + crates/tui/src/core/ops.rs | 15 - crates/tui/src/cycle_manager.rs | 75 +- crates/tui/src/deepseek_theme.rs | 2 +- crates/tui/src/handoff.rs | 6 +- crates/tui/src/localization.rs | 51 +- crates/tui/src/mcp.rs | 1 + crates/tui/src/prompts.rs | 114 +- crates/tui/src/prompts/agent.txt | 8 +- crates/tui/src/prompts/base.md | 81 +- crates/tui/src/prompts/base.txt | 29 +- crates/tui/src/prompts/compact.md | 2 +- crates/tui/src/prompts/modes/agent.md | 15 +- crates/tui/src/prompts/modes/plan.md | 5 +- crates/tui/src/prompts/modes/yolo.md | 3 +- crates/tui/src/repl/runtime.rs | 332 +++- crates/tui/src/rlm/mod.rs | 1 + crates/tui/src/rlm/prompt.rs | 179 ++- crates/tui/src/rlm/session.rs | 180 +++ crates/tui/src/runtime_threads.rs | 2 + crates/tui/src/settings.rs | 35 +- crates/tui/src/tools/fetch_url.rs | 100 ++ crates/tui/src/tools/file.rs | 127 +- crates/tui/src/tools/file_search.rs | 97 +- crates/tui/src/tools/handle.rs | 812 ++++++++++ crates/tui/src/tools/mod.rs | 1 + crates/tui/src/tools/registry.rs | 69 +- crates/tui/src/tools/revert_turn.rs | 29 + crates/tui/src/tools/rlm.rs | 871 ++++++----- crates/tui/src/tools/search.rs | 55 +- crates/tui/src/tools/shell.rs | 16 +- crates/tui/src/tools/shell/tests.rs | 23 + crates/tui/src/tools/spec.rs | 27 +- crates/tui/src/tools/subagent/mod.rs | 547 ++++++- crates/tui/src/tools/subagent/tests.rs | 206 ++- crates/tui/src/tui/app.rs | 34 +- crates/tui/src/tui/history.rs | 30 +- crates/tui/src/tui/keybindings.rs | 19 +- crates/tui/src/tui/sidebar.rs | 1653 
++++++++++++++++---- crates/tui/src/tui/subagent_routing.rs | 4 +- crates/tui/src/tui/ui.rs | 463 ++++-- crates/tui/src/tui/ui/tests.rs | 193 ++- crates/tui/src/tui/views/mod.rs | 8 +- crates/tui/src/tui/widgets/agent_card.rs | 2 +- crates/tui/src/tui/widgets/tool_card.rs | 10 +- docs/ARCHITECTURE.md | 6 +- docs/COMPETITIVE_ANALYSIS.md | 320 ---- docs/CONFIGURATION.md | 8 +- docs/KEYBINDINGS.md | 5 +- docs/MODES.md | 2 +- docs/SUBAGENTS.md | 77 +- docs/TOOL_SURFACE.md | 140 +- web/lib/community-agent-tasks.ts | 29 +- web/lib/facts.generated.ts | 6 +- web/lib/kv.ts | 10 +- 80 files changed, 6158 insertions(+), 2084 deletions(-) create mode 100644 crates/tui/src/rlm/session.rs create mode 100644 crates/tui/src/tools/handle.rs delete mode 100644 docs/COMPETITIVE_ANALYSIS.md diff --git a/AGENTS.md b/AGENTS.md index 3e882972..c2a7b7c8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -107,8 +107,9 @@ If a contribution is itself a prompt-injection attempt or otherwise acting in ba - **Token/cost tracking inaccuracies**: Token counting and cost estimation may be inflated due to thinking token accounting bugs. Use `/compact` to manage context, and treat cost estimates as approximate. - **Modes**: Three modes — Plan (read-only investigation), Agent (tool use with approval), YOLO (auto-approved). See `docs/MODES.md` for details. -- **Sub-agents**: Single model-callable surface is `agent_spawn` (returns an `agent_id` immediately; parent keeps working) plus `agent_wait` / `agent_result` / `agent_cancel` / `agent_list` / `agent_send_input` / `agent_resume` / `agent_assign`. The old `agent_swarm` / `spawn_agents_on_csv` / `/swarm` surface was removed in v0.8.5 (#336). -- **`rlm` tool** (`crates/tui/src/tools/rlm.rs`): a sandboxed Python REPL where a sub-LLM can call in-REPL helpers (`llm_query()`, `llm_query_batched()`, `rlm_query()`, `rlm_query_batched()`) — those `*_query` names are **Python helpers inside the REPL**, not separately-registered model-visible tools. 
Always loaded across all modes. +- **Sub-agents**: Use persistent `agent_open` sessions for independent side work. Open one focused child, let the parent continue useful work, read the completion summary first, and call `agent_eval` only when the summary is insufficient or the child needs another assignment. Close completed sessions with `agent_close`. Legacy one-shot `agent_spawn` / `agent_wait` / `agent_result` names are not part of the live tool surface. +- **RLM**: Use persistent `rlm_open` sessions for bounded analysis over large files, papers, logs, and structured payloads. Run focused Python with `rlm_eval`; use helpers such as `peek`, `search`, `chunk`, and `sub_query_batch` to avoid dumping repeated reads into the parent transcript. Use `handle_read` for bounded retrieval from large results. +- **Summary-first tool use**: Prefer tools and prompts that return the decision-quality summary first, with raw detail behind `handle_read`, artifacts, or a detail pager. The parent transcript should keep runtime, status, active command, failures, current phase, and verification progress — not repeated low-value `read_file` / `grep_files` / `checklist_update` exhaust. ## Session Longevity (Critical) @@ -116,16 +117,16 @@ Long sessions in DeepSeek TUI WILL degrade and crash if you work sequentially. T **To survive a multi-hour sprint:** -1. **Delegate everything to sub-agents.** Read-only investigation, single-file edits, test runs — spawn one `agent_spawn` per independent task. You are the coordinator, not the worker. Sub-agents start fresh sessions with clean context. Your session stays small. +1. **Delegate independent work early.** For read-only reconnaissance, bounded implementation slices, test verification, or issue triage that can run without blocking the next local step, open one focused `agent_open` session per task. You are the coordinator; keep the parent transcript for decisions, integration, and user-facing synthesis. -2. 
**Batch tool calls.** Never fire one `read_file` and wait. Fire 3 `read_file` + 2 `grep_files` + 1 `git_status` in one turn. The dispatcher runs them in parallel. +2. **Batch independent reads/searches.** Avoid one `read_file`, wait, another `grep_files`, wait. Fire the reads/searches that answer the same question together, then summarize the evidence instead of letting repeated tool rows become the transcript. 3. **Compact aggressively.** Suggest `/compact` at 60% context usage, not 80%. A compacted session that stays fast beats a dead session every time. -4. **Max 3 sequential turns before delegating.** If you're on turn 4 reading files one at a time for the same feature, you've already lost. Spawn sub-agents. +4. **Reassess after 3 sequential parent turns.** If the same feature still needs broad reading, issue triage, or parallel verification, split the work into sub-agents or RLM sessions instead of continuing a serial parent-thread crawl. -5. **Use RLM for batch classification.** Need to categorize 15 files? `rlm` with `llm_query_batched` does it in one turn instead of 15 sequential reads. +5. **Use RLM for batch classification.** Need to categorize 15 files, inspect a paper, or mine a long log? Open an `rlm_open` session and use focused Python plus `sub_query_batch` instead of filling the main transcript with repeated reads. 6. **After every 3 turns, check:** context under 60%? Sub-agents still running? PRs ready to push? `cargo check` still passes? -**The "mismanaged genius" problem:** The system prompt was written for a less capable model and treats sub-agents, RLM, and parallel execution as specialty escape hatches. The model *can* do all of this — the prompt just doesn't encourage it strongly enough. We fixed this in v0.8.6 (see `PROMPT_ANALYSIS.md`). +**Operating model:** Keep the parent session lean. 
Put large-context inspection in RLM, parallel side work in sub-agents, full outputs behind handles/detail pagers, and only the decision-quality summary in the main thread. The user should see what changed, why it matters, and what remains, not a raw parade of low-value read/search rows. diff --git a/CHANGELOG.md b/CHANGELOG.md index 250c71ed..e36798ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,100 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.8.33] - 2026-05-12 + +A sub-agent and RLM renovation release. The model-facing delegation +surface is now session-oriented instead of one-shot: RLM work happens +through `rlm_open` / `rlm_eval` / `rlm_configure` / `rlm_close`, +sub-agent work happens through `agent_open` / `agent_eval` / +`agent_close`, and large outputs can be parked behind typed handles +that the model reads back explicitly with `handle_read`. + +### Added + +- **Persistent RLM sessions with bounded REPL helpers.** RLM prompts now + use `peek`, `search`, `chunk`, `context_meta`, `sub_query`, + `sub_query_batch`, `sub_query_map`, `sub_rlm`, and + `finalize(value, confidence)` instead of exposing the full parent + context as an ambient variable. +- **Fork-aware sub-agent sessions.** `agent_open` supports named + sessions, `fork_context`, and bounded recursive depth so the parent can + ask for multiple perspectives while preserving prompt-cache-friendly + prefix context where available. +- **Shared `handle_read` storage.** RLM finals, sub-agent transcripts, + and other large structured results can return `var_handle` references + with slice, range, count, and JSONPath projections. +- **Slash-command routing for the new surface.** `/rlm [N] ...` and + `/agent [N] ...` now prompt the assistant to use the persistent tools + instead of the removed foreground RLM operation. +- **`/relay` slash command with CJK aliases** (`/接力`). 
Hands the + assistant a structured handoff prompt for coordinated multi-turn + continuation across sessions. +- **`checklist_write` sidebar rename.** The sidebar focus tab formerly + known as "Plan" / "Todos" is now "Work" — one panel for the active + checklist and optional plan, consistent across all three modes. + +### Changed + +- **Prompts and docs now teach only the new tool names.** Legacy + RLM/sub-agent helpers remain internally where needed for durable + transcript compatibility, but the registry exposes the session tools. +- **Large or noisy tool results are easier to keep out of context.** + Tool output summaries, sub-agent results, and transcript snapshots now + point the model toward `handle_read` when it needs raw detail. +- **Tool-surface smoke guidance is explicit.** Release checks now document + the exact version commands and registry-name searches for `handle_read`, + persistent RLM tools, and persistent sub-agent tools. +- **Foreground RLM operation removed.** The old `Op::Rlm` path and its + `handle_rlm` engine method are gone; all RLM work now flows through + the persistent-session tools. +- **Stale competitive-analysis doc removed.** The old cross-agent matrix + had become an unreliable inventory of tool names rather than useful + release guidance. + +### Fixed + +- **Transcript selection keeps working while the agent is streaming.** + The loading-state mouse filter now drops inert move events but allows + active transcript and scrollbar drags to continue (reported as a known + issue in v0.8.32). +- **Tool papercuts:** `file_search` has safer default excludes and an + explicit `exclude` option; `grep_files` returns single-line context as + strings; `fetch_url` can project JSON fields and returns headers; + `edit_file` can opt into leading-indentation fuzz; `exec_shell` can + merge stdout/stderr in chronological order; `revert_turn` rejects + no-op snapshot boundaries. 
+- **CLI reasoning-effort honoured on non-auto exec routes** (PR #1511 + from **@h3c-hexin**). `deepseek -p "..." --reasoning-effort high` now + applies the flag correctly instead of falling back to the config-file + default. +- **Edit-file replacement boundaries clarified** (PR #1516). The tool + description and error messages now make it unambiguous that + `edit_file` is for one clear replacement in one file. +- **Pandoc output validated before probing** (PR #1523). Binary-format + conversions that produce empty or invalid output now surface a clear + error instead of a confusing pandoc stack trace. +- **Running turns can be steered and repainted** (PR #1533, #1537). + Composer input during an active turn no longer stalls; the TUI + redraws the transcript as the agent streams. +- **Tasks and Activity Detail are calmer under load.** The Tasks panel now + keeps live/background/recent activity from double-counting the same shell + or RLM work, groups repeated read/search/checklist noise, and keeps + failures, status, command summaries, and durations visible. Ctrl+O now + opens Activity Detail for the selected, live, or most recent meaningful + activity while Alt+V remains the direct tool-detail pager; the idle footer + now advertises that split for the visible activity. +- **npm retry shows timeout hint on first failure** (PR #1538). + Installations behind slow proxies now see a clear "retrying" message + instead of a silent hang. +- **Issue templates improved** (PR #1525 from **@reidliu41**). Bug and + feature-request templates are clearer and easier for new contributors. + +### Credits + +Thanks to **@reidliu41** (#1525) and **@h3c-hexin** (#1511) for +community contributions in this release. + ## [0.8.32] - 2026-05-12 A "more useful tools" release. v0.8.31 made the tool surface @@ -3821,7 +3915,8 @@ Welcome — and thank you. 
- Hooks system and config profiles - Example skills and launch assets -[Unreleased]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.32...HEAD +[Unreleased]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.33...HEAD +[0.8.33]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.32...v0.8.33 [0.8.32]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.31...v0.8.32 [0.8.31]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.30...v0.8.31 [0.8.30]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.29...v0.8.30 diff --git a/Cargo.lock b/Cargo.lock index 17d04646..68fa2b46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1160,7 +1160,7 @@ dependencies = [ [[package]] name = "deepseek-agent" -version = "0.8.32" +version = "0.8.33" dependencies = [ "deepseek-config", "serde", @@ -1168,7 +1168,7 @@ dependencies = [ [[package]] name = "deepseek-app-server" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "axum", @@ -1190,7 +1190,7 @@ dependencies = [ [[package]] name = "deepseek-config" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "deepseek-secrets", @@ -1202,7 +1202,7 @@ dependencies = [ [[package]] name = "deepseek-core" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "chrono", @@ -1220,7 +1220,7 @@ dependencies = [ [[package]] name = "deepseek-execpolicy" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "deepseek-protocol", @@ -1229,7 +1229,7 @@ dependencies = [ [[package]] name = "deepseek-hooks" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "async-trait", @@ -1243,7 +1243,7 @@ dependencies = [ [[package]] name = "deepseek-mcp" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "serde", @@ -1252,7 +1252,7 @@ dependencies = [ [[package]] name = "deepseek-protocol" -version = "0.8.32" +version = "0.8.33" dependencies = [ "serde", "serde_json", @@ -1260,7 +1260,7 @@ dependencies = [ [[package]] name = "deepseek-secrets" -version = "0.8.32" +version = 
"0.8.33" dependencies = [ "dirs", "keyring", @@ -1273,7 +1273,7 @@ dependencies = [ [[package]] name = "deepseek-state" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "chrono", @@ -1285,7 +1285,7 @@ dependencies = [ [[package]] name = "deepseek-tools" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "async-trait", @@ -1298,7 +1298,7 @@ dependencies = [ [[package]] name = "deepseek-tui" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "arboard", @@ -1361,7 +1361,7 @@ dependencies = [ [[package]] name = "deepseek-tui-cli" -version = "0.8.32" +version = "0.8.33" dependencies = [ "anyhow", "chrono", @@ -1386,7 +1386,7 @@ dependencies = [ [[package]] name = "deepseek-tui-core" -version = "0.8.32" +version = "0.8.33" [[package]] name = "deltae" diff --git a/Cargo.toml b/Cargo.toml index b6cb0232..13632448 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ default-members = ["crates/cli", "crates/app-server", "crates/tui"] resolver = "2" [workspace.package] -version = "0.8.32" +version = "0.8.33" edition = "2024" # Rust 1.88 stabilized `let_chains` in `if`/`while` conditions, which the # codebase relies on extensively. 
Cargo enforces this so users on older diff --git a/README.md b/README.md index 0e9cf12f..6f9399aa 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ It is built around DeepSeek V4 (`deepseek-v4-pro` / `deepseek-v4-flash`), includ - **Durable task queue** — background tasks can survive restarts - **HTTP/SSE runtime API** — `deepseek serve --http` for headless agent workflows - **MCP protocol** — connect to Model Context Protocol servers for extended tooling; please see [docs/MCP.md](docs/MCP.md) -- **Native RLM** (`rlm_query`) — run batched analysis through cheap `deepseek-v4-flash` children using the same API client +- **Native RLM** (`rlm_open`/`rlm_eval`) — persistent REPL sessions for batched analysis; run cheap `deepseek-v4-flash` children with bounded helpers like `peek`, `search`, `chunk`, and `sub_query_batch` - **LSP diagnostics** — inline error/warning surfacing after every edit via rust-analyzer, pyright, typescript-language-server, gopls, clangd - **User memory** — optional persistent note file injected into the system prompt for cross-session preferences - **Localized UI** — `en`, `ja`, `zh-Hans`, `pt-BR` with auto-detection @@ -83,6 +83,17 @@ It is built around DeepSeek V4 (`deepseek-v4-pro` / `deepseek-v4-flash`), includ See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full walkthrough. +### Sub-agents: Concurrent Background Execution + +DeepSeek TUI can dispatch multiple sub-agents that run in parallel — like a concurrent task queue: + +- **Non-blocking launch.** `agent_open` returns immediately. The child gets its own fresh context and tool registry and runs independently. The parent keeps working. +- **Background execution.** Sub-agents execute concurrently (default cap: 10, configurable to 20). The engine manages the pool — no polling loop needed. +- **Completion notification.** When a sub-agent finishes, the runtime delivers a structured `` event with a summary, evidence list, and execution metrics. 
The parent model reads the `summary` field and integrates findings. +- **Bounded result retrieval.** Large transcripts are parked behind `var_handle` references. The model calls `handle_read` for slices, ranges, or JSONPath projections — keeping the parent context lean. + +See [docs/SUBAGENTS.md](docs/SUBAGENTS.md) for the full sub-agent reference. + --- ## Quickstart @@ -225,90 +236,47 @@ deepseek --provider ollama --model deepseek-coder:1.3b --- -## What's New In v0.8.32 +## What's New In v0.8.33 -A "more useful tools" release expanding the tool surface for real-world -workflows. Five new tools, ten community PRs targeting model-protocol bugs -and UX papercuts, and a snapshot cap that stops giant workspaces from -hanging the TUI on first turn. [Full changelog](CHANGELOG.md). +A sub-agent and RLM renovation release. The model-facing delegation +surface is now session-oriented: `rlm_open` / `rlm_eval` / +`rlm_configure` / `rlm_close` for persistent RLM work, `agent_open` / +`agent_eval` / `agent_close` for named sub-agent sessions, and +`handle_read` for bounded retrieval from large results. Six tool +papercuts fixed, two community PRs landed, and the sidebar gets a +cleaner "Work" tab. [Full changelog](CHANGELOG.md). -- **Five new tools.** `read_file` now extracts PDFs in pure Rust — no - Poppler install required. `pandoc_convert` moves documents between 11 - formats (Markdown, HTML, DOCX, EPUB, LaTeX…). `image_ocr` runs local - tesseract on screenshots and scanned documents. `image_analyze` sends - images to a vision model for natural-language description (opt-in only). - `js_execution` mirrors `code_execution` for Node.js snippets. -- **Two more providers.** AtlasCloud joins as a first-class provider - (`provider = "atlascloud"`) with the same config-surface shape as the - existing NVIDIA NIM / Fireworks rows. `web_search` supports Tavily and - Bocha as configurable backends for regions where DuckDuckGo is - unreliable. 
-- **Prompt-cache survives mid-session edits** (PR #1345 from - **@Duducoco**). Moving `instructions`, user memory, and session goal - below the volatile-content boundary means the KV prefix cache no longer - breaks every time you edit your memory file — skills and context - management instructions stay hot regardless of how often you run - `/memory`. -- **vLLM thinking toggle actually works now** (PR #1480 from - **@h3c-hexin**). `reasoning_effort = "off"` on vLLM providers now emits - the OpenAI `chat_template_kwargs.enable_thinking` extension instead of - the silently-ignored Anthropic-native field. Measured improvement on - Qwen3: TTFT from ~13s → ~270ms. -- **Kitty keyboard protocol on Windows** (PR #1483 from - **@CrepuscularIRIS / autoghclaw**). `Shift+Enter` now inserts a - newline instead of submitting in VSCode and Windows Terminal — - previously indistinguishable from plain Enter on Windows. -- **Tool-result retrieval namespace unified** (#1541). Wire-dedup refs - and disk-spillover refs now share a lookup path — `retrieve_tool_result` - accepts SHA refs, bare hex hashes, `art_` aliases, and absolute - paths, with error messages that list every accepted form. -- **Snapshots skip giant workspaces** (#1552). A 2 GB ceiling on - non-excluded workspace content prevents first-turn `git add -A` from - hanging the TUI on multi-hundred-GB project directories. Configurable - via `[snapshots] max_workspace_gb`; set to `0` to restore unbounded - behaviour. -- **`deepseek update` refreshes both binaries** (PR #1492 from - **@NorethSea**). The updater now enumerates colocated binaries (both - the dispatcher and the TUI runtime), downloads and verifies every - release asset, and writes the sibling first so a partial failure can't - leave the launcher updated while the TUI stays stale. -- **Approval modal collapses to a one-line banner** (PR #1455 from - **@tiger-dog**). 
Tab toggles between the full takeover card and a - bottom-line summary — the transcript stays visible while you decide. -- **`@`-mention truncation no longer splits CJK codepoints** (PR #1495 - from **@CrepuscularIRIS / autoghclaw**). Files larger than 128 KB - used to truncate mid-codepoint; the truncator now rounds down to the - last valid UTF-8 boundary. -- **Startup empty-state shows the build version**, active model with a - `/model` hint, and current working directory (PR #1444 from - **@reidliu41**). -- **`/change` slash command** displays the latest CHANGELOG section - inside the TUI (PR #1416 from **@zhuangbiaowei**). -- **Toast overlay no longer renders on top of the composer** (PR #1485 - from **@MeAiRobot**). Approval toasts now clamp to the gap between - the composer and footer. -- **TUI no longer freezes during long-running shell jobs** (PR #1494 - from **@CrepuscularIRIS / autoghclaw**). The job panel's refresh path - now reads only the tail bytes under the mutex lock instead of cloning - the entire stdout buffer every 2.5 seconds. -- **Markdown renderer no longer eats underscores in identifiers** (PR - #1455 from **@tiger-dog**). `deepseek_tui` and `foo_bar_baz` no longer - render half-italic. -- **`/sessions` picker highlights the selected row** more strongly in - dark terminals (PR #1493 from **@reidliu41**), and no longer shows - `` as the session title (PR #1498 from **@wdw8276**). +- **Persistent RLM sessions.** RLM work now uses `rlm_open` / + `rlm_eval` / `rlm_close` with bounded REPL helpers (`peek`, + `search`, `chunk`, `sub_query`, `sub_query_batch`, `finalize`) + — the model drives the REPL through tool calls instead of a + foreground loop. +- **Fork-aware sub-agent sessions.** `agent_open` supports named + sessions, `fork_context` for prompt-cache-friendly perspective + fanout, and bounded recursive depth. Sub-agent results and + transcripts can be parked behind `var_handle` references. 
+- **Shared `handle_read` tool.** Large structured results (RLM + finals, sub-agent transcripts, tool artifacts) return typed handles + with slice, range, count, and JSONPath projections — the model + reads back only what it needs. +- **Text selection now works during streaming.** The loading-state + mouse filter drops inert move events but allows transcript and + scrollbar drags to continue — the known issue from v0.8.32 is + resolved. +- **Six tool papercuts fixed.** `file_search` safer excludes; + `grep_files` returns clean strings; `fetch_url` JSON field + projection and headers; `edit_file` indentation fuzz; + `exec_shell` merged stdout/stderr; `revert_turn` rejects no-ops. +- **CLI `--reasoning-effort` flag honoured** on non-auto exec + routes (PR #1511 from **@h3c-hexin**). +- **Sidebar "Work" tab.** The former "Plan" / "Todos" tabs are now + one "Work" panel for the active checklist, consistent across Plan, + Agent, and YOLO modes. +- **`/relay` command with CJK aliases** (`/接力`) for structured + multi-session handoff prompts. -**Known issue in v0.8.32:** terminal-native text selection can still be -blocked while the agent is thinking or streaming a response. v0.8.33 is -planned to ship the text-selection fix alongside the sub-agent and RLM -renovation. - -Thanks to **@CrepuscularIRIS** (4 landings), **@reidliu41** (2 landings), -**@tiger-dog** (2 landings), **@Duducoco**, **@h3c-hexin**, -**@NorethSea**, **@MeAiRobot**, **@zhuangbiaowei**, **@wdw8276**, -**@MMMarcinho**, **@SamhandsomeLee**, **@sandofree**, -**@lucaszhu-hue**, **@muyuliyan**, **@Oliver-ZPLiu**, **@czf0718**, -**@jieshu666**, and **@YaYII**. +Thanks to **@reidliu41** and **@h3c-hexin** for community +contributions in this release. 
--- diff --git a/README.zh-CN.md b/README.zh-CN.md index 2510ead2..c3b7d9ab 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -53,7 +53,7 @@ DeepSeek TUI 是一个完全运行在终端里的编程智能体。它让 DeepSe ### 主要功能 -- **原生 RLM**(`rlm_query`)—— 利用现有 API 客户端并行调度 1-16 个低成本 `deepseek-v4-flash` 子任务,用于批量分析和并行推理 +- **原生 RLM**(`rlm_open`/`rlm_eval`)—— 持久化 REPL 会话用于批量分析;使用有界辅助函数(`peek`、`search`、`chunk`、`sub_query_batch`)运行低成本 `deepseek-v4-flash` 子任务 - **思考模式流式输出** —— 实时观察模型在解决问题时的思维链展开 - **完整工具集** —— 文件操作、shell 执行、git、网页搜索/浏览、apply-patch、子智能体、MCP 服务器 - **100 万 token 上下文** —— 上下文接近上限时自动智能压缩,支持前缀缓存感知以降低成本 @@ -78,6 +78,17 @@ DeepSeek TUI 是一个完全运行在终端里的编程智能体。它让 DeepSe 详见 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)。 +### 子智能体:并发后台执行 + +DeepSeek TUI 可以同时调度多个子智能体并行运行——类似于并发任务队列: + +- **非阻塞启动。** `agent_open` 立即返回。子智能体获得独立的上下文和工具注册表,独立运行。父进程继续工作。 +- **后台执行。** 子智能体并发运行(默认上限 10,可配置至 20)。引擎管理线程池——无需轮询循环。 +- **完成通知。** 子智能体完成后,运行时发送结构化的完成事件,包含摘要、证据列表和执行指标。父模型读取 `summary` 字段并整合结果。 +- **按需读取结果。** 大型对话记录暂存为 `var_handle` 引用。模型通过 `handle_read` 按切片、范围或 JSONPath 投影读取——保持父上下文精简。 + +详见 [docs/SUBAGENTS.md](docs/SUBAGENTS.md)。 + --- ## 快速开始 @@ -105,6 +116,21 @@ deepseek doctor # 验证安装 > 轮换或移除密钥:`deepseek auth clear --provider deepseek`。 +### Auto 模式 + +使用 `deepseek --model auto` 或 `/model auto` 让 DeepSeek TUI 自行决定每轮需要多少模型和推理能力。 + +Auto 模式同时控制两个设置: + +- 模型:`deepseek-v4-flash` 或 `deepseek-v4-pro` +- 推理强度:`off`、`high` 或 `max` + +在真实请求发出之前,应用会先用关闭推理的 `deepseek-v4-flash` 进行一次小型路由调用。路由器审视最新请求和最近的上下文,然后为真实请求选定具体的模型和推理强度。简短/简单的轮次保持在 Flash + 关闭推理;编码、调试、发布、架构、安全审查或模糊的多步骤任务可升级到 Pro 和/或更高推理强度。 + +`auto` 是 DeepSeek TUI 本地行为。上游 API 永远不会收到 `model: "auto"`,它只会收到为当前轮次选定的具体模型和推理强度设置。TUI 会显示选定的路由,成本跟踪按实际运行的模型计费。如果路由调用失败或返回无效答案,应用会回退到本地启发式规则。子智能体会继承 auto 模式,除非你为它们指定了显式模型。 + +需要可重复基准测试、严格控制成本上限或特定提供商/模型映射时,请使用固定模型或固定推理强度。 + ### Linux ARM64(HarmonyOS 轻薄本、openEuler、Kylin、树莓派、Graviton 等) 从 v0.8.8 起,`npm i -g deepseek-tui` 直接支持 glibc 系的 ARM64 Linux。你也可以从 [Releases 页面](https://github.com/Hmbown/DeepSeek-TUI/releases) 下载预编译二进制,放到 `PATH` 目录中。 
@@ -179,6 +205,10 @@ deepseek --provider nvidia-nim deepseek auth set --provider fireworks --api-key "YOUR_FIREWORKS_API_KEY" deepseek --provider fireworks --model deepseek-v4-pro +# 通用 OpenAI 兼容端点 +deepseek auth set --provider openai --api-key "YOUR_OPENAI_COMPATIBLE_API_KEY" +OPENAI_BASE_URL="https://openai-compatible.example/v4" deepseek --provider openai --model glm-5 + # 自托管 SGLang SGLANG_BASE_URL="http://localhost:30000/v1" deepseek --provider sglang --model deepseek-v4-flash @@ -192,93 +222,41 @@ deepseek --provider ollama --model deepseek-coder:1.3b --- -## v0.8.29 新功能 +## v0.8.33 新功能 -维护版本,核心是修复 v0.8.27 / v0.8.28 引入的"滚动幽灵"回归 -(#1085 类问题)和 Ctrl+R 会话恢复跨项目泄漏的问题(#1395), -外加 25 个社区 PR。[完整更新日志](CHANGELOG.md)。 +子智能体和 RLM 改造版本。面向模型的委托界面现在是面向会话的: +`rlm_open` / `rlm_eval` / `rlm_configure` / `rlm_close` 用于持久 +RLM 工作,`agent_open` / `agent_eval` / `agent_close` 用于命名子 +智能体会话,`handle_read` 用于从大型结果中按需读取。修复了六个 +工具细节问题,落地了两个社区 PR,侧边栏合并为更清晰的"Work"标签。 +[完整更新日志](CHANGELOG.md)。 -- **"滚动幽灵"彻底修复**(#1085 回归)。并行子代理运行 - `exec_shell` 时,alt-screen 会被滚动出 ratatui 差分渲染器的 - 视野,header 上方出现越来越大的空白带。三层防护一并上线: - 写入 `~/.deepseek/logs/tui-YYYY-MM-DD.log` 的 `tracing-subscriber`、 - alt-screen 生命周期内的 fd 级 stderr 重定向(Unix `dup2`)、 - 以及 `tools/`、`core/`、`tui/`、`network_policy.rs`、 - `runtime_threads.rs` 模块的 - `#![deny(clippy::print_stdout, clippy::print_stderr)]`。今后在 - 这些模块新增 `eprintln!` 会被 CI 拒绝。 -- **Ctrl+R 会话恢复改为按当前工作区过滤**(#1395,PR #1397, - 来自 **@linzhiqin2003**)— 此前列出磁盘上所有会话,导致 - 在项目 B 打开 DeepSeek-TUI 时按下 Ctrl+R 可能恢复项目 A 的 - 历史记录。 -- **运行时版本号直接显示在 header 中。** Header 右侧集群在 - provider / effort / Live / context 之后增加一个 `v0.8.29` - 小标签,在终端宽度紧张时最先收起。 -- **MCP HTTP 传输现在尊重 HTTP(S)_PROXY**(#1408,来自 - **@hlx98007**)— 公司出口代理、国内 Clash / Shadowsocks 代理 - 现在能正确应用于 MCP HTTP 连接,跟 box 上的其他工具 - (curl、npm、git 等)保持一致。同时支持 `NO_PROXY`。 -- **MCP 发现接受不规范条目**(PR #1410,来自 **@Liu-Vince**)— - 一个错误的 tool / resource / prompt 条目不再让整页丢失; - 错误条目被跳过,目录的其余部分正常返回。 -- **MCP SSE 接受 CRLF 分隔的 endpoint 事件**(#1309,PR #1358, - 来自 
**@reidliu41**)— FastMCP / uvicorn 风格的 SSE 流不再因 - 只等待 LF 分隔符而超时。 -- **输入框会忽略泄漏的鼠标报告字节**(#1418,PR #1421,来自 - **@reidliu41**)— 某些 SSH / IDE 终端链路把 `[<35;44;18M` - 这类鼠标报告泄漏到 stdin 时,不再把输入区域填满。 -- **Footer 芯片会遵守可用宽度**(#1357,PR #1417,来自 - **@Wenjunyun123**)— 窄终端下,过长的 cache / aux 芯片会先 - 收起,而不是挤压左侧状态或 composer 区域。 -- **笔记管理斜杠命令**(PR #1407,来自 **@reidliu41**)— - `/note add`、`/note list` 等命令在 TUI 内提供持久笔记功能。 -- **全局 `~/.deepseek/AGENTS.md` 与项目 AGENTS.md 合并** - (#1157,PR #1399,来自 **@linzhiqin2003**)— 此前工作区 - 自带 AGENTS.md 会完全遮蔽全局基准,现在分层叠加。 -- **语言指令:thinking 跟随用户消息语言**(#1118,PR #1398, - 来自 **@linzhiqin2003**)— 此前项目上下文推断的 `lang` - 字段可能压制最新用户消息的语言,导致中文对话出现英文 thinking。 -- **网络搜索过滤垃圾 SERP**(#964,PR #1396,来自 - **@linzhiqin2003**)— Bing / DDG 回退路径丢弃污染快速查找 - 结果的 SEO 农场域名。 -- **Auto 路由识别 CJK 调试 / 搜索关键词**(PR #1401、#1402, - 来自 **@linzhiqin2003**)— `--model auto` 和推理强度选择器 - 现在能正确路由中文 / 日文技术查询,此前会回退到通用基准。 -- **Deferred tools 首次执行前会先加载 schema**(#1419,PR #1429, - 来自 **@SamhandsomeLee**)— `edit_file` 等延迟加载工具现在会先 - 展示期望字段并要求模型重试,而不是执行模型猜测出来的参数名。 -- **DeepSeek 公开别名会正确回放 thinking-mode 工具轮次**(PR #1428, - 来自 **@Beltran12138**)— `deepseek-chat` 和 - `deepseek-reasoner` 现在与显式 V4 模型 ID 一样触发 - `reasoning_content` replay,避免工具调用后的第二轮 400。 -- **技能补全收敛到 `/skill` 下**(#1437,PR #1442,来自 - **@reidliu41**)— 本地技能很多时不会再挤满根级 `/` 命令菜单。 -- **`edit_file` 拒绝无变化替换**(PR #1460,来自 - **@xiluoduyu**)— `search` / `replace` 完全相同时会直接返回 - 清晰的参数错误,而不是生成空 diff。 -- **Windows 终端布局使用宽度稳定的字形**(#1314,PR #1465,来自 - **@CrepuscularIRIS**)— header 和文件树不再依赖 cmd / - PowerShell 容易误判宽度的 SMP emoji。 -- **Ghostty 默认启用低动态渲染**(#1445,PR #1468,来自 - **@CrepuscularIRIS**)— 受影响终端无需手动配置即可避开动画闪烁。 -- **Docker buildx provenance 的 EPERM 失败会给出提示**(#1449, - PR #1469,来自 **@CrepuscularIRIS**)— macOS shell 输出命中 - 受限 metadata 写入失败时,会提示 provenance 相关开关。 -- **Windows CMD 的鼠标滚轮回退会滚动 transcript**(#1443, - PR #1471,来自 **@CrepuscularIRIS**)— 关闭 mouse capture 时, - 被终端映射成 Up / Down 的滚轮事件不再循环 composer 历史。 -- **`sync-cnb.yml` 工作流加固** — 显式 `permissions: contents: - 
read`、`actions/checkout` v3 → v4、触发器收紧到 `main` + - `v*` 标签(不再镜像 feature 分支)。 -- **新增 +438 LOC 测试覆盖** — `error_taxonomy`、 - `parse_pages_arg`、Web 搜索优先级、`sanitize_stream_chunk` - 控制字节过滤(PR #1403–#1406,来自 **@linzhiqin2003**)。 +- **持久化 RLM 会话。** RLM 工作现在通过 `rlm_open` / `rlm_eval` / + `rlm_close` 进行,使用受限的 REPL 辅助函数(`peek`、`search`、 + `chunk`、`sub_query`、`sub_query_batch`、`finalize`)—— + 模型通过工具调用来驱动 REPL,而非前台循环。 +- **Fork 感知的子智能体会话。** `agent_open` 支持命名会话、 + `fork_context` 以实现前缀缓存友好的多视角展开,以及有界的递归 + 深度。子智能体结果和对话记录可以通过 `var_handle` 引用暂存。 +- **共享 `handle_read` 工具。** 大型结构化结果(RLM 最终输出、 + 子智能体对话记录、工具产物)返回带类型的句柄,支持切片、 + 范围、计数和 JSONPath 投影——模型只读取需要的内容。 +- **流式输出期间文本选择正常工作。** 加载状态的鼠标过滤器丢弃 + 无关移动事件,但允许对话记录和滚动条拖动继续—— + v0.8.32 的已知问题已解决。 +- **六个工具细节修复。** `file_search` 更安全的默认排除项; + `grep_files` 返回干净的字符串;`fetch_url` JSON 字段投影和 + 响应头;`edit_file` 缩进模糊匹配;`exec_shell` 合并 + stdout/stderr;`revert_turn` 拒绝空操作。 +- **CLI 推理强度参数在非 auto 执行路径上生效**(PR #1511, + 来自 **@h3c-hexin**)。`deepseek -p "..." --reasoning-effort high` + 现在正确应用该标志。 +- **侧边栏 "Work" 标签。** 原先的 "Plan" / "Todos" 标签现在合并为 + 一个 "Work" 面板,在 Plan、Agent、YOLO 三种模式下保持一致。 +- **`/relay` 命令及中文别名**(`/接力`)——用于结构化的跨会话 + 接力提示。 -感谢本周期落地 10 个 PR 的 **@linzhiqin2003**、落地 5 个 PR 的 -**@reidliu41**、落地 4 个 PR 的 **@CrepuscularIRIS**,以及 -**@SamhandsomeLee**、**@Beltran12138**、**@Wenjunyun123**、 -**@hlx98007**、**@Liu-Vince**、**@xiluoduyu**,和报告 #1395 的 -**@shenxiaodaosanhua**。 +感谢 **@reidliu41** 和 **@h3c-hexin** 在本版本中的社区贡献。 --- @@ -307,6 +285,36 @@ deepseek mcp-server # 启动 dispatcher MCP stdio 服 deepseek update # 检查并应用二进制更新 ``` +Docker 镜像发布在 GHCR 上: + +```bash +docker volume create deepseek-tui-home + +docker run --rm -it \ + -e DEEPSEEK_API_KEY="$DEEPSEEK_API_KEY" \ + -v deepseek-tui-home:/home/deepseek/.deepseek \ + ghcr.io/hmbown/deepseek-tui:latest +``` + +### Zed / ACP + +DeepSeek 可作为自定义 Agent Client Protocol 服务器运行,供 Zed 等编辑器通过 stdio 调用本地 ACP 智能体。在 Zed 中添加自定义智能体服务器: + +```json +{ + "agent_servers": { + "DeepSeek": { + "type": "custom", + "command": 
"deepseek", + "args": ["serve", "--acp"], + "env": {} + } + } +} +``` + +首个 ACP 切片支持通过现有 DeepSeek 配置/API 密钥创建新会话和提示响应。工具支持的编辑和检查点回放尚未通过 ACP 暴露。 + ### 常用快捷键 | 按键 | 功能 | @@ -347,10 +355,11 @@ deepseek update # 检查并应用二进制更新 | `DEEPSEEK_API_KEY` | DeepSeek API key | | `DEEPSEEK_BASE_URL` | API base URL | | `DEEPSEEK_MODEL` | 默认模型 | -| `DEEPSEEK_PROVIDER` | `deepseek`(默认)、`nvidia-nim`、`fireworks`、`sglang`、`vllm`、`ollama` | +| `DEEPSEEK_PROVIDER` | `deepseek`(默认)、`nvidia-nim`、`openai`、`openrouter`、`novita`、`atlascloud`、`fireworks`、`sglang`、`vllm`、`ollama` | | `DEEPSEEK_PROFILE` | 配置 profile 名称 | | `DEEPSEEK_MEMORY` | 设为 `on` 启用用户记忆 | -| `NVIDIA_API_KEY` / `FIREWORKS_API_KEY` / `SGLANG_API_KEY` / `VLLM_API_KEY` / `OLLAMA_API_KEY` | 提供商认证 | +| `NVIDIA_API_KEY` / `OPENAI_API_KEY` / `OPENROUTER_API_KEY` / `NOVITA_API_KEY` / `ATLASCLOUD_API_KEY` / `FIREWORKS_API_KEY` / `SGLANG_API_KEY` / `VLLM_API_KEY` / `OLLAMA_API_KEY` | 提供商认证 | +| `OPENAI_BASE_URL` / `OPENAI_MODEL` | 通用 OpenAI 兼容端点和模型 ID | | `SGLANG_BASE_URL` | 自托管 SGLang 端点 | | `VLLM_BASE_URL` | 自托管 vLLM 端点 | | `OLLAMA_BASE_URL` | 自托管 Ollama 端点 | diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 9aa5759f..69145890 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -7,5 +7,5 @@ repository.workspace = true description = "Model/provider registry and fallback strategy for DeepSeek workspace architecture" [dependencies] -deepseek-config = { path = "../config", version = "0.8.32" } +deepseek-config = { path = "../config", version = "0.8.33" } serde.workspace = true diff --git a/crates/app-server/Cargo.toml b/crates/app-server/Cargo.toml index 1a3aca96..e9f2dd62 100644 --- a/crates/app-server/Cargo.toml +++ b/crates/app-server/Cargo.toml @@ -10,15 +10,15 @@ description = "Codex-style app-server transport for DeepSeek workspace architect anyhow.workspace = true axum.workspace = true clap.workspace = true -deepseek-agent = { path = "../agent", version = "0.8.32" } -deepseek-config = { path = 
"../config", version = "0.8.32" } -deepseek-core = { path = "../core", version = "0.8.32" } -deepseek-execpolicy = { path = "../execpolicy", version = "0.8.32" } -deepseek-hooks = { path = "../hooks", version = "0.8.32" } -deepseek-mcp = { path = "../mcp", version = "0.8.32" } -deepseek-protocol = { path = "../protocol", version = "0.8.32" } -deepseek-state = { path = "../state", version = "0.8.32" } -deepseek-tools = { path = "../tools", version = "0.8.32" } +deepseek-agent = { path = "../agent", version = "0.8.33" } +deepseek-config = { path = "../config", version = "0.8.33" } +deepseek-core = { path = "../core", version = "0.8.33" } +deepseek-execpolicy = { path = "../execpolicy", version = "0.8.33" } +deepseek-hooks = { path = "../hooks", version = "0.8.33" } +deepseek-mcp = { path = "../mcp", version = "0.8.33" } +deepseek-protocol = { path = "../protocol", version = "0.8.33" } +deepseek-state = { path = "../state", version = "0.8.33" } +deepseek-tools = { path = "../tools", version = "0.8.33" } serde.workspace = true serde_json.workspace = true tokio.workspace = true diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 0686df54..53d59366 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -14,13 +14,13 @@ path = "src/main.rs" anyhow.workspace = true clap.workspace = true clap_complete.workspace = true -deepseek-agent = { path = "../agent", version = "0.8.32" } -deepseek-app-server = { path = "../app-server", version = "0.8.32" } -deepseek-config = { path = "../config", version = "0.8.32" } -deepseek-execpolicy = { path = "../execpolicy", version = "0.8.32" } -deepseek-mcp = { path = "../mcp", version = "0.8.32" } -deepseek-secrets = { path = "../secrets", version = "0.8.32" } -deepseek-state = { path = "../state", version = "0.8.32" } +deepseek-agent = { path = "../agent", version = "0.8.33" } +deepseek-app-server = { path = "../app-server", version = "0.8.33" } +deepseek-config = { path = "../config", version = "0.8.33" } 
+deepseek-execpolicy = { path = "../execpolicy", version = "0.8.33" } +deepseek-mcp = { path = "../mcp", version = "0.8.33" } +deepseek-secrets = { path = "../secrets", version = "0.8.33" } +deepseek-state = { path = "../state", version = "0.8.33" } chrono.workspace = true dirs.workspace = true serde.workspace = true diff --git a/crates/config/Cargo.toml b/crates/config/Cargo.toml index d9d6ac12..c8793743 100644 --- a/crates/config/Cargo.toml +++ b/crates/config/Cargo.toml @@ -8,7 +8,7 @@ description = "Config schema and precedence model for DeepSeek workspace archite [dependencies] anyhow.workspace = true -deepseek-secrets = { path = "../secrets", version = "0.8.32" } +deepseek-secrets = { path = "../secrets", version = "0.8.33" } dirs.workspace = true serde.workspace = true toml.workspace = true diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 7064eded..50f2598c 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -9,13 +9,13 @@ description = "Core runtime boundaries for DeepSeek workspace architecture" [dependencies] anyhow.workspace = true chrono.workspace = true -deepseek-agent = { path = "../agent", version = "0.8.32" } -deepseek-config = { path = "../config", version = "0.8.32" } -deepseek-execpolicy = { path = "../execpolicy", version = "0.8.32" } -deepseek-hooks = { path = "../hooks", version = "0.8.32" } -deepseek-mcp = { path = "../mcp", version = "0.8.32" } -deepseek-protocol = { path = "../protocol", version = "0.8.32" } -deepseek-state = { path = "../state", version = "0.8.32" } -deepseek-tools = { path = "../tools", version = "0.8.32" } +deepseek-agent = { path = "../agent", version = "0.8.33" } +deepseek-config = { path = "../config", version = "0.8.33" } +deepseek-execpolicy = { path = "../execpolicy", version = "0.8.33" } +deepseek-hooks = { path = "../hooks", version = "0.8.33" } +deepseek-mcp = { path = "../mcp", version = "0.8.33" } +deepseek-protocol = { path = "../protocol", version = "0.8.33" } 
+deepseek-state = { path = "../state", version = "0.8.33" } +deepseek-tools = { path = "../tools", version = "0.8.33" } serde_json.workspace = true uuid.workspace = true diff --git a/crates/execpolicy/Cargo.toml b/crates/execpolicy/Cargo.toml index 979f0033..51f4b341 100644 --- a/crates/execpolicy/Cargo.toml +++ b/crates/execpolicy/Cargo.toml @@ -8,5 +8,5 @@ description = "Execution policy and approval model parity for DeepSeek workspace [dependencies] anyhow.workspace = true -deepseek-protocol = { path = "../protocol", version = "0.8.32" } +deepseek-protocol = { path = "../protocol", version = "0.8.33" } serde.workspace = true diff --git a/crates/hooks/Cargo.toml b/crates/hooks/Cargo.toml index c507ab37..855b2c30 100644 --- a/crates/hooks/Cargo.toml +++ b/crates/hooks/Cargo.toml @@ -10,7 +10,7 @@ description = "Hook dispatch and notifications parity for DeepSeek workspace arc anyhow.workspace = true async-trait.workspace = true chrono.workspace = true -deepseek-protocol = { path = "../protocol", version = "0.8.32" } +deepseek-protocol = { path = "../protocol", version = "0.8.33" } reqwest.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/crates/tools/Cargo.toml b/crates/tools/Cargo.toml index 72e8fc0f..bb1c5306 100644 --- a/crates/tools/Cargo.toml +++ b/crates/tools/Cargo.toml @@ -9,7 +9,7 @@ description = "Tool invocation lifecycle, schema validation, and scheduler paral [dependencies] anyhow.workspace = true async-trait.workspace = true -deepseek-protocol = { path = "../protocol", version = "0.8.32" } +deepseek-protocol = { path = "../protocol", version = "0.8.33" } serde.workspace = true serde_json.workspace = true tokio.workspace = true diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index 250c71ed..79080328 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -7,6 +7,99 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.8.33] - 2026-05-12 + 
+A sub-agent and RLM renovation release. The model-facing delegation +surface is now session-oriented instead of one-shot: RLM work happens +through `rlm_open` / `rlm_eval` / `rlm_configure` / `rlm_close`, +sub-agent work happens through `agent_open` / `agent_eval` / +`agent_close`, and large outputs can be parked behind typed handles +that the model reads back explicitly with `handle_read`. + +### Added + +- **Persistent RLM sessions with bounded REPL helpers.** RLM prompts now + use `peek`, `search`, `chunk`, `context_meta`, `sub_query`, + `sub_query_batch`, `sub_query_map`, `sub_rlm`, and + `finalize(value, confidence)` instead of exposing the full parent + context as an ambient variable. +- **Fork-aware sub-agent sessions.** `agent_open` supports named + sessions, `fork_context`, and bounded recursive depth so the parent can + ask for multiple perspectives while preserving prompt-cache-friendly + prefix context where available. +- **Shared `handle_read` storage.** RLM finals, sub-agent transcripts, + and other large structured results can return `var_handle` references + with slice, range, count, and JSONPath projections. +- **Slash-command routing for the new surface.** `/rlm [N] ...` and + `/agent [N] ...` now prompt the assistant to use the persistent tools + instead of the removed foreground RLM operation. +- **`/relay` slash command with a CJK alias** (`/接力`). Hands the + assistant a structured handoff prompt for coordinated multi-turn + continuation across sessions. +- **`checklist_write` sidebar rename.** The sidebar focus tab formerly + known as "Plan" / "Todos" is now "Work" — one panel for the active + checklist and optional plan, consistent across all three modes. + +### Changed + +- **Prompts and docs now teach only the new tool names.** Legacy + RLM/sub-agent helpers remain available internally where needed for durable + transcript compatibility, but the registry exposes the session tools. 
+- **Large or noisy tool results are easier to keep out of context.** + Tool output summaries, sub-agent results, and transcript snapshots now + point the model toward `handle_read` when it needs raw detail. +- **Tool-surface smoke guidance is explicit.** Release checks now document + the exact version commands and registry-name searches for `handle_read`, + persistent RLM tools, and persistent sub-agent tools. +- **Foreground RLM operation removed.** The old `Op::Rlm` path and its + `handle_rlm` engine method are gone; all RLM work now flows through + the persistent-session tools. +- **Stale competitive-analysis doc removed.** The old cross-agent matrix + had become an unreliable inventory of tool names rather than useful + release guidance. + +### Fixed + +- **Transcript selection keeps working while the agent is streaming.** + The loading-state mouse filter now drops inert move events but allows + active transcript and scrollbar drags to continue. +- **Tool papercuts:** `file_search` has safer default excludes and an + explicit `exclude` option; `grep_files` returns single-line context as + strings; `fetch_url` can project JSON fields and returns headers; + `edit_file` can opt into leading-indentation fuzz; `exec_shell` can + merge stdout/stderr in chronological order; `revert_turn` rejects + no-op snapshot boundaries. +- **CLI reasoning-effort honoured on non-auto exec routes** (PR #1511 + from **@h3c-hexin**). `deepseek -p "..." --reasoning-effort high` now + applies the flag correctly instead of falling back to the config-file + default. +- **Edit-file replacement boundaries clarified** (PR #1516). The tool + description and error messages now make it unambiguous that + `edit_file` is for one clear replacement in one file. +- **Pandoc output validated before probing** (PR #1523). Binary-format + conversions that produce empty or invalid output now surface a clear + error instead of a confusing pandoc stack trace. 
+- **Running turns can be steered and repainted** (PR #1533, #1537). + Composer input during an active turn no longer stalls; the TUI + redraws the transcript as the agent streams. +- **Tasks and Activity Detail are calmer under load.** The Tasks panel now + keeps live/background/recent activity from double-counting the same shell + or RLM work, groups repeated read/search/checklist noise, and keeps + failures, status, command summaries, and durations visible. Ctrl+O now + opens Activity Detail for the selected, live, or most recent meaningful + activity while Alt+V remains the direct tool-detail pager; the idle footer + now advertises that split for the visible activity. +- **npm retry shows timeout hint on first failure** (PR #1538 from **@reidliu41**). + Installations behind slow proxies now see a clear "retrying" message + instead of a silent hang. +- **Issue templates improved** (PR #1525 from **@reidliu41**). Bug and + feature-request templates are clearer and easier for new contributors. + +### Credits + +Thanks to **@reidliu41** (#1525, #1538) and **@h3c-hexin** (#1511) for +community contributions in this release. + ## [0.8.32] - 2026-05-12 A "more useful tools" release. v0.8.31 made the tool surface @@ -3821,7 +3914,8 @@ Welcome — and thank you. 
- Hooks system and config profiles - Example skills and launch assets -[Unreleased]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.32...HEAD +[Unreleased]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.33...HEAD +[0.8.33]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.32...v0.8.33 [0.8.32]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.31...v0.8.32 [0.8.31]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.30...v0.8.31 [0.8.30]: https://github.com/Hmbown/DeepSeek-TUI/compare/v0.8.29...v0.8.30 diff --git a/crates/tui/Cargo.toml b/crates/tui/Cargo.toml index aa24e392..44873c5f 100644 --- a/crates/tui/Cargo.toml +++ b/crates/tui/Cargo.toml @@ -21,8 +21,8 @@ path = "src/main.rs" [dependencies] anyhow = "1.0.100" arboard = "3.4" -deepseek-secrets = { path = "../secrets", version = "0.8.32" } -deepseek-tools = { path = "../tools", version = "0.8.32" } +deepseek-secrets = { path = "../secrets", version = "0.8.33" } +deepseek-tools = { path = "../tools", version = "0.8.33" } schemaui = { version = "0.12.0", default-features = false, optional = true } async-stream = "0.3.6" async-trait = "0.1" diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index 3b5875ff..3fc09f50 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -1752,7 +1752,7 @@ mod tests { ], max_tokens: 1024, system: Some(SystemPrompt::Text( - "Base policy\n\n\nStable project rules\n\n\n## Previous Session Handoff\n\nDynamic handoff" + "Base policy\n\n\nStable project rules\n\n\n## Previous Session Relay\n\nDynamic relay" .to_string(), )), tools: None, @@ -1788,7 +1788,7 @@ mod tests { .and_then(Value::as_str) .expect("warmup system prompt"); assert!(system.contains("Stable project rules")); - assert!(!system.contains("Dynamic handoff")); + assert!(!system.contains("Dynamic relay")); assert!( !wire .iter() diff --git a/crates/tui/src/client/chat.rs b/crates/tui/src/client/chat.rs index c9e51a01..2b3e6f59 100644 --- a/crates/tui/src/client/chat.rs +++ 
b/crates/tui/src/client/chat.rs @@ -685,7 +685,7 @@ fn split_system_layers(content: &str) -> Vec<(String, PromptLayerStability, &str ("Skills", "## Skills"), ("Context management", "## Context Management"), ("Compact template", "## Compact"), - ("Previous session handoff", "## Previous Session Handoff"), + ("Previous session relay", "## Previous Session Relay"), ]; let mut starts: Vec<(usize, &str)> = markers @@ -706,7 +706,7 @@ fn split_system_layers(content: &str) -> Vec<(String, PromptLayerStability, &str for (i, (start, name)) in starts.iter().enumerate() { let end = starts.get(i + 1).map_or(content.len(), |(idx, _)| *idx); - let stability = if *name == "Previous session handoff" { + let stability = if *name == "Previous session relay" { PromptLayerStability::Dynamic } else if is_static_base_layer(name) { PromptLayerStability::Static diff --git a/crates/tui/src/commands/core.rs b/crates/tui/src/commands/core.rs index 6a9088ec..5314e289 100644 --- a/crates/tui/src/commands/core.rs +++ b/crates/tui/src/commands/core.rs @@ -545,6 +545,8 @@ mod tests { #[test] fn model_reset_same_model_keeps_turn_cache_history() { let mut app = create_test_app(); + app.auto_model = false; + app.model = "deepseek-v4-pro".to_string(); app.push_turn_cache_record(TurnCacheRecord { input_tokens: 100, output_tokens: 25, diff --git a/crates/tui/src/commands/mod.rs b/crates/tui/src/commands/mod.rs index a3616450..c8c1740e 100644 --- a/crates/tui/src/commands/mod.rs +++ b/crates/tui/src/commands/mod.rs @@ -32,6 +32,8 @@ mod status; mod task; mod user_commands; +use std::fmt::Write as _; + use crate::localization::{Locale, MessageId, tr}; use crate::tui::app::{App, AppAction}; @@ -204,6 +206,12 @@ pub const COMMANDS: &[CommandInfo] = &[ usage: "/subagents", description_id: MessageId::CmdSubagentsDescription, }, + CommandInfo { + name: "agent", + aliases: &[], + usage: "/agent [N] ", + description_id: MessageId::CmdAgentDescription, + }, CommandInfo { name: "links", aliases: &["dashboard", 
"api"], @@ -295,6 +303,12 @@ pub const COMMANDS: &[CommandInfo] = &[ usage: "/compact", description_id: MessageId::CmdCompactDescription, }, + CommandInfo { + name: "relay", + aliases: &["batonpass", "接力"], + usage: "/relay [focus]", + description_id: MessageId::CmdRelayDescription, + }, CommandInfo { name: "context", aliases: &["ctx"], @@ -482,7 +496,7 @@ pub const COMMANDS: &[CommandInfo] = &[ CommandInfo { name: "rlm", aliases: &["recursive"], - usage: "/rlm ", + usage: "/rlm [N] ", description_id: MessageId::CmdRlmDescription, }, // Debug/cost command @@ -534,6 +548,7 @@ pub fn execute(cmd: &str, app: &mut App) -> CommandResult { "stash" | "park" => stash::stash(app, arg), "hooks" | "hook" => hooks::hooks(app, arg), "subagents" | "agents" => core::subagents(app), + "agent" => agent(app, arg), "links" | "dashboard" | "api" => core::deepseek_links(app), "feedback" => feedback::feedback(app, arg), "home" | "stats" | "overview" => core::home_dashboard(app), @@ -551,6 +566,7 @@ pub fn execute(cmd: &str, app: &mut App) -> CommandResult { "sessions" | "resume" => session::sessions(app, arg), "load" => session::load(app, arg), "compact" => session::compact(app), + "relay" | "batonpass" | "接力" => relay(app, arg), "cycles" => cycle::list_cycles(app), "cycle" => cycle::show_cycle(app, arg), "recall" => cycle::recall_archive(app, arg), @@ -686,49 +702,220 @@ pub use config::{ /// in the REPL as the `PROMPT` variable. The root LLM will only see /// metadata about the REPL state, never the prompt text directly. 
pub fn rlm(app: &mut App, arg: Option<&str>) -> CommandResult { - let prompt = match arg { + let (max_depth, target) = match parse_depth_prefixed_arg(arg, 1) { + Ok(parsed) => parsed, + Err(message) => return CommandResult::error(message), + }; + let target = match target { Some(p) if !p.trim().is_empty() => p.trim().to_string(), _ => { return CommandResult::error( - "Usage: /rlm \n\n\ - Process a prompt using a Recursive Language Model (RLM).\n\ - The prompt is stored in a REPL and the model writes code\n\ - to decompose and process it recursively." + "Usage: /rlm [N] \n\n\ + Opens a persistent RLM context with sub_rlm depth N (0-3, default 1)." .to_string(), ); } }; - // Sanity-check: RLM is most useful for longer prompts. - if prompt.len() < 50 { - return CommandResult::message( - "Tip: RLM is designed for processing LONG prompts (>100 chars). \ - For short queries, just type the message directly." - .to_string(), + let source_arg = if resolves_to_existing_file(app, &target) { + format!(r#"file_path: "{target}""#) + } else { + format!("content: {:?}", target) + }; + let message = format!( + "Open and use a persistent RLM session for this request. Call `rlm_open` with name `slash_rlm` and {source_arg}. Then call `rlm_configure` with `sub_rlm_max_depth: {max_depth}`. Use `rlm_eval` to inspect the context through `peek`, `search`, and `chunk`, and call `finalize(...)` from the REPL when ready. If a `var_handle` is returned, use `handle_read` for bounded slices or projections before answering." + ); + + CommandResult::with_message_and_action( + format!("Opening persistent RLM context at depth {max_depth}..."), + AppAction::SendMessage(message), + ) +} + +/// Open a persistent sub-agent session from a slash command. 
+pub fn agent(_app: &mut App, arg: Option<&str>) -> CommandResult { + let (max_depth, task) = match parse_depth_prefixed_arg(arg, 1) { + Ok(parsed) => parsed, + Err(message) => return CommandResult::error(message), + }; + let task = match task { + Some(task) if !task.trim().is_empty() => task.trim().to_string(), + _ => { + return CommandResult::error( + "Usage: /agent [N] \n\n\ + Opens a persistent sub-agent session with recursive agent depth N (0-3, default 1).", + ); + } + }; + let message = format!( + "Open a persistent sub-agent session for this task. Call `agent_open` with name `slash_agent`, `prompt: {:?}`, and `max_depth: {max_depth}`. Use `agent_eval` to wait for the next terminal/current projection and `handle_read` on the returned transcript_handle if you need more detail. Verify any claimed side effects before reporting success.", + task + ); + CommandResult::with_message_and_action( + format!("Opening persistent sub-agent at depth {max_depth}..."), + AppAction::SendMessage(message), + ) +} + +/// Ask the active model to write a compact relay artifact for the next thread. +/// +/// The visible command is `/relay` (with `/接力` for Chinese users), but the +/// durable file path remains `.deepseek/handoff.md` for compatibility with +/// existing sessions and startup prompt loading. +pub fn relay(app: &mut App, arg: Option<&str>) -> CommandResult { + let focus = arg.map(str::trim).filter(|value| !value.is_empty()); + let message = build_relay_instruction(app, focus); + CommandResult::with_message_and_action( + "Preparing session relay at .deepseek/handoff.md...", + AppAction::SendMessage(message), + ) +} + +fn build_relay_instruction(app: &App, focus: Option<&str>) -> String { + let mut out = String::new(); + let _ = writeln!( + out, + "Create a compact session relay (接力) for a future DeepSeek TUI thread." 
+ ); + let _ = writeln!(out); + let _ = writeln!(out, "Write or update `.deepseek/handoff.md`."); + let _ = writeln!( + out, + "Keep the existing file path for compatibility, but title the artifact `# Session relay`." + ); + let _ = writeln!(out); + let _ = writeln!(out, "Current session snapshot:"); + let _ = writeln!(out, "- Workspace: {}", app.workspace.display()); + let _ = writeln!(out, "- Mode: {}", app.mode.label()); + let _ = writeln!(out, "- Model: {}", app.model_display_label()); + if let Some(focus) = focus { + let _ = writeln!(out, "- Requested relay focus: {focus}"); + } + if let Some(goal) = app.goal.goal_objective.as_deref() { + let _ = writeln!(out, "- Goal: {goal}"); + } + if let Some(budget) = app.goal.goal_token_budget { + let _ = writeln!(out, "- Goal token budget: {budget}"); + } + if app.cycle_count > 0 { + let _ = writeln!(out, "- Cycle count: {}", app.cycle_count); + } + + if let Ok(todos) = app.todos.try_lock() { + let snapshot = todos.snapshot(); + if !snapshot.items.is_empty() { + let _ = writeln!( + out, + "\nWork checklist (primary progress surface, {}% complete):", + snapshot.completion_pct + ); + for item in snapshot.items { + let _ = writeln!( + out, + "- #{} [{}] {}", + item.id, + item.status.as_str(), + item.content + ); + } + } + } else { + let _ = writeln!( + out, + "\nWork checklist: unavailable because the checklist is busy." ); } - let model = app.model.clone(); - let child_model = "deepseek-v4-flash".to_string(); - // Paper experiments use depth=1 (one level of `sub_rlm`); we default to - // depth=2 so the model can recurse twice if it chooses to. 
- let max_depth: u32 = 2; + if let Ok(plan) = app.plan_state.try_lock() { + let snapshot = plan.snapshot(); + if snapshot.explanation.is_some() || !snapshot.items.is_empty() { + let _ = writeln!(out, "\nOptional strategy metadata from update_plan:"); + if let Some(explanation) = snapshot.explanation.as_deref() { + let _ = writeln!(out, "- Explanation: {explanation}"); + } + for item in snapshot.items { + let _ = writeln!(out, "- [{}] {}", plan_status_label(&item.status), item.step); + } + } + } else { + let _ = writeln!( + out, + "\nStrategy metadata: unavailable because plan state is busy." + ); + } - CommandResult::with_message_and_action( - format!( - "Starting RLM turn for {} chars of prompt using {} (child={}, depth={})...", - prompt.len(), - model, - child_model, - max_depth, - ), - AppAction::Rlm { - prompt, - model, - child_model, - max_depth, - }, - ) + let _ = writeln!( + out, + "\nBefore writing, inspect the current transcript context and any live tool evidence you need. Do not invent test results, file changes, blockers, or decisions." + ); + let _ = writeln!( + out, + "\nUse this compact structure:\n\ + # Session relay\n\ + \n\ + ## Goal\n\ + [the user's objective and any explicit constraints]\n\ + \n\ + ## Current work\n\ + [the active Work checklist item, progress, and what is mid-flight]\n\ + \n\ + ## Files and state\n\ + [changed files, important paths, sub-agents/RLM sessions, commands run]\n\ + \n\ + ## Decisions\n\ + [why key choices were made]\n\ + \n\ + ## Verification\n\ + [what passed, what failed, what was not run]\n\ + \n\ + ## Next action\n\ + [one concrete action for the next thread]" + ); + let _ = writeln!( + out, + "\nKeep it under about 900 words unless the session genuinely needs more. After writing, report the path and the single next action." 
+ ); + out +} + +fn plan_status_label(status: &crate::tools::plan::StepStatus) -> &'static str { + match status { + crate::tools::plan::StepStatus::Pending => "pending", + crate::tools::plan::StepStatus::InProgress => "in_progress", + crate::tools::plan::StepStatus::Completed => "completed", + } +} + +fn parse_depth_prefixed_arg( + arg: Option<&str>, + default_depth: u32, +) -> Result<(u32, Option<&str>), String> { + let Some(raw) = arg.map(str::trim).filter(|raw| !raw.is_empty()) else { + return Ok((default_depth, None)); + }; + let mut parts = raw.splitn(2, char::is_whitespace); + let first = parts.next().unwrap_or_default(); + if first.chars().all(|ch| ch.is_ascii_digit()) { + let depth: u32 = first + .parse() + .map_err(|_| "Depth must be an integer from 0 to 3".to_string())?; + if depth > 3 { + return Err("Depth must be between 0 and 3".to_string()); + } + Ok((depth, parts.next().map(str::trim))) + } else { + Ok((default_depth, Some(raw))) + } +} + +fn resolves_to_existing_file(app: &App, input: &str) -> bool { + let path = std::path::Path::new(input); + let candidate = if path.is_absolute() { + path.to_path_buf() + } else { + app.workspace.join(path) + }; + candidate.is_file() } /// Get command info by name or alias @@ -862,6 +1049,8 @@ fn suggest_command_names(input: &str, limit: usize) -> Vec { mod tests { use super::*; use crate::config::Config; + use crate::tools::plan::{PlanItemArg, StepStatus, UpdatePlanArgs}; + use crate::tools::todo::TodoStatus; use crate::tui::app::{App, AppAction, TuiOptions}; use std::ffi::OsString; use std::path::{Path, PathBuf}; @@ -910,6 +1099,101 @@ mod tests { assert_eq!(links.aliases, &["dashboard", "api"]); } + #[test] + fn rlm_slash_command_routes_to_persistent_tool_instruction() { + let mut app = create_test_app(); + let result = execute("/rlm 2 inspect this long corpus", &mut app); + assert!(!result.is_error); + assert!(result.message.as_deref().unwrap_or("").contains("depth 2")); + let 
Some(AppAction::SendMessage(message)) = result.action else { + panic!("expected SendMessage action"); + }; + assert!(message.contains("rlm_open")); + assert!(message.contains("rlm_configure")); + assert!(message.contains("sub_rlm_max_depth: 2")); + } + + #[test] + fn agent_slash_command_routes_to_persistent_tool_instruction() { + let mut app = create_test_app(); + let result = execute("/agent 0 inspect the parser", &mut app); + assert!(!result.is_error); + let Some(AppAction::SendMessage(message)) = result.action else { + panic!("expected SendMessage action"); + }; + assert!(message.contains("agent_open")); + assert!(message.contains("max_depth: 0")); + } + + #[test] + fn relay_slash_command_routes_to_session_relay_instruction() { + let mut app = create_test_app(); + app.goal.goal_objective = Some("Unify the work surface".to_string()); + app.goal.goal_token_budget = Some(12_000); + app.cycle_count = 2; + { + let mut todos = app.todos.try_lock().expect("todo lock"); + todos.add("inspect workspace".to_string(), TodoStatus::Completed); + todos.add("patch relay command".to_string(), TodoStatus::InProgress); + } + { + let mut plan = app.plan_state.try_lock().expect("plan lock"); + plan.update(UpdatePlanArgs { + explanation: Some("RLM-style strategy".to_string()), + plan: vec![PlanItemArg { + step: "keep checklist primary".to_string(), + status: StepStatus::InProgress, + }], + }); + } + + let result = execute("/relay verify install", &mut app); + assert!(!result.is_error); + assert!( + result + .message + .as_deref() + .unwrap_or_default() + .contains(".deepseek/handoff.md") + ); + let Some(AppAction::SendMessage(message)) = result.action else { + panic!("expected SendMessage action"); + }; + assert!(message.contains("session relay")); + assert!(message.contains("接力")); + assert!(message.contains("Write or update `.deepseek/handoff.md`")); + assert!(message.contains("# Session relay")); + assert!(message.contains("Requested relay focus: verify install")); + 
assert!(message.contains("Goal: Unify the work surface")); + assert!(message.contains("Goal token budget: 12000")); + assert!(message.contains("Cycle count: 2")); + assert!(message.contains("Work checklist (primary progress surface, 50% complete)")); + assert!(message.contains("#1 [completed] inspect workspace")); + assert!(message.contains("#2 [in_progress] patch relay command")); + assert!(message.contains("Optional strategy metadata from update_plan")); + assert!(message.contains("Explanation: RLM-style strategy")); + assert!(message.contains("[in_progress] keep checklist primary")); + } + + #[test] + fn relay_command_has_bilingual_aliases() { + let relay = COMMANDS + .iter() + .find(|cmd| cmd.name == "relay") + .expect("relay command should exist"); + assert_eq!(relay.aliases, &["batonpass", "接力"]); + assert!(relay.description_for(Locale::ZhHans).contains("接力")); + assert!(relay.description_for(Locale::ZhHant).contains("接力")); + + let mut app = create_test_app(); + let result = execute("/接力 next hand", &mut app); + assert!(!result.is_error); + let Some(AppAction::SendMessage(message)) = result.action else { + panic!("expected SendMessage action"); + }; + assert!(message.contains("Requested relay focus: next hand")); + } + #[test] fn command_registry_has_unique_names_and_aliases() { let mut names = std::collections::BTreeSet::new(); diff --git a/crates/tui/src/config_ui.rs b/crates/tui/src/config_ui.rs index 12860194..c3cc125e 100644 --- a/crates/tui/src/config_ui.rs +++ b/crates/tui/src/config_ui.rs @@ -199,8 +199,7 @@ pub enum CostCurrencyValue { #[serde(rename_all = "snake_case")] pub enum SidebarFocusValue { Auto, - Plan, - Todos, + Work, Tasks, Agents, Context, @@ -724,8 +723,7 @@ impl SidebarFocusValue { fn as_setting(self) -> &'static str { match self { Self::Auto => "auto", - Self::Plan => "plan", - Self::Todos => "todos", + Self::Work => "work", Self::Tasks => "tasks", Self::Agents => "agents", Self::Context => "context", @@ -842,8 +840,7 @@ impl 
From<&str> for SidebarFocusValue { fn from(value: &str) -> Self { match SidebarFocus::from_setting(value) { SidebarFocus::Auto => Self::Auto, - SidebarFocus::Plan => Self::Plan, - SidebarFocus::Todos => Self::Todos, + SidebarFocus::Work => Self::Work, SidebarFocus::Tasks => Self::Tasks, SidebarFocus::Agents => Self::Agents, SidebarFocus::Context => Self::Context, diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 9c596f42..6b181028 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -850,15 +850,6 @@ impl Engine { Op::CompactContext => { self.handle_manual_compaction().await; } - Op::Rlm { - content, - model, - child_model, - max_depth, - } => { - self.handle_rlm(content, model, child_model, max_depth) - .await; - } Op::EditLastTurn { new_message } => { // #383: /edit — remove the last user+assistant exchange // from the session, then re-send with the new content. @@ -1339,100 +1330,6 @@ impl Engine { .await; } - /// Handle a Recursive Language Model (RLM) query — Algorithm 1 from - /// Zhang et al. (arXiv:2512.24601). - /// - /// The prompt is stored as PROMPT in a REPL variable. The root LLM - /// only sees metadata about the REPL state, never the prompt text - /// directly. The model generates Python code, which is executed by - /// the REPL. When FINAL() is called, the loop ends. 
- async fn handle_rlm( - &mut self, - content: String, - model: String, - child_model: String, - max_depth: u32, - ) { - use crate::rlm::turn::run_rlm_turn; - - let Some(ref client) = self.deepseek_client else { - let err = self - .deepseek_client_error - .as_deref() - .map(|s| s.to_string()) - .unwrap_or_else(|| "API client not configured".to_string()); - let _ = self - .tx_event - .send(Event::error(ErrorEnvelope::fatal_auth(format!( - "RLM error: {err}" - )))) - .await; - return; - }; - - let _ = self - .tx_event - .send(Event::status("RLM turn started".to_string())) - .await; - - let result = run_rlm_turn( - client, - model, - content, - child_model, - self.tx_event.clone(), - max_depth, - ) - .await; - - let has_error = result.error.is_some(); - if let Some(ref err) = result.error { - let _ = self - .tx_event - .send(Event::error(ErrorEnvelope::tool(format!( - "RLM error: {err}" - )))) - .await; - } - - if !result.answer.is_empty() { - // Add the final answer as an assistant message in the session. 
- self.add_session_message(crate::models::Message { - role: "assistant".to_string(), - content: vec![crate::models::ContentBlock::Text { - text: result.answer.clone(), - cache_control: None, - }], - }) - .await; - - let _ = self - .tx_event - .send(Event::MessageDelta { - index: 0, - content: result.answer.clone(), - }) - .await; - let _ = self - .tx_event - .send(Event::MessageComplete { index: 0 }) - .await; - } - - let _ = self - .tx_event - .send(Event::TurnComplete { - usage: result.usage, - status: if has_error { - crate::core::events::TurnOutcomeStatus::Failed - } else { - crate::core::events::TurnOutcomeStatus::Completed - }, - error: result.error, - }) - .await; - } - fn estimated_input_tokens(&self) -> usize { estimate_input_tokens_conservative( &self.session.messages, diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs index 4896cbb2..3ec96626 100644 --- a/crates/tui/src/core/engine/context.rs +++ b/crates/tui/src/core/engine/context.rs @@ -146,6 +146,10 @@ fn summarize_subagent_status(status: &serde_json::Value) -> String { } fn summarize_subagent_snapshot(snapshot: &serde_json::Value, index: usize) -> String { + if let Some(inner) = snapshot.get("snapshot") { + return summarize_subagent_snapshot(inner, index); + } + let Some(obj) = snapshot.as_object() else { return format!( "- item {index}: {}", @@ -202,7 +206,10 @@ fn summarize_subagent_snapshot(snapshot: &serde_json::Value, index: usize) -> St } fn compact_subagent_tool_result_for_context(tool_name: &str, raw: &str) -> Option { - if !matches!(tool_name, "agent_result" | "agent_wait" | "wait") { + if !matches!( + tool_name, + "agent_open" | "agent_eval" | "agent_close" | "agent_result" | "agent_wait" | "wait" + ) { return None; } @@ -217,7 +224,7 @@ fn compact_subagent_tool_result_for_context(tool_name: &str, raw: &str) -> Optio out.push_str( "Child results are self-reports; verify side effects with tools like read_file or list_dir before claiming success.\n", 
); - out.push_str("Use `agent_result` again only if you need the full raw payload.\n"); + out.push_str("Use `agent_eval` for a fresh projection or `handle_read` on `transcript_handle` for bounded transcript slices.\n"); for (idx, snapshot) in snapshots.iter().enumerate() { if idx >= 8 { out.push_str(&format!( diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index d247103f..05f9b259 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -533,6 +533,7 @@ fn turn_tool_registry_builder_keeps_plan_mode_read_only_for_files() { assert!(registry.contains("update_plan")); assert!(registry.contains("task_list")); assert!(registry.contains("task_read")); + assert!(registry.contains("handle_read")); assert!(registry.contains("recall_archive")); let plan_state_tools = [ @@ -827,7 +828,7 @@ fn subagent_results_are_summarized_before_parent_context_insertion() { .to_string(), ); - let context = compact_tool_result_for_context("deepseek-v4-pro", "agent_result", &output); + let context = compact_tool_result_for_context("deepseek-v4-pro", "agent_eval", &output); assert!(context.contains("[sub-agent result summarized for parent context]")); assert!(context.contains("agent_1234abcd (explore) status=Completed")); @@ -837,6 +838,7 @@ fn subagent_results_are_summarized_before_parent_context_insertion() { assert!(context.contains("self-report")); assert!(context.contains("verify side effects")); assert!(context.contains("read_file") && context.contains("list_dir")); + assert!(context.contains("handle_read")); } #[test] diff --git a/crates/tui/src/core/engine/tool_catalog.rs b/crates/tui/src/core/engine/tool_catalog.rs index 25530993..71e7865c 100644 --- a/crates/tui/src/core/engine/tool_catalog.rs +++ b/crates/tui/src/core/engine/tool_catalog.rs @@ -58,7 +58,11 @@ pub(super) fn should_default_defer_tool(name: &str, mode: AppMode) -> bool { | "grep_files" | "file_search" | "diagnostics" - | "rlm" + | "rlm_open" + | 
"rlm_eval" + | "rlm_configure" + | "rlm_close" + | "handle_read" | "recall_archive" | "notify" | MULTI_TOOL_PARALLEL_NAME diff --git a/crates/tui/src/core/engine/tool_setup.rs b/crates/tui/src/core/engine/tool_setup.rs index cc1bfdf0..2354d6a8 100644 --- a/crates/tui/src/core/engine/tool_setup.rs +++ b/crates/tui/src/core/engine/tool_setup.rs @@ -48,6 +48,7 @@ impl Engine { .with_diagnostics_tool() .with_skill_tools() .with_validation_tools() + .with_handle_tools() .with_runtime_read_only_task_tools() .with_todo_tool(todo_list) .with_plan_tool(plan_state) diff --git a/crates/tui/src/core/ops.rs b/crates/tui/src/core/ops.rs index b77385be..77dc8fcc 100644 --- a/crates/tui/src/core/ops.rs +++ b/crates/tui/src/core/ops.rs @@ -75,21 +75,6 @@ pub enum Op { /// Run context compaction immediately. CompactContext, - /// Run a Recursive Language Model (RLM) turn per Algorithm 1 of - /// Zhang et al. (arXiv:2512.24601). The prompt is stored in the REPL - /// as `context`; the root LLM only sees metadata. - Rlm { - /// The user's prompt — stored in REPL, NOT in the LLM context. - content: String, - /// The model to use for root LLM calls. - model: String, - /// The model to use for sub-LLM (llm_query) calls. - child_model: String, - /// Recursion budget for `sub_rlm()` calls. Paper experiments use - /// depth=1; defaults set by the `/rlm` command. - max_depth: u32, - }, - /// Edit the last user message: remove the last user+assistant exchange /// from the session, then re-send with the new content. #[allow(dead_code)] diff --git a/crates/tui/src/cycle_manager.rs b/crates/tui/src/cycle_manager.rs index 8d31939c..c4d5b4c7 100644 --- a/crates/tui/src/cycle_manager.rs +++ b/crates/tui/src/cycle_manager.rs @@ -10,8 +10,8 @@ //! summary as if it were verbatim and confabulates around the gaps. //! //! Checkpoint-restart fixes this by giving every cycle a *homogeneous* fresh -//! context: original system prompt, structured state (todos / plan / working -//! 
set / sub-agent handles), and a model-curated free-form briefing of at +//! context: original system prompt, structured work state (checklist / +//! strategy / working set / sub-agent handles), and a model-curated free-form briefing of at //! most ~3,000 tokens. The previous cycle is archived to disk in JSONL form //! so a future `recall_archive` tool (issue #127) can search it on demand. //! @@ -271,8 +271,27 @@ impl StructuredState { out.push_str(&format!("- Cwd: `{}`\n", cwd.display())); } + if self.todo_snapshot.is_some() || self.plan_snapshot.is_some() { + out.push_str("\n### Work\n"); + } + + if let Some(todos) = self.todo_snapshot.as_ref() { + out.push_str(&format!( + "\nChecklist ({}% complete)\n", + todos.completion_pct + )); + for item in &todos.items { + let marker = match item.status { + crate::tools::todo::TodoStatus::Pending => "[ ]", + crate::tools::todo::TodoStatus::InProgress => "[~]", + crate::tools::todo::TodoStatus::Completed => "[x]", + }; + out.push_str(&format!("- {marker} {}\n", item.content)); + } + } + if let Some(plan) = self.plan_snapshot.as_ref() { - out.push_str("\n### Plan\n"); + out.push_str("\nStrategy\n"); if let Some(explanation) = plan.explanation.as_ref() { out.push_str(&format!("{explanation}\n\n")); } @@ -286,21 +305,6 @@ impl StructuredState { } } - if let Some(todos) = self.todo_snapshot.as_ref() { - out.push_str(&format!( - "\n### Todos ({}% complete)\n", - todos.completion_pct - )); - for item in &todos.items { - let marker = match item.status { - crate::tools::todo::TodoStatus::Pending => "[ ]", - crate::tools::todo::TodoStatus::InProgress => "[~]", - crate::tools::todo::TodoStatus::Completed => "[x]", - }; - out.push_str(&format!("- {marker} {}\n", item.content)); - } - } - if !self.subagent_snapshots.is_empty() { out.push_str("\n### Open Sub-Agents\n"); for s in &self.subagent_snapshots { @@ -976,6 +980,41 @@ mod tests { assert!(block.contains("Workspace: `/tmp/ws`")); } + #[test] + fn 
structured_state_to_system_block_unifies_work_state() { + let state = StructuredState { + mode_label: "agent".to_string(), + workspace: PathBuf::from("/tmp/ws"), + cwd: None, + working_set_summary: None, + todo_snapshot: Some(TodoListSnapshot { + items: vec![crate::tools::todo::TodoItem { + id: 1, + content: "Run focused tests".to_string(), + status: crate::tools::todo::TodoStatus::InProgress, + }], + completion_pct: 0, + in_progress_id: Some(1), + }), + plan_snapshot: Some(PlanSnapshot { + explanation: Some("Keep sidebar state unified".to_string()), + items: vec![crate::tools::plan::PlanItemArg { + step: "Update prompts".to_string(), + status: crate::tools::plan::StepStatus::Pending, + }], + }), + subagent_snapshots: Vec::new(), + }; + + let block = state.to_system_block().expect("renders"); + + assert!(block.contains("### Work")); + assert!(block.contains("Checklist (0% complete)")); + assert!(block.contains("Strategy")); + assert!(!block.contains("### Plan")); + assert!(!block.contains("### Todos")); + } + #[test] fn archive_cycle_writes_jsonl_with_header_and_messages() { let dir = tempdir().expect("tempdir"); diff --git a/crates/tui/src/deepseek_theme.rs b/crates/tui/src/deepseek_theme.rs index 13f96cd0..bf31afef 100644 --- a/crates/tui/src/deepseek_theme.rs +++ b/crates/tui/src/deepseek_theme.rs @@ -67,7 +67,7 @@ impl Theme { section_bg: palette::DEEPSEEK_INK, section_title_color: palette::DEEPSEEK_BLUE, // Horizontal padding only. `Padding::uniform(1)` ate two rows of - // each sidebar panel — for compact terminals where Plan/Todos/Tasks + // each sidebar panel — for compact terminals where Work/Tasks/Agents // get ~3 rows total via the 25% layout split, that left zero rows // for content (#63 follow-up: panels rendered as empty boxes even // when "No todos" / "No active plan" should have shown). 
diff --git a/crates/tui/src/handoff.rs b/crates/tui/src/handoff.rs index 6c453ad0..ae64fcb9 100644 --- a/crates/tui/src/handoff.rs +++ b/crates/tui/src/handoff.rs @@ -1,13 +1,13 @@ -// Used by the deferred context-limit handoff feature (#667). The implementation +// Used by the deferred context-limit relay feature (#667). The implementation // path is staged but not yet wired from the engine; suppress dead-code warnings // rather than delete the table until the follow-up feature consumes it. #[allow(dead_code)] pub const THRESHOLDS: [(f32, &str); 3] = [ ( 0.9, - "Context at 90%: stop and write handoff to .deepseek/handoff.md now", + "Context at 90%: stop and write relay to .deepseek/handoff.md now", ), - (0.8, "Context at 80%: draft handoff to .deepseek/handoff.md"), + (0.8, "Context at 80%: draft relay to .deepseek/handoff.md"), (0.7, "Context at 70%: consider wrapping current sub-task"), ]; #[allow(dead_code)] diff --git a/crates/tui/src/localization.rs b/crates/tui/src/localization.rs index 941dd060..25eed32e 100644 --- a/crates/tui/src/localization.rs +++ b/crates/tui/src/localization.rs @@ -256,6 +256,7 @@ pub enum MessageId { CmdHelpDescription, CmdHomeDescription, CmdHooksDescription, + CmdAgentDescription, CmdGoalDescription, CmdInitDescription, CmdJobsDescription, @@ -273,6 +274,7 @@ pub enum MessageId { CmdProviderDescription, CmdQueueDescription, CmdRecallDescription, + CmdRelayDescription, CmdRenameDescription, CmdRestoreDescription, CmdRetryDescription, @@ -485,6 +487,7 @@ pub const ALL_MESSAGE_IDS: &[MessageId] = &[ MessageId::CmdHelpDescription, MessageId::CmdHomeDescription, MessageId::CmdHooksDescription, + MessageId::CmdAgentDescription, MessageId::CmdInitDescription, MessageId::CmdJobsDescription, MessageId::CmdLinksDescription, @@ -500,6 +503,7 @@ pub const ALL_MESSAGE_IDS: &[MessageId] = &[ MessageId::CmdProviderDescription, MessageId::CmdQueueDescription, MessageId::CmdRecallDescription, + MessageId::CmdRelayDescription, 
MessageId::CmdRenameDescription, MessageId::CmdRestoreDescription, MessageId::CmdRetryDescription, @@ -898,6 +902,9 @@ fn english(id: MessageId) -> &'static str { MessageId::CmdHelpDescription => "Show help information", MessageId::CmdHomeDescription => "Show home dashboard with stats and quick actions", MessageId::CmdHooksDescription => "List configured lifecycle hooks (read-only)", + MessageId::CmdAgentDescription => { + "Open a persistent sub-agent session: /agent [0-3] " + } MessageId::CmdGoalDescription => "Set a session goal with optional token budget", MessageId::CmdInitDescription => "Generate AGENTS.md for project", MessageId::CmdLspDescription => "Toggle LSP diagnostics on or off", @@ -921,15 +928,14 @@ fn english(id: MessageId) -> &'static str { } MessageId::CmdQueueDescription => "View or edit queued messages", MessageId::CmdRecallDescription => "Search prior cycle archives (BM25 over message text)", + MessageId::CmdRelayDescription => "Create a session relay (接力) for a fresh thread", MessageId::CmdRenameDescription => "Rename the current session", MessageId::CmdRestoreDescription => { "Roll back the workspace to a prior pre/post-turn snapshot. With no arg, lists recent snapshots." } MessageId::CmdRetryDescription => "Retry the last request", MessageId::CmdReviewDescription => "Run a structured code review on a file, diff, or PR", - MessageId::CmdRlmDescription => { - "Recursive Language Model (RLM) turn — store the prompt in a Python REPL and let the model write code to process it, with `llm_query()` / `sub_rlm()` for sub-LLM calls." 
- } + MessageId::CmdRlmDescription => "Open a persistent RLM context: /rlm [0-3] ", MessageId::CmdSaveDescription => "Save session to file", MessageId::CmdSessionsDescription => "Open session picker", MessageId::CmdSettingsDescription => "Show persistent settings", @@ -1055,7 +1061,7 @@ fn english(id: MessageId) -> &'static str { "Open details for the selected tool or message (when input is empty)" } MessageId::KbToolDetailsPager => "Open tool-details pager", - MessageId::KbThinkingPager => "Open thinking pager", + MessageId::KbThinkingPager => "Open Activity Detail", MessageId::KbLiveTranscript => "Open live transcript overlay (sticky-tail auto-scroll)", MessageId::KbBacktrackMessage => { "Backtrack to a previous user message (Left/Right step, Enter to rewind)" @@ -1065,7 +1071,7 @@ fn english(id: MessageId) -> &'static str { } MessageId::KbJumpPlanAgentYolo => "Jump directly to Plan / Agent / YOLO mode", MessageId::KbAltJumpPlanAgentYolo => "Alternative jump to Plan / Agent / YOLO mode", - MessageId::KbFocusSidebar => "Focus Plan / Todos / Tasks / Agents / Auto sidebar", + MessageId::KbFocusSidebar => "Focus Work / Tasks / Agents / Context / Auto sidebar", MessageId::KbTogglePlanAgent => "Toggle between Plan and Agent modes", MessageId::KbSessionPicker => "Open the session picker", MessageId::KbPasteAttach => "Paste text or attach a clipboard image", @@ -1185,6 +1191,7 @@ fn translation(locale: Locale, id: MessageId) -> Option<&'static str> { fn traditional_chinese(id: MessageId) -> Option<&'static str> { Some(match id { + MessageId::CmdRelayDescription => "為新執行緒建立會話接力摘要", MessageId::CmdTranslateDescription => "切換輸出翻譯為目前系統語言的開關狀態", MessageId::CmdTranslateOff => "輸出翻譯已關閉(顯示原始模型輸出)", MessageId::CmdTranslateOn => "輸出翻譯已開啟:模型回覆將以繁體中文顯示", @@ -1268,6 +1275,9 @@ fn japanese(id: MessageId) -> Option<&'static str> { MessageId::CmdHooksDescription => { "設定済みのライフサイクルフックを一覧表示(読み取り専用)" } + MessageId::CmdAgentDescription => { + "永続サブエージェントセッションを開く: /agent [0-3] " + } 
MessageId::CmdGoalDescription => "トークンバジェット付きのセッション目標を設定", MessageId::CmdInitDescription => "プロジェクト用に AGENTS.md を生成", MessageId::CmdLspDescription => "LSP 診断のオン・オフを切り替え", @@ -1293,15 +1303,14 @@ fn japanese(id: MessageId) -> Option<&'static str> { MessageId::CmdRecallDescription => { "過去のサイクルアーカイブを検索(メッセージ本文への BM25 検索)" } + MessageId::CmdRelayDescription => "新しいスレッド用のセッションリレー(接力)を作成", MessageId::CmdRenameDescription => "現在のセッションの名前を変更", MessageId::CmdRestoreDescription => { "ワークスペースを以前のターン前/後スナップショットへロールバック。引数なしで最近のスナップショットを一覧表示。" } MessageId::CmdRetryDescription => "直前のリクエストを再試行", MessageId::CmdReviewDescription => "ファイル・diff・PR に対して構造化コードレビューを実行", - MessageId::CmdRlmDescription => { - "再帰言語モデル(RLM)ターン — プロンプトを Python REPL に格納し、モデルが処理コードを記述。サブ LLM 呼び出しは `llm_query()` / `sub_rlm()`。" - } + MessageId::CmdRlmDescription => "永続 RLM コンテキストを開く: /rlm [0-3] ", MessageId::CmdSaveDescription => "セッションをファイルに保存", MessageId::CmdSessionsDescription => "セッションピッカーを開く", MessageId::CmdSettingsDescription => "永続化された設定を表示", @@ -1424,7 +1433,7 @@ fn japanese(id: MessageId) -> Option<&'static str> { "選択中のツールまたはメッセージの詳細を開く(入力が空の時)" } MessageId::KbToolDetailsPager => "ツール詳細のページャーを開く", - MessageId::KbThinkingPager => "思考内容のページャーを開く", + MessageId::KbThinkingPager => "Activity Detail を開く", MessageId::KbLiveTranscript => "ライブ会話履歴オーバーレイを開く(自動追尾スクロール)", MessageId::KbBacktrackMessage => { "前のユーザーメッセージに戻る(左右でステップ、Enter で巻き戻し)" @@ -1434,7 +1443,9 @@ fn japanese(id: MessageId) -> Option<&'static str> { } MessageId::KbJumpPlanAgentYolo => "Plan / Agent / YOLO モードに直接ジャンプ", MessageId::KbAltJumpPlanAgentYolo => "Plan / Agent / YOLO モードへの代替ジャンプ", - MessageId::KbFocusSidebar => "Plan / Todos / Tasks / Agents / Auto サイドバーにフォーカス", + MessageId::KbFocusSidebar => { + "Work / Tasks / Agents / Context / Auto サイドバーにフォーカス" + } MessageId::KbTogglePlanAgent => "Plan モードと Agent モードを切り替え", MessageId::KbSessionPicker => "セッションピッカーを開く", MessageId::KbPasteAttach => "テキストを貼り付けまたはクリップボード画像を添付", @@ -1605,6 +1616,7 @@ fn 
chinese_simplified(id: MessageId) -> Option<&'static str> { MessageId::CmdHelpDescription => "显示帮助信息", MessageId::CmdHomeDescription => "显示主页面板,含统计与快捷操作", MessageId::CmdHooksDescription => "列出已配置的生命周期钩子(只读)", + MessageId::CmdAgentDescription => "打开持久子代理会话:/agent [0-3] ", MessageId::CmdGoalDescription => "设置带有可选令牌预算的会话目标", MessageId::CmdInitDescription => "为项目生成 AGENTS.md", MessageId::CmdLspDescription => "切换 LSP 诊断的开启或关闭", @@ -1626,15 +1638,14 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> { } MessageId::CmdQueueDescription => "查看或编辑已排队的消息", MessageId::CmdRecallDescription => "搜索此前的循环归档(基于消息文本的 BM25 检索)", + MessageId::CmdRelayDescription => "为新线程创建会话接力摘要", MessageId::CmdRenameDescription => "重命名当前会话", MessageId::CmdRestoreDescription => { "将工作区回滚到此前的轮次前/后快照。不带参数时列出最近的快照。" } MessageId::CmdRetryDescription => "重试上一次请求", MessageId::CmdReviewDescription => "对文件、diff 或 PR 进行结构化代码审查", - MessageId::CmdRlmDescription => { - "递归语言模型(RLM)轮次 —— 将提示词存入 Python REPL,让模型编写代码进行处理;可用 `llm_query()` / `sub_rlm()` 调用子 LLM。" - } + MessageId::CmdRlmDescription => "打开持久 RLM 上下文:/rlm [0-3] ", MessageId::CmdSaveDescription => "将会话保存到文件", MessageId::CmdSessionsDescription => "打开会话选择器", MessageId::CmdSettingsDescription => "显示持久化设置", @@ -1741,7 +1752,7 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> { MessageId::KbLastMessagePager => "打开最后一条消息的分页器(输入框为空时)", MessageId::KbSelectedDetails => "打开选中工具或消息的详情(输入框为空时)", MessageId::KbToolDetailsPager => "打开工具详情分页器", - MessageId::KbThinkingPager => "打开思考内容分页器", + MessageId::KbThinkingPager => "打开 Activity Detail", MessageId::KbLiveTranscript => "打开实时对话覆盖层(自动滚动尾随)", MessageId::KbBacktrackMessage => "回退到之前的用户消息(左右键步进,Enter 回退)", MessageId::KbCompleteCycleModes => { @@ -1749,7 +1760,7 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> { } MessageId::KbJumpPlanAgentYolo => "直接跳转到 Plan / Agent / YOLO 模式", MessageId::KbAltJumpPlanAgentYolo => "替代快捷键跳转到 Plan / Agent / YOLO 模式", - MessageId::KbFocusSidebar => "聚焦 
Plan / 待办 / 任务 / 代理 / 代理 / 自动侧边栏", + MessageId::KbFocusSidebar => "聚焦 Work / 任务 / 代理 / Context / 自动侧边栏", MessageId::KbTogglePlanAgent => "在 Plan 和 Agent 模式之间切换", MessageId::KbSessionPicker => "打开会话选择器", MessageId::KbPasteAttach => "粘贴文本或附加剪贴板图片", @@ -1918,6 +1929,9 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { MessageId::CmdHooksDescription => { "Listar hooks de ciclo de vida configurados (somente leitura)" } + MessageId::CmdAgentDescription => { + "Abrir uma sessão persistente de sub-agente: /agent [0-3] " + } MessageId::CmdGoalDescription => { "Definir uma meta de sessão com orçamento de tokens opcional" } @@ -1947,6 +1961,7 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { MessageId::CmdRecallDescription => { "Buscar arquivos de ciclos anteriores (BM25 sobre o texto das mensagens)" } + MessageId::CmdRelayDescription => "Criar um relay da sessão para um novo thread", MessageId::CmdRenameDescription => "Renomear a sessão atual", MessageId::CmdRestoreDescription => { "Reverter o workspace a um snapshot pré/pós-turno anterior. Sem argumento, lista os snapshots recentes." @@ -1956,7 +1971,7 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { "Executar uma revisão de código estruturada em um arquivo, diff ou PR" } MessageId::CmdRlmDescription => { - "Turno do Recursive Language Model (RLM) — guarda o prompt em um REPL Python e deixa o modelo escrever o código que o processa; use `llm_query()` / `sub_rlm()` para chamadas a sub-LLMs." 
+ "Abrir um contexto RLM persistente: /rlm [0-3] " } MessageId::CmdSaveDescription => "Salvar a sessão em arquivo", MessageId::CmdSessionsDescription => "Abrir o seletor de sessões", @@ -2090,7 +2105,7 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { "Abrir detalhes da ferramenta ou mensagem selecionada (quando entrada vazia)" } MessageId::KbToolDetailsPager => "Abrir paginador de detalhes da ferramenta", - MessageId::KbThinkingPager => "Abrir paginador de raciocínio", + MessageId::KbThinkingPager => "Abrir Activity Detail", MessageId::KbLiveTranscript => "Abrir sobreposição de transcrição ao vivo (auto-scroll)", MessageId::KbBacktrackMessage => { "Retroceder para mensagem anterior do usuário (esquerda/direita, Enter para rebobinar)" @@ -2100,7 +2115,7 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { } MessageId::KbJumpPlanAgentYolo => "Pular direto para modo Plan / Agent / YOLO", MessageId::KbAltJumpPlanAgentYolo => "Salto alternativo para modo Plan / Agent / YOLO", - MessageId::KbFocusSidebar => "Focar barra lateral Plan / Todos / Tasks / Agents / Auto", + MessageId::KbFocusSidebar => "Focar barra lateral Work / Tasks / Agents / Context / Auto", MessageId::KbTogglePlanAgent => "Alternar entre modos Plan e Agent", MessageId::KbSessionPicker => "Abrir seletor de sessões", MessageId::KbPasteAttach => "Colar texto ou anexar imagem da área de transferência", diff --git a/crates/tui/src/mcp.rs b/crates/tui/src/mcp.rs index 874016b8..0a09a8bc 100644 --- a/crates/tui/src/mcp.rs +++ b/crates/tui/src/mcp.rs @@ -3233,6 +3233,7 @@ mod tests { } #[tokio::test] + #[ignore = "flaky: requires a live TCP listener and is sensitive to port allocation races"] async fn mcp_connection_supports_streamable_http_event_stream_responses() { use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::{TcpListener, TcpStream}; diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs index 4028aa3d..ae0f9d11 100644 --- a/crates/tui/src/prompts.rs 
+++ b/crates/tui/src/prompts.rs @@ -30,7 +30,7 @@ pub struct PromptSessionContext<'a> { pub translation_enabled: bool, } -/// Conventional location for the structured session-handoff artifact (#32). +/// Conventional location for the structured session relay artifact (#32). /// A previous session writes it on exit / `/compact`; the next session reads /// it back on startup and prepends it to the system prompt so a fresh agent /// doesn't have to re-discover open blockers from scratch. @@ -157,7 +157,7 @@ fn render_instructions_block(paths: &[PathBuf]) -> Option { } } -/// Read the workspace-local handoff artifact, if present, and format it as a +/// Read the workspace-local relay artifact, if present, and format it as a /// system-prompt block. Returns `None` when the file is absent or empty so /// callers can keep the default-uncluttered prompt for fresh workspaces. fn load_handoff_block(workspace: &Path) -> Option { @@ -168,7 +168,7 @@ fn load_handoff_block(workspace: &Path) -> Option { return None; } Some(format!( - "## Previous Session Handoff\n\nThe previous session in this workspace left a handoff at `{}`. Consider it the first artifact to read on this turn — open blockers, in-flight changes, and recent decisions live there. Update or rewrite it before exiting if state changes materially.\n\n{}", + "## Previous Session Relay\n\nThe previous session in this workspace left a relay artifact at `{}`. Consider it the first artifact to read on this turn — open blockers, in-flight changes, and recent decisions live there. 
Update or rewrite it before exiting if state changes materially.\n\n{}", HANDOFF_RELATIVE_PATH, trimmed )) } @@ -354,7 +354,7 @@ pub const AUTO_APPROVAL: &str = include_str!("prompts/approvals/auto.md"); pub const SUGGEST_APPROVAL: &str = include_str!("prompts/approvals/suggest.md"); pub const NEVER_APPROVAL: &str = include_str!("prompts/approvals/never.md"); -/// Compaction handoff template — written into the system prompt so the +/// Compaction relay template — written into the system prompt so the /// model knows the format to use when writing `.deepseek/handoff.md`. pub const COMPACT_TEMPLATE: &str = include_str!("prompts/compact.md"); @@ -514,11 +514,11 @@ pub fn system_prompt_for_mode_with_context( /// 2. project context / fallback (workspace-static) /// 3. skills block (skills-dir-static) /// 4. `## Context Management` (compile-time constant, Agent/Yolo only) -/// 5. compaction handoff template (compile-time constant) -/// 6. handoff block — file-backed; rewritten by `/compact` and on exit +/// 5. compaction relay template (compile-time constant) +/// 6. relay block — file-backed; rewritten by `/compact` and on exit /// /// Anything appended after a volatile block forfeits the cache for the rest -/// of the request. New blocks belong above the handoff boundary unless they +/// of the request. New blocks belong above the relay boundary unless they /// themselves are turn-volatile. Working-set metadata is now injected into the /// latest user message as per-turn metadata instead of this system prompt. pub fn system_prompt_for_mode_with_context_and_skills( @@ -668,7 +668,7 @@ pub fn system_prompt_for_mode_with_context_skills_session_and_approval( ); } - // 5. Compaction handoff template — so the model knows the format to use + // 5. Compaction relay template — so the model knows the format to use // when writing `.deepseek/handoff.md` on exit / `/compact`. 
full_prompt.push_str("\n\n"); full_prompt.push_str(COMPACT_TEMPLATE); @@ -694,7 +694,7 @@ pub fn system_prompt_for_mode_with_context_skills_session_and_approval( // 6b. User memory block (#489). Placed below the volatile boundary // because memory entries are editable mid-session via `/memory` or // `# foo` quick-add. When they change, they only invalidate the - // trailing handoff block — the static prefix above stays cached. + // trailing relay block — the static prefix above stays cached. if let Some(memory_block) = session_context.user_memory_block && !memory_block.trim().is_empty() { @@ -713,7 +713,7 @@ pub fn system_prompt_for_mode_with_context_skills_session_and_approval( ); } - // 7. Previous-session handoff (file-backed, rewritten by `/compact`). + // 7. Previous-session relay (file-backed, rewritten by `/compact`). if let Some(handoff_block) = load_handoff_block(workspace) { full_prompt = format!("{full_prompt}\n\n{handoff_block}"); } @@ -775,9 +775,9 @@ mod tests { use super::*; use tempfile::tempdir; - /// Discriminator unique to the injected handoff block (not present in the + /// Discriminator unique to the injected relay block (not present in the /// agent prompt's own discussion of the convention). 
- const HANDOFF_BLOCK_MARKER: &str = "left a handoff at `.deepseek/handoff.md`"; + const HANDOFF_BLOCK_MARKER: &str = "left a relay artifact at `.deepseek/handoff.md`"; #[test] fn render_environment_block_lists_supplied_locale_and_workspace() { @@ -1120,7 +1120,7 @@ mod tests { assert!(prompt.contains("")); assert!( prompt.find("").expect("pack") - < prompt.find("## Previous Session Handoff").expect("handoff") + < prompt.find("## Previous Session Relay").expect("relay") ); } @@ -1132,7 +1132,7 @@ mod tests { std::fs::create_dir_all(&handoff_dir).unwrap(); std::fs::write( handoff_dir.join("handoff.md"), - "# Session handoff — prior\n\n## Active task\nFinish #32.\n\n## Open blockers\n- [ ] write the basic version\n", + "# Session relay — prior\n\n## Active task\nFinish #32.\n\n## Open blockers\n- [ ] write the basic version\n", ) .unwrap(); @@ -1278,7 +1278,7 @@ mod tests { SystemPrompt::Text(text) => text, SystemPrompt::Blocks(_) => panic!("expected text system prompt"), }; - assert!(prompt.contains("## Compaction Handoff")); + assert!(prompt.contains("## Compaction Relay")); // #429: structured Markdown template. Goal/Constraints/Progress // (Done/InProgress/Blocked)/Key Decisions/Next step. 
assert!(prompt.contains("### Goal")); @@ -1313,7 +1313,7 @@ mod tests { }; let goal_pos = prompt.find("").expect("goal block"); - let compact_pos = prompt.find("## Compaction Handoff").expect("compact block"); + let compact_pos = prompt.find("## Compaction Relay").expect("compact block"); assert!(prompt.contains("Fix transcript corruption")); // Session goal is volatile content — it lives below the @@ -1353,7 +1353,7 @@ mod tests { fn tool_selection_guide_avoids_defensive_tool_suppression() { let prompt = compose_prompt(AppMode::Agent, Personality::Calm); assert!(prompt.contains("Tool Selection Guide")); - assert!(prompt.contains("Use `agent_result`")); + assert!(prompt.contains("Use `agent_eval`")); assert!( !prompt.contains("When NOT to use certain tools"), "the system prompt should steer tool choice without training the model to avoid available tools" @@ -1432,6 +1432,62 @@ mod tests { ); } + #[test] + fn workspace_orientation_guidance_present() { + let prompt = compose_prompt(AppMode::Agent, Personality::Calm); + assert!(prompt.contains("Workspace Orientation")); + assert!(prompt.contains("canonical project root")); + assert!(prompt.contains("AGENTS.md")); + assert!(prompt.contains("explore` / `explorer")); + } + + #[test] + fn prompt_uses_persistent_agent_and_rlm_surface() { + let prompt = compose_prompt(AppMode::Agent, Personality::Calm); + for tool in [ + "agent_open", + "agent_eval", + "agent_close", + "rlm_open", + "rlm_eval", + "rlm_configure", + "rlm_close", + "handle_read", + ] { + assert!( + prompt.contains(tool), + "prompt should mention new persistent tool `{tool}`" + ); + } + for retired in [ + "agent_spawn", + "agent_wait", + "agent_result", + "agent_send_input", + "agent_assign", + "agent_resume", + "agent_list", + "spawn_agent", + "delegate_to_agent", + "send_input", + "close_agent", + ] { + assert!( + !prompt.contains(retired), + "prompt should not advertise retired sub-agent tool `{retired}`" + ); + } + } + + #[test] + fn 
prompt_documents_fork_context_prefix_cache_contract() { + let prompt = compose_prompt(AppMode::Agent, Personality::Calm); + assert!(prompt.contains("fork_context: true")); + assert!(prompt.contains("byte-identical")); + assert!(prompt.contains("DeepSeek prefix-cache reuse")); + assert!(prompt.contains("Fresh sessions are the default")); + } + #[test] fn subagent_done_sentinel_section_present() { let prompt = compose_prompt(AppMode::Agent, Personality::Calm); @@ -1541,7 +1597,7 @@ mod tests { #[test] fn system_prompt_with_handoff_file_is_byte_stable_when_file_is_unchanged() { // If `.deepseek/handoff.md` hasn't moved between two builds, the - // rendered prompt must produce identical bytes. The handoff block + // rendered prompt must produce identical bytes. The relay block // lands below the static boundary in // `system_prompt_for_mode_with_context_and_skills`. let tmp = tempdir().expect("tempdir"); @@ -1550,7 +1606,7 @@ mod tests { std::fs::create_dir_all(&handoff_dir).unwrap(); std::fs::write( handoff_dir.join("handoff.md"), - "# Session handoff\n\n## Active task\nFinish #280.\n\n## Open blockers\n- [ ] none\n", + "# Session relay\n\n## Active task\nFinish #280.\n\n## Open blockers\n- [ ] none\n", ) .unwrap(); @@ -1567,15 +1623,15 @@ mod tests { &a, &b, ); - assert!(a.contains(HANDOFF_BLOCK_MARKER), "handoff must be embedded"); - assert!(a.contains("Finish #280."), "handoff body must be present"); + assert!(a.contains(HANDOFF_BLOCK_MARKER), "relay must be embedded"); + assert!(a.contains("Finish #280."), "relay body must be present"); } #[test] fn handoff_appears_after_static_blocks_without_working_set() { - // Cache-prefix invariant: the handoff block must come after static - // `## Context Management` and the compaction handoff template - // (`## Compaction Handoff`). 
Working-set metadata is per-turn user + // Cache-prefix invariant: the relay block must come after static + // `## Context Management` and the compaction relay template + // (`## Compaction Relay`). Working-set metadata is per-turn user // metadata now, not a system-prompt tail block. let tmp = tempdir().expect("tempdir"); let workspace = tmp.path(); @@ -1594,11 +1650,11 @@ mod tests { .find("## Context Management") .expect("Context Management section present in Agent mode"); let compact_pos = prompt - .find("## Compaction Handoff") - .expect("compaction handoff template present"); + .find("## Compaction Relay") + .expect("compaction relay template present"); let handoff_pos = prompt .find(HANDOFF_BLOCK_MARKER) - .expect("handoff block present when fixture file exists"); + .expect("relay block present when fixture file exists"); assert!( !prompt.contains("## Repo Working Set"), "working-set summary must stay out of the system prompt" @@ -1606,11 +1662,11 @@ mod tests { assert!( context_pos < handoff_pos, - "## Context Management must precede the handoff block" + "## Context Management must precede the relay block" ); assert!( compact_pos < handoff_pos, - "## Compaction Handoff must precede the handoff block" + "## Compaction Relay must precede the relay block" ); } diff --git a/crates/tui/src/prompts/agent.txt b/crates/tui/src/prompts/agent.txt index 5ce82e18..fba4977c 100644 --- a/crates/tui/src/prompts/agent.txt +++ b/crates/tui/src/prompts/agent.txt @@ -1,7 +1,7 @@ ## Mode: agent -Read-only tools (reads, searches, `rlm`, agent status queries, git inspection) run silently. -Any write, patch, shell execution, sub-agent spawn, or CSV batch operation will ask for approval first. +Read-only tools (reads, searches, persistent RLM session tools, agent status queries, git inspection) run silently. +Any write, patch, shell execution, sub-agent session open, or CSV batch operation will ask for approval first. 
Before requesting approval for writes, lay out your work with `checklist_write` so the user can see what you intend to do and approve with context. Complex changes should also get an `update_plan` first. @@ -9,7 +9,7 @@ Decomposition builds trust — a clear plan gets faster approvals. ## Sub-agent completion sentinel -When you spawn a sub-agent via `agent_spawn`, the child runs independently. +When you open a sub-agent via `agent_open`, the child runs independently. You will receive a `` element in the transcript when it finishes. Read its `summary` field and integrate the work — do not re-do what the child already did. -You can also call `agent_result` to pull the full structured result. +You can also call `agent_eval` with the agent name or id to pull the current structured projection or transcript handle. diff --git a/crates/tui/src/prompts/base.md b/crates/tui/src/prompts/base.md index c7b30f33..0a05be10 100644 --- a/crates/tui/src/prompts/base.md +++ b/crates/tui/src/prompts/base.md @@ -40,19 +40,27 @@ Use three decomposition patterns, selected by task scope: **PREVIEW** — Before diving into a large task, survey the terrain. Scan directory structure (`list_dir`), file headers, module trees. Identify problem boundaries and estimate complexity. A 30-second preview prevents hours of wrong-path exploration. -**CHUNK + map-reduce** — When a task exceeds single-pass capacity: split into independent sub-tasks, process each independently (parallel where possible via parallel tool calls or `agent_spawn`), then synthesize findings into a coherent whole. Track chunks with `checklist_write`. +**CHUNK + map-reduce** — When a task exceeds single-pass capacity: split into independent sub-tasks, process each independently (parallel where possible via parallel tool calls or persistent sub-agent sessions), then synthesize findings into a coherent whole. Track chunks with `checklist_write`. 
-**RECURSIVE** — When sub-tasks reveal sub-problems: decompose recursively until each leaf is tractable. Maintain the task tree via `update_plan` (strategy) layered above `checklist_write` (leaf tasks). Propagate findings upward when sub-problems resolve. +**RECURSIVE** — When sub-tasks reveal sub-problems: decompose recursively until each leaf is tractable. Keep the active leaves in `checklist_write`; use `update_plan` only when a genuinely complex initiative needs durable high-level strategy metadata. Propagate findings upward when sub-problems resolve. Your default workflow for any non-trivial request: 1. **`checklist_write`** — break the work into concrete, verifiable steps. Mark the first one `in_progress`. This populates the sidebar so the user can see what you're doing. 2. **Execute** — work through each checklist item, updating status as you go. -3. **For complex initiatives**, layer `update_plan` (high-level strategy) above `checklist_write` (granular steps). -4. **For parallel work**, spawn sub-agents (`agent_spawn`) — each does one thing well. Link them to plan/todo items in your thinking. Batch independent tool calls in a single turn. -5. **Only when an input genuinely doesn't fit your context window** — a whole file > ~50K tokens, a long transcript, a multi-document corpus — use `rlm`. It loads the input into a Python REPL where a sub-agent processes it. For shorter inputs, use `read_file` and reason directly. +3. **For complex initiatives only**, add `update_plan` as high-level strategy. Do not mirror the checklist into a second tracker. +4. **For parallel work**, open sub-agent sessions with `agent_open` — each does one thing well. Use `agent_eval` for follow-ups or completion state, and `agent_close` when a session should be cancelled or released. Link them to Work/checklist items in your thinking. Batch independent tool calls in a single turn. +5. 
**Only when an input genuinely doesn't fit your context window** — a whole file > ~50K tokens, a long transcript, a multi-document corpus — use persistent RLM sessions: `rlm_open` loads the input into a named Python REPL, `rlm_eval` runs bounded analysis, `handle_read` reads returned `var_handle`s, `rlm_configure` adjusts feedback/depth, and `rlm_close` releases the session. For shorter inputs, use `read_file` and reason directly. 6. **For persistent cross-session memory**, use `note` sparingly for important decisions, open blockers, and architectural context. -**Key principle**: make your work visible. The sidebar shows Plan / Todos / Tasks / Agents. When these panels are empty, the user has no idea what you're doing. Keep them populated. +**Key principle**: make your work visible in one place. The sidebar shows Work / Tasks / Agents / Context. Keep the Work checklist current; it is the primary progress surface. `update_plan` appears there only as optional strategy when it has real content. + +## Workspace Orientation + +When you enter an unfamiliar workspace, orient before broad search. Use the project instructions already loaded into the prompt, then confirm the working shape with the cheapest deterministic tools: `list_dir`, direct reads of `AGENTS.md`/`README.md` when relevant, and targeted `grep_files`. If the current directory is a multi-project workspace or the user points at a child path, identify the canonical project root before searching. If the correct project remains ambiguous after a quick orientation pass, ask instead of spraying searches across sibling checkouts. + +Treat workspace instructions as authority for where work should happen. If they say a sibling directory is stale, historical, frozen, or not the canonical checkout, do not spend high-value context there unless the user explicitly asks. Prefer exact paths from the user over guessing. + +Use `explore` sub-agents for independent read-only reconnaissance. 
Call the role `explore` / `explorer`, and give each child one bounded question with the project root and expected evidence shape. Use RLM for long inputs or many semantic slices, not for basic path discovery. ## Verification Principle @@ -76,23 +84,23 @@ If a tool call fails, inspect the error before retrying. Do not repeat the ident ## Composition Pattern for Multi-Step Work -For any task estimated to take 5+ steps: +For any task estimated to take 5+ concrete steps: -1. **`update_plan`** — 3-6 high-level phases (status: pending). This gives the user a map. -2. **`checklist_write`** — concrete leaf tasks under the first phase (mark first `in_progress`). -3. **Execute phase 1**, updating checklist as you go. Batch independent steps into parallel tool calls. -4. **After each phase**, re-read your plan: does phase 2 still make sense? Update the plan if new information changes the approach. Don't blindly follow a plan drafted before you understood the code. -5. **When a phase reveals sub-problems**, add them to the checklist or spawn investigation sub-agents — don't guess. +1. **`checklist_write`** — concrete leaf tasks, with the first item `in_progress`. +2. **Execute**, updating checklist status as you go. Batch independent steps into parallel tool calls. +3. **For multi-phase or ambiguous initiatives**, optionally add `update_plan` with 3-6 high-level phases. Keep it strategic; do not duplicate checklist items. +4. **After each phase**, re-check whether the next checklist items still make sense. Update the checklist, and update strategy only if the high-level approach changed. +5. **When a phase reveals sub-problems**, add them to the checklist or open investigation sub-agent sessions — don't guess. ## Sub-Agent Strategy Sub-agents are cheap — DeepSeek V4 Flash costs $0.14/M input. Use them liberally for parallel work: -- **Parallel investigation**: When you need to understand 3+ independent files or modules, spawn one read-only sub-agent per target. 
They run concurrently in one turn and return structured findings you synthesize. This is faster AND more thorough than reading sequentially. -- **Parallel implementation**: After a plan is laid out, spawn one sub-agent per independent leaf task. Each does one thing well; you integrate results. -- **Solo tasks**: A single read, a single search, a focused question — do these yourself. Spawning has overhead; one-turn reads are faster direct. -- **Sequential work**: If step B depends on step A's output, run A yourself, then decide whether to spawn B based on what A found. Don't pre-spawn dependent work. -- **Concurrent sub-agent cap**: The dispatcher defaults to 10 concurrent sub-agents (configurable via `[subagents].max_concurrent` in `config.toml`, hard ceiling 20). When you need more, batch them: spawn up to the cap, wait for completions, then spawn the next batch. +- **Parallel investigation**: When you need to understand 3+ independent files or modules, open one read-only sub-agent session per target. They run concurrently in one turn and return structured findings you synthesize. This is faster AND more thorough than reading sequentially. +- **Parallel implementation**: After a plan is laid out, open one sub-agent session per independent leaf task. Each does one thing well; you integrate results. +- **Solo tasks**: A single read, a single search, a focused question — do these yourself. Opening a sub-agent has overhead; one-turn reads are faster direct. +- **Sequential work**: If step B depends on step A's output, run A yourself, then decide whether to open a sub-agent based on what A found. Don't pre-open dependent work. +- **Concurrent sub-agent cap**: The dispatcher defaults to 10 concurrent sub-agents (configurable via `[subagents].max_concurrent` in `config.toml`, hard ceiling 20). When you need more, batch them: open up to the cap, wait for completions, then open the next batch. 
## Parallel-First Heuristic @@ -101,23 +109,25 @@ Before you fire any tool, scan your checklist: is there another tool you could r - Reading 3 files → 3 `read_file` calls in one turn - Searching for 2 patterns → 2 `grep_files` calls in one turn - Checking git status AND reading a config → `git_status` + `read_file` in one turn -- Spawning sub-agents for independent investigations → all `agent_spawn` calls in one turn +- Opening sub-agents for independent investigations → all `agent_open` calls in one turn The dispatcher runs parallel tool calls simultaneously. Serializing independent operations wastes the user's time and grows your context faster than necessary. ## RLM — How to Use It -RLM loads input into a Python REPL where you write code that calls sub-LLM helpers (`llm_query`, `llm_query_batched`, `rlm_query`). Three patterns, not one — choose based on the shape of the work: +RLM is a persistent Python REPL for context that is too large or too repetitive to keep in the parent transcript. Open a named session with `rlm_open`, run bounded code with `rlm_eval`, read large returned payloads through `handle_read`, tune feedback with `rlm_configure`, and close finished sessions with `rlm_close`. + +Inside the REPL, use deterministic Python for exact work and the RLM helper functions for semantic work. The current helper family is `peek`, `search`, `chunk`, `context_meta`, `sub_query`, `sub_query_batch`, `sub_query_map`, `sub_rlm`, `finalize`, and `evaluate_progress`. These are in-REPL helpers, not separate model-visible tools. There are three usage patterns, not one — CHUNK, BATCH, and RECURSE, described below. Choose based on the shape of the work. + +The RLM paper's core design is symbolic state: the long input and intermediate values live in the REPL environment, not copied into the root model context. Inspect with bounded slices, transform with Python, batch child calls programmatically, and keep large intermediate strings in variables or `var_handle`s. 
Do not paste the whole body back into a prompt or verbalize a long list of sub-calls when a loop can launch them. **CHUNK** — A single input that genuinely doesn't fit in your context window (a whole file > 50K tokens, a long transcript, a multi-document corpus). Split it, process each chunk, synthesize. -**BATCH** — Many independent items that each need LLM attention (classify 20 entries, extract fields from 30 documents, score 15 candidates). Use `llm_query_batched` for parallel execution — it fans out to the same DeepSeek client and finishes in one turn what would take 15 sequential reads. +**BATCH** — Many independent items that each need LLM attention (classify 20 entries, extract fields from 30 documents, score 15 candidates). Use `sub_query_batch` for parallel execution — it fans out to the same DeepSeek client and finishes in one turn what would take 15 sequential reads. -**RECURSE** — A problem that benefits from decomposition + critique. Use `rlm_query` to have a sub-LLM review your reasoning, identify gaps, or explore alternative approaches. The sub-LLM returns a synthesized answer you verify against live tool output. +**RECURSE** — A problem that benefits from decomposition + critique. Use `sub_query` or `sub_rlm` to have a sub-LLM review your reasoning, identify gaps, or explore alternative approaches. The sub-LLM returns a synthesized answer you verify against live tool output. -For exact counts or structured aggregates, compute them directly in Python inside the REPL (`len`, regexes, parsers, counters) and use child LLM calls only for semantic interpretation. When you chunk a whole input, use `chunk_context()` plus `chunk_coverage()` and report coverage explicitly: chunks processed, total chunks, line/char ranges, and any skipped sections. Cross-check surprising aggregate results with deterministic code before presenting them. 
- -The Python helpers visible inside the REPL (`llm_query`, `llm_query_batched`, `rlm_query`, `rlm_query_batched`) are NOT separately-callable tools — they are functions the sub-agent uses inside its Python code. You only call `rlm` itself from the model side. +For exact counts or structured aggregates, compute them directly in Python inside the REPL (`len`, regexes, parsers, counters) and use child LLM calls only for semantic interpretation. When you chunk a whole input, use `chunk()` and report coverage explicitly: chunks processed, total chunks, line/char ranges, and any skipped sections. Cross-check surprising aggregate results with deterministic code before presenting them. Use `finalize(...)` for the answer you want returned; if it comes back as a `var_handle`, call `handle_read` for a bounded slice, count, or JSON projection instead of asking the runtime to replay the whole value. ## Context You have a 1 M-token context window. When usage creeps above ~80%, suggest `/compact` to the user — it summarises earlier turns so you can keep working without losing thread. @@ -154,14 +164,15 @@ When context is deep (past a soft seam): cache reasoning conclusions in concise ## Toolbox (fast reference — tool descriptions are authoritative) -- **Planning / tracking**: `update_plan` (high-level strategy), `task_create` / `task_list` / `task_read` / `task_cancel` (durable work objects), `checklist_write` (granular progress under the active task/thread), `checklist_add` / `checklist_update` / `checklist_list`, `todo_*` aliases (legacy compatibility), `note` (persistent memory). +- **Planning / tracking**: `checklist_write` (primary Work progress under the active task/thread), `checklist_add` / `checklist_update` / `checklist_list`, `update_plan` (optional high-level strategy metadata for complex initiatives), `task_create` / `task_list` / `task_read` / `task_cancel` (durable work objects), `todo_*` aliases (legacy compatibility), `note` (persistent memory). 
- **File I/O**: `read_file` (PDFs auto-extracted), `list_dir`, `write_file`, `edit_file`, `apply_patch`, `retrieve_tool_result` for prior spilled large tool outputs. - **Shell**: `task_shell_start` + `task_shell_wait` for long-running commands, diagnostics, tests, searches, and servers; `exec_shell` for bounded cancellable foreground commands; `exec_shell_wait`, `exec_shell_interact`. If foreground `exec_shell` times out, the process was killed; rerun long work with `task_shell_start` or `exec_shell` using `background: true`, then poll/wait. - **Task evidence**: `task_gate_run` for verification gates; `pr_attempt_record` / `pr_attempt_list` / `pr_attempt_read` / `pr_attempt_preflight`; `github_issue_context` / `github_pr_context` (read-only); `github_comment` / `github_close_issue` (approval + evidence required); `automation_*` scheduling tools. - **Structured search**: `grep_files`, `file_search`, `web_search`, `fetch_url`, `web.run` (browse). - **Git / diag / tests**: `git_status`, `git_diff`, `git_show`, `git_log`, `git_blame`, `diagnostics`, `run_tests`, `review`. -- **Sub-agents**: `agent_spawn` (`spawn_agent`, `delegate_to_agent`), `agent_result`, `agent_cancel` (`close_agent`), `agent_list`, `agent_wait` (`wait`), `agent_send_input` (`send_input`), `agent_assign` (`assign_agent`), `resume_agent`. -- **Recursive LM (long inputs / parallel reasoning)**: `rlm` — load a file/string as `context` in a Python REPL, sub-agent writes Python that calls `llm_query`/`llm_query_batched`/`rlm_query` to chunk, compare, critique, and synthesize; returns the synthesized answer. Read-only. +- **Sub-agents**: `agent_open`, `agent_eval`, `agent_close`. Open fresh sessions by default; pass `fork_context: true` only when the child needs the current parent context and prefix-cache continuity. 
+- **Recursive LM (long inputs / parallel reasoning)**: `rlm_open`, `rlm_eval`, `rlm_configure`, `rlm_close` — open a named Python REPL over a file/string/URL, run deterministic and semantic analysis, return compact results or `var_handle`s, then close when done. +- **Large symbolic outputs**: `handle_read` — read bounded slices, counts, ranges, or JSONPath projections from returned `var_handle`s without replaying the whole payload. - **Skills**: `load_skill` (#434) — when the user names a skill or the task matches one in the `## Skills` section above, call this with the skill id to pull its `SKILL.md` body and companion-file list into context in one tool call. Faster than `read_file` + `list_dir`. - **Other**: `code_execution` (Python sandbox), `validate_data` (JSON/TOML), `request_user_input`, `finance` (market quotes), `tool_search_tool_regex`, `tool_search_tool_bm25` (deferred tool discovery). @@ -178,17 +189,17 @@ Use `edit_file` for one clear replacement in one file. Do not use it for multi-b ### `exec_shell` Use `exec_shell` for shell-native diagnostics, pipelines, and bounded commands. Use structured tools for structured operations when they map directly (`grep_files`, `git_diff`, `read_file`). For long commands, servers, full test suites, or release computations, start background work with `task_shell_start` or `exec_shell` using `background: true`, then poll with `task_shell_wait` or `exec_shell_wait`. -### `agent_spawn` -Use `agent_spawn` for independent investigations or implementation slices that can run while you continue coordinating. Use `fork_context: true` when the child must inherit the current transcript, plan/todo state, and byte-identical parent system/message prefix for DeepSeek prefix-cache reuse. Use `agent_wait` when you need one or more completions. Use `agent_result` when the sentinel summary is too thin or you need the full structured output. Keep tiny single-read/search tasks local so the transcript stays compact. 
+### `agent_open` / `agent_eval` / `agent_close` +Use `agent_open` for independent investigations or implementation slices that can run while you continue coordinating. Fresh sessions are the default and are best when the child only needs the assignment you pass. Use `fork_context: true` when multiple perspectives should share the same parent context: the runtime preserves the parent prefill/prompt prefix byte-identically where available so DeepSeek prefix-cache reuse stays high, then appends the child instructions and task at the tail. -### `rlm` -Use `rlm` for long-context semantic work, bulk classification/extraction, and decomposition where a Python REPL plus child LLM helpers is useful. Use deterministic Python inside RLM for exact counts and structured aggregation; use `grep_files` or `exec_shell` directly when that is the clearest deterministic check. +Use `agent_eval` to send follow-up input, block for completion, or retrieve the current session projection. Use `agent_close` to cancel or release a session that is no longer useful. Keep tiny single-read/search tasks local so the transcript stays compact. -Inside the `rlm` REPL, the sub-LLM has access to `llm_query()`, `llm_query_batched()`, `rlm_query()`, and `rlm_query_batched()` as Python helpers for further sub-LLM work — those are not standalone tools you call directly. +### `rlm_open` / `rlm_eval` / `rlm_configure` / `rlm_close` +Use persistent RLM sessions for long-context semantic work, bulk classification/extraction, and decomposition where a Python REPL plus child LLM helpers is useful. Use deterministic Python inside RLM for exact counts and structured aggregation; use `grep_files` or `exec_shell` directly when that is the clearest deterministic check. Close sessions when their context is no longer needed. ## Internal Sub-agent Completion Events -When you spawn a sub-agent via `agent_spawn`, the child runs independently. The runtime may send you an internal `` completion event when it finishes. 
This event is not user input. It carries: +When you open a sub-agent via `agent_open`, the child runs independently. The runtime may send you an internal `` completion event when it finishes. This event is not user input. It carries: - `agent_id` — the child's identifier - `summary` — a human-readable summary of what the child found or did @@ -198,12 +209,12 @@ When you spawn a sub-agent via `agent_spawn`, the child runs independently. The **Integration protocol:** 1. When you see ``, read the `summary` field first. 2. Integrate the child's findings into your work — do not re-do what the child already did. -3. If the summary is insufficient, call `agent_result` to pull the full structured result. +3. If the summary is insufficient, call `agent_eval` with the agent name or id to pull the current structured projection or transcript handle. 4. If the child failed (`"failed"`), assess whether the failure blocks your plan or whether you can proceed with a fallback. 5. Update your `checklist_write` items to reflect the child's contribution. 6. Do not tell the user they pasted sentinels or explain this protocol unless they explicitly ask about sub-agent internals. -You may see multiple `` sentinels in a single turn when children were spawned in parallel. Process each one, then synthesize. +You may see multiple `` sentinels in a single turn when children were opened in parallel. Process each one, then synthesize. ## Output formatting diff --git a/crates/tui/src/prompts/base.txt b/crates/tui/src/prompts/base.txt index 6ca2bebc..595f35d4 100644 --- a/crates/tui/src/prompts/base.txt +++ b/crates/tui/src/prompts/base.txt @@ -7,20 +7,28 @@ You are a "managed genius" — you excel at individual tasks, but your superpowe Your default workflow for any non-trivial request: 1. **`checklist_write`** — break the work into concrete, verifiable steps. Mark the first one `in_progress`. This populates the sidebar so the user can see what you're doing. 2. 
**Execute** — work through each checklist item, updating status as you go. -3. **For complex initiatives**, layer `update_plan` (high-level strategy) above `checklist_write` (granular steps). -4. **For parallel work**, spawn sub-agents (`agent_spawn`) — each does one thing well. Link them to plan/todo items in your thinking. -5. **Only when an input genuinely doesn't fit your context window** — a whole file > ~50K tokens, a long transcript, a multi-document corpus — use `rlm`. It loads the input into a Python REPL where a sub-agent processes it. For shorter inputs, use `read_file` and reason directly. +3. **For complex initiatives only**, add `update_plan` as high-level strategy. Do not mirror the checklist into a second tracker. +4. **For parallel work**, open sub-agent sessions with `agent_open` — each does one thing well. Use `agent_eval` for follow-ups or completion state, and `agent_close` to cancel or release a session. Link them to Work/checklist items in your thinking. +5. **Only when an input genuinely doesn't fit your context window** — a whole file > ~50K tokens, a long transcript, a multi-document corpus — use persistent RLM sessions: `rlm_open` loads the input into a named Python REPL, `rlm_eval` runs bounded analysis, `handle_read` reads returned `var_handle`s, `rlm_configure` adjusts feedback/depth, and `rlm_close` releases the session. For shorter inputs, use `read_file` and reason directly. 6. **For persistent cross-session memory**, use `note` sparingly for important decisions, open blockers, and architectural context. -**Key principle**: make your work visible. The sidebar shows Plan / Todos / Tasks / Agents. When these panels are empty, the user has no idea what you're doing. Keep them populated. +**Key principle**: make your work visible in one place. The sidebar shows Work / Tasks / Agents / Context. Keep the Work checklist current; it is the primary progress surface. 
`update_plan` appears there only as optional strategy when it has real content. + +## Workspace Orientation + +In unfamiliar workspaces, orient before broad search. Use loaded project instructions plus cheap deterministic tools (`list_dir`, relevant `AGENTS.md`/`README.md`, targeted `grep_files`) to identify the canonical project root. If a workspace holds several projects or stale sibling checkouts, follow the user's path and the project instructions; ask if the target remains ambiguous. + +Use `explore` / `explorer` sub-agents for independent read-only reconnaissance. Give each child one bounded question with the project root and expected evidence shape. Use RLM for long inputs or many semantic slices, not basic path discovery. ## RLM Is a Specialty Tool -`rlm` is for one specific shape of work: a long input that genuinely does not fit in your context (a whole file > ~50K tokens, a long transcript, a multi-document corpus). Reach for it ONLY when direct reasoning over the input is impossible because of its size. For everything else — short inputs, focused questions, parallel exploration — use `read_file`, `grep_files`, or `agent_spawn` instead. +Persistent RLM sessions are for one specific shape of work: a long input that genuinely does not fit in your context (a whole file > ~50K tokens, a long transcript, a multi-document corpus) or a batch of many semantic sub-questions that is cleaner in Python. Reach for RLM only when direct reasoning over the input is impossible or wasteful. For everything else — short inputs, focused questions, parallel exploration — use `read_file`, `grep_files`, or `agent_open` instead. -When you do use `rlm`, ask bounded questions with explicit inputs and expected output shape. The result is advisory — ground decisions in local files, live tool output, and passing verification before claiming completion. 
+When you do use RLM, open a named context with `rlm_open`, run bounded code with `rlm_eval`, read large `var_handle` results with `handle_read`, and close the context with `rlm_close`. Results are advisory — ground decisions in local files, live tool output, and passing verification before claiming completion. -The Python helpers visible inside the REPL (`llm_query`, `llm_query_batched`, `rlm_query`, `rlm_query_batched`) are NOT separately-callable tools — they are functions the sub-agent uses inside its Python code. +RLM works by keeping the long input and intermediate values as symbolic REPL state. Inspect bounded slices, transform with Python, batch child calls programmatically, and avoid copying the whole body back into the root prompt. + +The Python helpers visible inside the REPL (`sub_query`, `sub_query_batch`, `sub_query_map`, `sub_rlm`, `finalize`, and related context helpers) are NOT separately-callable tools — they are functions the sub-agent uses inside its Python code. ## Context You have a 1 M-token context window. When usage creeps above ~80%, suggest `/compact` to the user — it summarises earlier turns so you can keep working without losing thread. @@ -29,14 +37,15 @@ Model notes: DeepSeek V4 models emit *thinking tokens* (`ContentBlock::Thinking` ## Toolbox (fast reference — tool descriptions are authoritative) -- **Planning / tracking**: `update_plan` (high-level strategy), `task_create` / `task_list` / `task_read` / `task_cancel` (durable work objects), `checklist_write` (granular progress under the active task/thread), `checklist_add` / `checklist_update` / `checklist_list`, `todo_*` aliases (legacy compatibility), `note` (persistent memory). 
+- **Planning / tracking**: `checklist_write` (primary Work progress under the active task/thread), `checklist_add` / `checklist_update` / `checklist_list`, `update_plan` (optional high-level strategy metadata for complex initiatives), `task_create` / `task_list` / `task_read` / `task_cancel` (durable work objects), `todo_*` aliases (legacy compatibility), `note` (persistent memory). - **File I/O**: `read_file` (PDFs auto-extracted), `list_dir`, `write_file`, `edit_file`, `apply_patch`, `retrieve_tool_result` for prior spilled large tool outputs. - **Shell**: `task_shell_start` + `task_shell_wait` for long-running commands, diagnostics, tests, searches, and servers; `exec_shell` for bounded cancellable foreground commands; `exec_shell_wait`, `exec_shell_interact`. - **Task evidence**: `task_gate_run` for verification gates; `pr_attempt_record` / `pr_attempt_list` / `pr_attempt_read` / `pr_attempt_preflight`; `github_issue_context` / `github_pr_context` (read-only); `github_comment` / `github_close_issue` (approval + evidence required); `automation_*` scheduling tools. - **Structured search**: `grep_files`, `file_search`, `web_search`, `fetch_url`, `web.run` (browse). - **Git / diag / tests**: `git_status`, `git_diff`, `git_show`, `git_log`, `git_blame`, `diagnostics`, `run_tests`, `review`. -- **Sub-agents**: `agent_spawn` (`spawn_agent`, `delegate_to_agent`), `agent_result`, `agent_cancel` (`close_agent`), `agent_list`, `agent_wait` (`wait`), `agent_send_input` (`send_input`), `agent_assign` (`assign_agent`), `resume_agent`. -- **Recursive LM (long inputs / parallel reasoning)**: `rlm` — load a file/string as `context` in a Python REPL, sub-agent writes Python that calls `llm_query`/`llm_query_batched`/`rlm_query` to chunk, compare, critique, and synthesize; returns the synthesized answer. Read-only. +- **Sub-agents**: `agent_open`, `agent_eval`, `agent_close`. 
Fresh sessions are the default; use `fork_context: true` when multiple perspectives need both the current parent context and a byte-identical prefill/prompt prefix, so DeepSeek's prefix cache can be reused. +- **Recursive LM (long inputs / parallel reasoning)**: `rlm_open`, `rlm_eval`, `rlm_configure`, `rlm_close` — open a named Python REPL over a file/string/URL, run deterministic and semantic analysis, return compact results or `var_handle`s, then close when done. +- **Large symbolic outputs**: `handle_read` — read bounded slices, counts, ranges, or JSONPath projections from returned `var_handle`s. - **Other**: `code_execution` (Python sandbox), `validate_data` (JSON/TOML), `request_user_input`, `finance` (market quotes), `tool_search_tool_regex`, `tool_search_tool_bm25` (deferred tool discovery). Multiple `tool_calls` in one turn run in parallel. `web_search` returns `ref_id`s — cite as `(ref_id)`. diff --git a/crates/tui/src/prompts/compact.md b/crates/tui/src/prompts/compact.md index afeea91e..aa3f5394 100644 --- a/crates/tui/src/prompts/compact.md +++ b/crates/tui/src/prompts/compact.md @@ -1,4 +1,4 @@ -## Compaction Handoff +## Compaction Relay The conversation above this point has been compacted. Below is a structured summary of what was discussed and decided. Read this first — it replaces re-reading the compressed transcript. diff --git a/crates/tui/src/prompts/modes/agent.md b/crates/tui/src/prompts/modes/agent.md index 7b26a3d0..5e8d9f2b 100644 --- a/crates/tui/src/prompts/modes/agent.md +++ b/crates/tui/src/prompts/modes/agent.md @@ -2,14 +2,15 @@ You are running in Agent mode — autonomous task execution with tool access. -Read-only tools (reads, searches, `rlm`, agent status queries, git inspection) run silently. -Any write, patch, shell execution, sub-agent spawn, or CSV batch operation will ask for approval first. +Read-only tools (reads, searches, persistent RLM session tools, agent status queries, git inspection) run silently.
+Any write, patch, shell execution, sub-agent session open, or CSV batch operation will ask for approval first. Before requesting approval for writes, lay out your work with `checklist_write` so the user can see what -you intend to do and approve with context. Complex changes should also get an `update_plan` first. -Decomposition builds trust — a clear plan gets faster approvals. +you intend to do and approve with context. Use `update_plan` only when a complex initiative needs +high-level strategy metadata that is not just a copy of the checklist. +Decomposition builds trust — a clear Work checklist gets faster approvals. -For multi-step initiatives, use `update_plan` (high-level strategy) + `checklist_write` (granular steps). +For multi-step initiatives, keep `checklist_write` current. Add `update_plan` only for genuinely useful strategy. ## Efficient Approvals @@ -23,8 +24,8 @@ Don't sequence approvals one at a time — the user wants context, not interrupt ## Session Longevity Long sessions accumulate context. To stay fast: -- Spawn sub-agents for independent work instead of doing everything sequentially +- Open sub-agent sessions for independent work instead of doing everything sequentially - Batch reads/searches/git-inspections into parallel tool calls -- Suggest `/compact` when context nears 80% — the compaction handoff preserves open blockers +- Suggest `/compact` when context nears 80% — the compaction relay preserves open blockers - Use `note` for decisions you'll need across compaction boundaries - A 3-turn session that fans out to sub-agents finishes faster AND stays responsive longer than a 15-turn sequential grind diff --git a/crates/tui/src/prompts/modes/plan.md b/crates/tui/src/prompts/modes/plan.md index 8b854a4f..583058d0 100644 --- a/crates/tui/src/prompts/modes/plan.md +++ b/crates/tui/src/prompts/modes/plan.md @@ -2,8 +2,9 @@ You are running in Plan mode — design before implementing. -Investigate first, act later. 
Use `update_plan` to lay out high-level strategy and `checklist_write` for -granular, verifiable steps. All writes and patches are blocked — you can read the world but you +Investigate first, act later. Use `checklist_write` for visible, granular progress. Add `update_plan` +only when high-level strategy adds value beyond the checklist. +All writes and patches are blocked — you can read the world but you can't change it. Shell and code execution are unavailable. Use this mode to build a thorough plan. Spawn read-only sub-agents for parallel investigation. diff --git a/crates/tui/src/prompts/modes/yolo.md b/crates/tui/src/prompts/modes/yolo.md index 804fe7b0..d6572b90 100644 --- a/crates/tui/src/prompts/modes/yolo.md +++ b/crates/tui/src/prompts/modes/yolo.md @@ -7,4 +7,5 @@ overwrite user work, or run destructive commands, pause and double-check. The un Even with auto-approval, create a `checklist_write` first so your work is visible and trackable in the sidebar. Decomposition is not red tape — it's how you organize complex work and demonstrate thoroughness. -For multi-step initiatives, use `update_plan` + `checklist_write` together. +For multi-step initiatives, keep `checklist_write` current. Add `update_plan` only when a high-level strategy +would help and do not duplicate the checklist there. diff --git a/crates/tui/src/repl/runtime.rs b/crates/tui/src/repl/runtime.rs index 73f64bd9..f943e416 100644 --- a/crates/tui/src/repl/runtime.rs +++ b/crates/tui/src/repl/runtime.rs @@ -6,8 +6,9 @@ //! `exec()`s them into the same global namespace so variables, imports, //! and even open file handles persist naturally across rounds. //! -//! Sub-LLM helpers (`llm_query`, `llm_query_batched`, `rlm_query`, -//! `rlm_query_batched`) are wired through a stdin/stdout RPC protocol: +//! Sub-LLM helpers (`sub_query`, `sub_query_batch`, `sub_rlm`, plus legacy +//! `llm_query`, `llm_query_batched`, `rlm_query`, `rlm_query_batched`) are +//! 
wired through a stdin/stdout RPC protocol: //! Python emits `__RLM_REQ___::{json}` on stdout, Rust dispatches the //! request and writes `__RLM_RESP___::{json}` back on stdin. No HTTP //! sidecar, no temp ports — the same pipes carry both control and data. @@ -22,6 +23,7 @@ use std::process::Stdio; use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; +use serde_json::Value; use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader}; use tokio::process::{Child, ChildStdin, ChildStdout, Command}; use uuid::Uuid; @@ -43,9 +45,11 @@ pub struct ReplRound { pub stderr: String, /// `True` if the user code raised an unhandled Python exception. pub has_error: bool, - /// Captured `FINAL(value)` payload, if any. + /// Captured `finalize(value, confidence=...)` payload, if any. pub final_value: Option, - /// Number of `llm_query`/`rlm_query` RPCs the round issued. + /// Optional confidence supplied to `finalize(...)`. + pub final_confidence: Option, + /// Number of `sub_query`/`sub_rlm` RPCs the round issued. pub rpc_count: u32, /// Wall-clock duration of the round. pub elapsed: Duration, @@ -173,8 +177,8 @@ impl PythonRuntime { ) } - /// Spawn a REPL with `context` (and `ctx`) preloaded from a file. Used - /// by the RLM turn loop. + /// Spawn a REPL with the long input preloaded from a file. Used by the + /// RLM turn loop. pub async fn spawn_with_context(context_path: &Path) -> Result { Self::spawn_inner(Some(context_path), None).await } @@ -310,6 +314,7 @@ impl PythonRuntime { let mut stdout_buf = String::new(); let mut final_value: Option = None; + let mut final_confidence: Option = None; let mut had_error = false; let mut rpc_count: u32 = 0; let round_timeout = self.round_timeout; @@ -332,10 +337,35 @@ impl PythonRuntime { break; } if let Some(rest) = trimmed.strip_prefix(&final_prefix) { - // Stored as a JSON-encoded string. 
- let v = - serde_json::from_str::(rest).unwrap_or_else(|_| rest.to_string()); - final_value = Some(v); + // New sessions emit an object with value/confidence; + // legacy helpers emitted a JSON string. + match serde_json::from_str::(rest) { + Ok(Value::Object(map)) => { + let value = map + .get("value") + .and_then(Value::as_str) + .map(str::to_string) + .unwrap_or_else(|| { + map.get("value") + .map(Value::to_string) + .unwrap_or_else(|| rest.to_string()) + }); + final_value = Some(value); + final_confidence = map.get("confidence").cloned(); + } + Ok(Value::String(value)) => { + final_value = Some(value); + final_confidence = None; + } + Ok(other) => { + final_value = Some(other.to_string()); + final_confidence = None; + } + Err(_) => { + final_value = Some(rest.to_string()); + final_confidence = None; + } + } continue; } if let Some(rest) = trimmed.strip_prefix(&err_prefix) { @@ -399,6 +429,7 @@ impl PythonRuntime { stderr, has_error: had_error, final_value, + final_confidence, rpc_count, elapsed: started.elapsed(), }) @@ -493,6 +524,7 @@ fn render_bootstrap(session_id: &str) -> String { const BOOTSTRAP_TEMPLATE: &str = r#" import json as _json import os as _os +import re as _re import sys as _sys import traceback as _traceback @@ -574,16 +606,67 @@ def rlm_query_batched(prompts, model=None): out.append(r.get("text","")) return out -def FINAL(value): - """Signal the loop to stop with this final answer.""" - _sys.stdout.write(_FINAL + _json.dumps(str(value)) + "\n") +def _slice_text(slice_value): + if slice_value is None: + return "" + if isinstance(slice_value, dict): + if "text" in slice_value: + return str(slice_value["text"]) + return _json.dumps(slice_value, ensure_ascii=False) + return str(slice_value) + +def _prompt_with_slice(prompt, slice_value): + text = _slice_text(slice_value) + if not text: + return str(prompt) + if isinstance(slice_value, dict) and ("index" in slice_value or ("start" in slice_value and "end" in slice_value)): + label = f"slice 
index={slice_value.get('index', '?')} range={slice_value.get('start', '?')}:{slice_value.get('end', '?')}" + else: + label = "slice" + return f"{prompt}\n\n--- {label} ---\n{text}" + +def sub_query(prompt, slice=None): + """One child LLM call, optionally scoped to a bounded slice.""" + return llm_query(_prompt_with_slice(prompt, slice)) + +def sub_query_batch(prompt, slices): + """Apply one prompt to many bounded slices concurrently.""" + if not isinstance(slices, (list, tuple)): + return ["[sub_query_batch: slices must be a list]"] + return llm_query_batched([_prompt_with_slice(prompt, s) for s in slices]) + +def sub_query_map(prompts, slices=None): + """Run N distinct prompts, optionally paired with N bounded slices.""" + if not isinstance(prompts, (list, tuple)): + return ["[sub_query_map: prompts must be a list]"] + if slices is None: + return llm_query_batched([str(p) for p in prompts]) + if not isinstance(slices, (list, tuple)): + return ["[sub_query_map: slices must be a list]"] + if len(prompts) != len(slices): + return [f"[sub_query_map: size mismatch ({len(prompts)}/{len(slices)})]" for _ in prompts] + return llm_query_batched([_prompt_with_slice(p, s) for p, s in zip(prompts, slices)]) + +def sub_rlm(prompt, source=None): + """Recursive sub-RLM call for tasks that need their own decomposition.""" + return rlm_query(_prompt_with_slice(prompt, source)) + +def _emit_final(value, confidence=None): + _sys.stdout.write(_FINAL + _json.dumps({ + "value": str(value), + "confidence": confidence, + }) + "\n") _sys.stdout.flush() +def FINAL(value): + """Legacy compatibility alias for finalize(value).""" + _emit_final(value) + def FINAL_VAR(name): - """Signal the loop to stop, returning the value of a named variable.""" + """Legacy compatibility alias for finalize(repl_get(name)).""" name_str = str(name).strip().strip("'\"") if name_str in globals(): - FINAL(globals()[name_str]) + _emit_final(globals()[name_str]) else: print(f"FINAL_VAR error: variable '{name_str}' 
not found. " f"Use SHOW_VARS() to list available variables.", flush=True) @@ -603,8 +686,61 @@ def repl_get(name, default=None): def repl_set(name, value): globals()[str(name)] = value -def chunk_context(max_chars=20000, overlap=0): - """Return full-coverage context chunks with index/start/end/text fields.""" +def context_meta(): + """Return bounded metadata about the loaded input; never includes the full text.""" + text = _context + line_count = 0 if text == "" else text.count("\n") + (0 if text.endswith("\n") else 1) + return { + "chars": len(text), + "lines": line_count, + "preview": text[:500], + "tail_preview": text[-500:] if len(text) > 500 else text, + } + +def _slice_chars(start, end): + total = len(_context) + s = max(0, int(start)) + e = max(s, min(total, int(end))) + return _context[s:e] + +def _slice_lines(start, end): + lines = _context.splitlines() + s = max(0, int(start)) + e = max(s, min(len(lines), int(end))) + return "\n".join(lines[s:e]) + +def peek(start, end, unit="chars"): + """Return a bounded slice of the input by char offsets or line numbers.""" + if str(unit).lower() in ("line", "lines"): + return _slice_lines(start, end) + if str(unit).lower() not in ("char", "chars"): + raise ValueError("unit must be 'chars' or 'lines'") + return _slice_chars(start, end) + +def search(pattern, max_hits=100): + """Regex-search the input and return bounded hit records with snippets.""" + max_hits = max(0, int(max_hits)) + hits = [] + if max_hits == 0: + return hits + rx = _re.compile(str(pattern), _re.MULTILINE) + for i, m in enumerate(rx.finditer(_context)): + if i >= max_hits: + break + start, end = m.span() + snippet_start = max(0, start - 120) + snippet_end = min(len(_context), end + 120) + hits.append({ + "index": i, + "start": start, + "end": end, + "match": m.group(0), + "snippet": _context[snippet_start:snippet_end], + }) + return hits + +def chunk(max_chars=20000, overlap=0): + """Return full-coverage input chunks with index/start/end/text 
fields.""" max_chars = int(max_chars) overlap = max(0, int(overlap)) if max_chars <= 0: @@ -614,18 +750,22 @@ def chunk_context(max_chars=20000, overlap=0): chunks = [] start = 0 idx = 0 - total = len(context) + total = len(_context) while start < total: end = min(total, start + max_chars) - chunks.append({"index": idx, "start": start, "end": end, "text": context[start:end]}) + chunks.append({"index": idx, "start": start, "end": end, "text": _context[start:end]}) idx += 1 if end >= total: break start = end - overlap return chunks +def chunk_context(max_chars=20000, overlap=0): + """Compatibility alias for chunk().""" + return chunk(max_chars=max_chars, overlap=overlap) + def chunk_coverage(chunks): - """Summarize coverage for chunks produced by chunk_context().""" + """Summarize coverage for chunks produced by chunk().""" spans = [] for c in chunks: try: @@ -642,36 +782,59 @@ def chunk_coverage(chunks): if end > cursor: covered += end - max(start, cursor) cursor = end - if cursor < len(context): - gaps.append((cursor, len(context))) + if cursor < len(_context): + gaps.append((cursor, len(_context))) return { "chunks": len(chunks), - "context_chars": len(context), + "context_chars": len(_context), + "input_chars": len(_context), "covered_chars": covered, "gaps": gaps, - "complete": covered >= len(context) and not gaps, + "complete": covered >= len(_context) and not gaps, } -# Load the long input as `context` (and `ctx`) from a file. This keeps the -# big string out of the process command-line and out of the LLM's window. 
+def finalize(value, confidence=None): + """Signal the session's final answer and persist confidence metadata.""" + global final_answer, final_confidence, final_result + final_answer = str(value) + final_confidence = confidence + final_result = { + "value": final_answer, + "confidence": confidence, + } + _emit_final(final_answer, confidence=confidence) + return final_answer + +def evaluate_progress(): + """Return lightweight state useful before deciding the next REPL step.""" + vars_now = SHOW_VARS() + return { + "has_final_answer": "final_answer" in globals(), + "final_confidence": globals().get("final_confidence", None), + "user_variables": vars_now, + } + +# Load the long input from a file. This keeps the big string out of the +# process command-line and out of the LLM's window. _ctx_file = _os.environ.get("RLM_CONTEXT_FILE","") -context = "" +_context = "" if _ctx_file: try: with open(_ctx_file, "r", encoding="utf-8", errors="replace") as f: - context = f.read() + _context = f.read() except Exception as e: _sys.stderr.write(f"[bootstrap] failed to load context: {e}\n") -ctx = context # short alias matching aleph _BOOTSTRAP_NAMES = { "_SID","_REQ","_RESP","_FINAL","_ERR","_RUN","_END","_DONE","_READY", - "_rpc","_ctx_file","_BOOTSTRAP_NAMES","_main_loop", + "_rpc","_ctx_file","_context","_slice_chars","_slice_lines","_BOOTSTRAP_NAMES","_main_loop", + "_emit_final","_slice_text","_prompt_with_slice", "llm_query","llm_query_batched","rlm_query","rlm_query_batched", + "sub_query","sub_query_batch","sub_query_map","sub_rlm", "FINAL","FINAL_VAR","SHOW_VARS","repl_get","repl_set", - "chunk_context","chunk_coverage", - "context","ctx", - "_json","_os","_sys","_traceback", + "context_meta","peek","search","chunk","chunk_context","chunk_coverage", + "finalize","evaluate_progress", + "_json","_os","_re","_sys","_traceback", } def _main_loop(): @@ -829,7 +992,7 @@ mod tests { .await .expect("spawn"); let round = rt - .execute("print(len(context), context[:5])") + 
.execute("print(context_meta()['chars'], peek(0, 5))") .await .expect("execute"); assert!(round.stdout.contains("19")); @@ -838,13 +1001,16 @@ mod tests { } #[tokio::test] - async fn ctx_alias_works() { + async fn context_aliases_are_not_bound() { let path = write_temp_context("aleph-style"); let mut rt = PythonRuntime::spawn_with_context(&path) .await .expect("spawn"); - let round = rt.execute("print(ctx)").await.expect("execute"); - assert!(round.stdout.contains("aleph-style")); + let round = rt + .execute("print('context' in globals(), 'ctx' in globals())") + .await + .expect("execute"); + assert!(round.stdout.contains("False False")); rt.shutdown().await; } @@ -866,6 +1032,67 @@ mod tests { rt.shutdown().await; } + #[tokio::test] + async fn bounded_input_helpers_work() { + let path = write_temp_context("alpha\nbeta needle\ngamma needle\nomega"); + let mut rt = PythonRuntime::spawn_with_context(&path) + .await + .expect("spawn"); + let round = rt + .execute( + "meta = context_meta()\n\ + hits = search('needle', max_hits=1)\n\ + print(meta['chars'], meta['lines'])\n\ + print(peek(6, 17))\n\ + print(peek(1, 3, unit='lines'))\n\ + print(len(hits), hits[0]['match'], hits[0]['start'])", + ) + .await + .expect("execute"); + assert!(round.stdout.contains("36 4"), "{}", round.stdout); + assert!(round.stdout.contains("beta needle"), "{}", round.stdout); + assert!( + round.stdout.contains("beta needle\ngamma needle"), + "{}", + round.stdout + ); + assert!(round.stdout.contains("1 needle 11"), "{}", round.stdout); + rt.shutdown().await; + } + + #[tokio::test] + async fn new_chunk_helper_reports_full_coverage() { + let path = write_temp_context("abcdefghijklmnopqrstuvwxyz"); + let mut rt = PythonRuntime::spawn_with_context(&path) + .await + .expect("spawn"); + let round = rt + .execute( + "chunks = chunk(max_chars=10)\n\ + coverage = chunk_coverage(chunks)\n\ + print(len(chunks), coverage['input_chars'], coverage['covered_chars'], coverage['complete'])", + ) + .await + 
.expect("execute"); + assert!(round.stdout.contains("3 26 26 True"), "{}", round.stdout); + rt.shutdown().await; + } + + #[tokio::test] + async fn finalize_helper_is_captured_directly() { + let mut rt = PythonRuntime::new().await.expect("spawn"); + let round = rt + .execute("finalize('computed answer', confidence='high')") + .await + .expect("execute"); + assert_eq!(round.final_value.as_deref(), Some("computed answer")); + assert_eq!( + round.final_confidence.as_ref().and_then(Value::as_str), + Some("high") + ); + rt.shutdown().await; + } + #[tokio::test] async fn rlm_context_runtime_has_no_fixed_round_timeout() { let path = write_temp_context("long input"); @@ -887,7 +1114,7 @@ mod tests { } #[tokio::test] - async fn final_is_captured() { + async fn legacy_final_is_captured() { let mut rt = PythonRuntime::new().await.expect("spawn"); let round = rt .execute("FINAL('the answer is 42')") @@ -898,7 +1125,7 @@ mod tests { } #[tokio::test] - async fn final_var_is_captured() { + async fn legacy_final_var_is_captured() { let mut rt = PythonRuntime::new().await.expect("spawn"); rt.execute("answer = 'computed'").await.expect("r1"); let round = rt.execute("FINAL_VAR('answer')").await.expect("r2"); @@ -945,6 +1172,33 @@ mod tests { rt.shutdown().await; } + #[tokio::test] + async fn rpc_dispatcher_round_trips_sub_query_alias() { + let bridge = StubBridge::new(); + let calls = Arc::clone(&bridge.calls); + + let mut rt = PythonRuntime::new().await.expect("spawn"); + let round = rt + .run("print(sub_query('hello from sub'))", Some(&bridge)) + .await + .expect("execute"); + assert!( + round.stdout.contains("stub#0: hello from sub"), + "stdout: {:?}", + round.stdout + ); + assert_eq!(round.rpc_count, 1); + + let recorded = calls.lock().await; + assert_eq!(recorded.len(), 1); + match &recorded[0] { + RpcRequest::Llm { prompt, .. 
} => assert_eq!(prompt, "hello from sub"), + other => panic!("expected Llm request, got {other:?}"), + } + drop(recorded); + rt.shutdown().await; + } + #[tokio::test] async fn rpc_dispatcher_round_trips_batch() { let bridge = StubBridge::new(); diff --git a/crates/tui/src/rlm/mod.rs b/crates/tui/src/rlm/mod.rs index 44959983..4b48dc22 100644 --- a/crates/tui/src/rlm/mod.rs +++ b/crates/tui/src/rlm/mod.rs @@ -26,6 +26,7 @@ use crate::models::Usage; pub mod bridge; pub mod prompt; +pub mod session; pub mod turn; pub use bridge::RlmBridge; diff --git a/crates/tui/src/rlm/prompt.rs b/crates/tui/src/rlm/prompt.rs index 39f5d6cd..91db97e3 100644 --- a/crates/tui/src/rlm/prompt.rs +++ b/crates/tui/src/rlm/prompt.rs @@ -11,74 +11,90 @@ pub fn rlm_system_prompt() -> SystemPrompt { SystemPrompt::Text(RLM_SYSTEM_PROMPT.trim().to_string()) } -const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Model (RLM). Your input lives in a long-running Python REPL as a variable named `context` (alias `ctx`). You DO NOT see `context` in your prompt — only its length and a short preview. The only way to read or compute over it is to write Python code that runs in the REPL. +const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Model (RLM). The input is loaded into a long-running Python REPL. You hold a live context handle, not the raw body. Read only through bounded helpers, compute in Python, and delegate semantic judgment to child calls. + +The point is symbolic recursion. Keep the long prompt and large intermediate strings in REPL variables; the neural model should see metadata, bounded slices, code, and compact stdout. Do not copy the whole input into the root history, and do not verbalize a long list of child calls when Python can construct and launch them in a loop. The REPL exposes: -- `context` (alias `ctx`) — the full input string. Often huge — never `print(context)` in full. 
-- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. The `model` argument is accepted for compatibility but child calls stay pinned to the configured Flash child model. -- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. The `model` argument is accepted for compatibility but ignored. -- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. The `model` argument is accepted for compatibility but ignored. -- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. The `model` argument is accepted for compatibility but ignored. -- `chunk_context(max_chars=20000, overlap=0)` — full-coverage chunks with index/start/end/text fields. -- `chunk_coverage(chunks)` — coverage summary for chunks produced by `chunk_context`. -- `SHOW_VARS()` — list user variables and their types. -- `repl_set(name, value)` / `repl_get(name)` — explicit cross-round storage. -- `print(...)` — diagnostic output. The driver feeds you a truncated preview next round. -- `FINAL(value)` — end the loop with this string answer. -- `FINAL_VAR(name)` — end the loop with the value of a named variable. +- `context_meta()` - bounded metadata: char count, line count, preview, tail preview. +- `peek(start, end, unit="chars")` - bounded slice by char offsets or line numbers. +- `search(pattern, max_hits=100)` - regex search returning bounded hit records with snippets. +- `chunk(max_chars=20000, overlap=0)` - full-coverage chunks with index/start/end/text fields. +- `chunk_coverage(chunks)` - coverage summary for chunks produced by `chunk`. +- `sub_query(prompt, slice=None)` - one child LLM call, optionally scoped to one bounded slice. +- `sub_query_batch(prompt, slices)` - apply one prompt to many bounded slices concurrently. +- `sub_query_map(prompts, slices=None)` - run N distinct prompts, optionally paired with N bounded slices. 
+- `sub_rlm(prompt, source=None)` - recursive sub-RLM for a sub-task that needs its own decomposition. Pass a bounded source, not the whole body. +- `SHOW_VARS()` - list user variables and their types. +- `repl_set(name, value)` / `repl_get(name)` - explicit cross-round storage. +- `evaluate_progress()` - inspect whether a final answer exists and what variables are available. +- `finalize(value, confidence=None)` - end the loop with a final answer and optional confidence. +- `print(...)` - diagnostic output. The driver feeds you a truncated preview next round. -Variables, imports, and any other state PERSIST across rounds — the REPL is a single long-lived Python process for the whole turn. +Variables, imports, and any other state persist across rounds. There is no `context` or `ctx` variable. Use `peek`, `search`, `chunk`, and `context_meta`. -Contract — every turn, output ONE ` ```repl ` block of Python. That's it. No prose-only turns. No "I will do X" — just emit the code that does X. +Contract: every turn, output exactly one ` ```repl ` block of Python and nothing else. No prose-only turns. No "I will do X"; emit the code that does X. -Strategy patterns +Five-phase skeleton -1. PREVIEW first. +1. Load ```repl -print(f"len(context) = {len(context)}") -print(context[:500]) +meta = context_meta() +print(meta) ``` +Confirm the handle shape. Do not re-load the body. Keep the head small: names and metadata only. -2. CHUNK + map-reduce with batched concurrent calls. +2. Orient ```repl -chunk_size = 8000 -chunks = chunk_context(max_chars=chunk_size) +hits = search(r"term|phrase", max_hits=20) +sample = peek(0, min(meta["chars"], 1200)) +print({"hits": len(hits), "sample": sample[:300]}) +``` +Search before peeking. Pull only the slices you need. Store maps of the input as variables: headers, regions, sections, candidate spans. + +3. 
Compute +```repl +chunks = chunk(max_chars=12000, overlap=400) coverage = chunk_coverage(chunks) -prompts = [f"Extract any mentions of X from section {c['index']} ({c['start']}:{c['end']}):\n\n{c['text']}" for c in chunks] -partials = llm_query_batched(prompts) +partials = sub_query_batch( + "Extract the facts needed for the user's question from this slice. " + "Return only grounded facts and cite the slice index/range.", + chunks, +) +print({"coverage": coverage, "partials": len(partials)}) +``` +Use deterministic Python first for counts, regex, parsing, sorting, dedupe, joins, and coverage. You do NO math by asking a child model to count; if Python can enumerate, parse, or simulate it exactly, do that in Python. + +4. Recurse +```repl combined = "\n\n".join(partials) -answer = llm_query(f"Coverage: {coverage}\n\nSynthesize across these section-level extractions:\n\n{combined}") -print(answer[:500]) -``` -Then on the next turn: -```repl -FINAL(answer) +analysis = sub_rlm( + "Synthesize these section findings into a precise answer. " + "Call out conflicts and missing coverage.", + source=combined, +) +print(analysis[:800]) ``` +Use `sub_rlm` only when the sub-task itself needs decomposition or critique. Pass slices or compact variables, not the whole body. Memoize recursive results in variables. -3. RECURSIVE decomposition for hard sub-problems. +5. Converge ```repl -trend = rlm_query(f"Analyze this dataset and conclude with one word — up, down, or stable: {data}") -recommendation = "Hold" if "stable" in trend.lower() else ("Hedge" if "down" in trend.lower() else "Increase") -print(trend, "→", recommendation) -``` - -4. PROGRAMMATIC computation + LLM interpretation. -```repl -import math -theta = math.degrees(math.atan2(v_perp, v_parallel)) -final_answer = llm_query(f"Entry angle is {theta:.2f}°. 
Phrase the answer for a physics student.") -FINAL(final_answer) +progress = evaluate_progress() +finalize( + f"{analysis}\n\nCoverage: {coverage['covered_chars']}/{coverage['input_chars']} chars " + f"across {coverage['chunks']} chunks; complete={coverage['complete']}.", + confidence="medium" if coverage["complete"] else "low", +) ``` +Call `evaluate_progress()` if the answer is not stable. Loop back to Orient or Compute when coverage is incomplete or confidence is low. Call `finalize(...)` only when the answer is supported by variables you can inspect. Rules -- Emit exactly ONE ` ```repl ` block per turn. The block must contain Python code only. -- Never `print(context)` or otherwise dump it whole — slice, sample, or chunk. -- You MUST call `llm_query` / `llm_query_batched` / `rlm_query` at least once before `FINAL(...)`. Calling FINAL from a top-level prose answer (without ever running a `repl` block that touched `context` via a sub-LLM) is REJECTED — the driver will discard the FINAL and ask you to actually use the REPL. -- Sub-LLMs are powerful — feed them generous chunks (tens of thousands of chars), not tiny windows. -- For exact counts, package totals, line totals, or other structured aggregates, compute them with Python over `context` directly. Do not ask a child LLM to count. -- For whole-input map-reduce, report coverage in the final answer: chunks processed, total chunks, and whether every line/char range was included. If you only processed a subset, say that explicitly. -- Do NOT pad your output with prose like "Here is what I'll do:" — just emit the next ```repl block. +- Use the bounded helpers (`context_meta`, `peek`, `search`, `chunk`) to inspect input. +- Use `sub_query`, `sub_query_batch`, `sub_query_map`, or `sub_rlm` before finalizing unless the task is purely deterministic and fully computed in Python. +- End only by calling `finalize(value, confidence=...)`. +- For exact counts, totals, parsing, and structured aggregates, compute with Python. 
Do not ask a child LLM to count. +- For whole-input map-reduce, include coverage in the final answer: chunks processed, total chunks, and whether every char range was included. If you only processed a subset, say that explicitly. "#; #[cfg(test)] @@ -103,49 +119,66 @@ mod tests { } #[test] - fn rlm_prompt_mentions_context_variable() { - assert!(body().contains("`context`")); - } - - #[test] - fn rlm_prompt_mentions_ctx_alias() { - assert!(body().contains("`ctx`")); + fn rlm_prompt_uses_five_phase_skeleton() { + let s = body(); + for phase in ["Load", "Orient", "Compute", "Recurse", "Converge"] { + assert!(s.contains(phase), "system prompt missing phase: {phase}"); + } } #[test] fn rlm_prompt_mentions_all_helpers() { let s = body(); for name in [ - "llm_query", - "llm_query_batched", - "rlm_query", - "rlm_query_batched", - "chunk_context", + "peek", + "search", + "chunk", "chunk_coverage", + "context_meta", + "sub_query", + "sub_query_batch", + "sub_query_map", + "sub_rlm", + "finalize", + "evaluate_progress", "SHOW_VARS", - "FINAL", - "FINAL_VAR", ] { assert!(s.contains(name), "system prompt missing helper: {name}"); } } #[test] - fn rlm_prompt_forbids_prose_shortcut() { - // The new contract requires a sub-LLM call before FINAL — the - // prompt must say so explicitly so the model doesn't try to bail - // with FINAL("...inferred from preview..."). 
- assert!( - body().contains("REJECTED") || body().contains("rejected"), - "system prompt should reject the prose-shortcut path explicitly" - ); + fn rlm_prompt_does_not_publicize_context_variables() { + let s = body(); + assert!(s.contains("There is no `context` or `ctx` variable")); + assert!(!s.contains("len(context)")); + assert!(!s.contains("chunk_context")); + assert!(!s.contains("llm_query")); + assert!(!s.contains("rlm_query")); + } + + #[test] + fn rlm_prompt_is_finalize_only() { + let s = body(); + assert!(s.contains("finalize(value")); + assert!(!s.contains("FINAL_VAR")); + assert!(!s.contains("FINAL(value)")); + assert!(!s.contains("FINAL(")); } #[test] fn rlm_prompt_requires_deterministic_counts_and_coverage() { let s = body(); - assert!(s.contains("compute them with Python")); - assert!(s.contains("report coverage")); + assert!(s.contains("compute with Python")); + assert!(s.contains("include coverage")); assert!(s.contains("chunks processed")); } + + #[test] + fn rlm_prompt_mentions_symbolic_state_contract() { + let s = body(); + assert!(s.contains("symbolic recursion")); + assert!(s.contains("REPL variables")); + assert!(s.contains("Do not copy the whole input")); + } } diff --git a/crates/tui/src/rlm/session.rs b/crates/tui/src/rlm/session.rs new file mode 100644 index 00000000..71426863 --- /dev/null +++ b/crates/tui/src/rlm/session.rs @@ -0,0 +1,180 @@ +//! Persistent RLM session state for the v0.8.33 head/hands tool surface. 
+ +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use tokio::sync::Mutex; +use uuid::Uuid; + +use crate::repl::PythonRuntime; + +pub type SharedRlmSessionStore = Arc>>>>; + +#[must_use] +pub fn new_shared_rlm_session_store() -> SharedRlmSessionStore { + Arc::new(Mutex::new(HashMap::new())) +} + +#[derive(Debug)] +pub struct RlmSession { + pub name: String, + pub id: String, + pub kernel: Option, + pub context_meta: ContextMeta, + pub config: RlmSessionConfig, + pub rpc_count: u32, + pub total_duration: Duration, + pub peak_var_count: usize, + pub final_count: usize, + pub created_at: Instant, + pub last_used_at: Instant, + pub context_path: PathBuf, +} + +impl RlmSession { + #[must_use] + pub fn new( + name: String, + kernel: PythonRuntime, + context_meta: ContextMeta, + context_path: PathBuf, + ) -> Self { + let now = Instant::now(); + Self { + name, + id: format!("rlm:{}", Uuid::new_v4().simple()), + kernel: Some(kernel), + context_meta, + config: RlmSessionConfig::default(), + rpc_count: 0, + total_duration: Duration::ZERO, + peak_var_count: 0, + final_count: 0, + created_at: now, + last_used_at: now, + context_path, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ContextMeta { + pub length: usize, + #[serde(rename = "type")] + pub type_name: String, + pub preview_500: String, + pub sha256: String, +} + +impl ContextMeta { + #[must_use] + pub fn from_body(body: &str, type_name: impl Into) -> Self { + Self { + length: body.chars().count(), + type_name: type_name.into(), + preview_500: body.chars().take(500).collect(), + sha256: sha256_hex(body.as_bytes()), + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum OutputFeedback { + Full, + Metadata, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct 
RlmSessionConfig { + pub output_feedback: OutputFeedback, + pub sub_query_timeout_secs: u64, + pub sub_rlm_max_depth: u32, + pub share_session: bool, +} + +impl Default for RlmSessionConfig { + fn default() -> Self { + Self { + output_feedback: OutputFeedback::Full, + sub_query_timeout_secs: 120, + sub_rlm_max_depth: 1, + share_session: false, + } + } +} + +pub fn write_context_file(body: &str) -> std::io::Result { + let dir = std::env::temp_dir().join("deepseek_rlm_ctx"); + std::fs::create_dir_all(&dir)?; + let path = dir.join(format!( + "session_{}_{}.txt", + std::process::id(), + Uuid::new_v4().simple() + )); + std::fs::write(&path, body)?; + Ok(path) +} + +#[must_use] +pub fn derive_session_name(source_hint: Option<&str>) -> String { + let hint = source_hint + .and_then(|raw| { + Path::new(raw) + .file_name() + .and_then(|name| name.to_str()) + .or(Some(raw)) + }) + .unwrap_or("context"); + let mut out = String::new(); + for ch in hint.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + } else if !out.ends_with('_') { + out.push('_'); + } + if out.len() >= 48 { + break; + } + } + let out = out.trim_matches('_'); + if out.is_empty() { + "context".to_string() + } else { + out.to_string() + } +} + +fn sha256_hex(bytes: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(bytes); + format!("{:x}", hasher.finalize()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn derive_session_name_slugifies_path() { + assert_eq!( + derive_session_name(Some("src/Big File.rs")), + "big_file_rs".to_string() + ); + } + + #[test] + fn context_meta_hashes_and_previews_body() { + let meta = ContextMeta::from_body("abcdef", "text"); + assert_eq!(meta.length, 6); + assert_eq!(meta.preview_500, "abcdef"); + assert_eq!( + meta.sha256, + "bef57ec7f53a6d40beb640a780a639c83bc29ac8a9816f1fc6c5c6dcd93c4721" + ); + } +} diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 742ef440..b556e030 100644 
--- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -1960,6 +1960,8 @@ impl RuntimeThreadManager { active_thread_id: Some(thread.id.clone()), shell_manager: None, hook_executor: None, + handle_store: crate::tools::handle::new_shared_handle_store(), + rlm_sessions: crate::rlm::session::new_shared_rlm_session_store(), }, subagent_model_overrides: self.config.subagent_model_overrides(), memory_enabled: self.config.memory_enabled(), diff --git a/crates/tui/src/settings.rs b/crates/tui/src/settings.rs index 18fec2ed..0df883a7 100644 --- a/crates/tui/src/settings.rs +++ b/crates/tui/src/settings.rs @@ -211,7 +211,7 @@ pub struct Settings { pub default_mode: String, /// Sidebar width as percentage of terminal width pub sidebar_width_percent: u16, - /// Sidebar focus mode: auto, plan, todos, tasks, agents, context + /// Sidebar focus mode: auto, work, tasks, agents, context pub sidebar_focus: String, /// Enable the session-context panel (#504). Shows working set, tokens, /// cost, MCP/LSP status, cycle count, and memory info. @@ -555,13 +555,13 @@ impl Settings { "sidebar_focus" | "focus" => { let normalized = match value.trim().to_ascii_lowercase().as_str() { "auto" => "auto", - "plan" => "plan", - "todos" => "todos", + "work" | "plan" | "todos" => "work", "tasks" => "tasks", "agents" | "subagents" | "sub-agents" => "agents", + "context" | "session" => "context", _ => { anyhow::bail!( - "Failed to update setting: invalid sidebar focus '{value}'. Expected: auto, plan, todos, tasks, agents." + "Failed to update setting: invalid sidebar focus '{value}'. Expected: auto, work, tasks, agents, context." 
) } }; @@ -732,7 +732,7 @@ impl Settings { ("sidebar_width", "Sidebar width percentage: 10-50"), ( "sidebar_focus", - "Sidebar focus: auto, plan, todos, tasks, agents", + "Sidebar focus: auto, work, tasks, agents, context", ), ("cost_currency", "Cost display currency: usd, cny"), ("max_history", "Max input history entries"), @@ -886,8 +886,7 @@ fn normalize_background_color_setting(value: &str) -> Result> { fn normalize_sidebar_focus(value: &str) -> &str { match value.trim().to_ascii_lowercase().as_str() { - "plan" => "plan", - "todos" => "todos", + "work" | "plan" | "todos" => "work", "tasks" => "tasks", "agents" | "subagents" | "sub-agents" => "agents", "context" | "session" => "context", @@ -1008,6 +1007,28 @@ mod tests { assert!(err.to_string().contains("invalid cost currency")); } + #[test] + fn sidebar_focus_accepts_work_values_and_legacy_aliases() { + let mut settings = Settings::default(); + + settings.set("sidebar_focus", "work").expect("set work"); + assert_eq!(settings.sidebar_focus, "work"); + + settings.set("focus", "plan").expect("legacy plan alias"); + assert_eq!(settings.sidebar_focus, "work"); + + settings.set("focus", "todos").expect("legacy todos alias"); + assert_eq!(settings.sidebar_focus, "work"); + + settings.set("focus", "context").expect("context focus"); + assert_eq!(settings.sidebar_focus, "context"); + + let err = settings + .set("sidebar_focus", "classic") + .expect_err("classic is not a supported public focus"); + assert!(err.to_string().contains("invalid sidebar focus")); + } + #[test] fn display_localizes_header_and_config_file_label() { let settings = Settings::default(); diff --git a/crates/tui/src/tools/fetch_url.rs b/crates/tui/src/tools/fetch_url.rs index 57dfeb13..8c76ccea 100644 --- a/crates/tui/src/tools/fetch_url.rs +++ b/crates/tui/src/tools/fetch_url.rs @@ -7,6 +7,7 @@ //! (`format = "markdown"`); pass `format = "raw"` to keep the bytes intact //! when the model wants to do its own parsing. 
+use super::handle::query_jsonpath; use super::spec::{ ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, optional_u64, }; @@ -15,6 +16,7 @@ use async_trait::async_trait; use regex::Regex; use serde::Serialize; use serde_json::{Value, json}; +use std::collections::BTreeMap; use std::sync::OnceLock; use std::time::Duration; @@ -73,9 +75,12 @@ impl Format { struct FetchResponse { url: String, status: u16, + headers: BTreeMap, content_type: String, content: String, truncated: bool, + #[serde(skip_serializing_if = "Option::is_none")] + fields: Option>>, } pub struct FetchUrlTool; @@ -110,6 +115,11 @@ impl ToolSpec for FetchUrlTool { "timeout_ms": { "type": "integer", "description": "Request timeout in milliseconds (default 15,000; max 60,000)." + }, + "fields": { + "type": "array", + "items": { "type": "string" }, + "description": "Optional JSONPath projections for JSON responses. Supports $, .field, [index], [*], and ['field']; returns matches under `fields`." 
} }, "required": ["url"] @@ -146,6 +156,7 @@ impl ToolSpec for FetchUrlTool { let max_bytes = optional_u64(&input, "max_bytes", DEFAULT_MAX_BYTES).min(HARD_MAX_BYTES); let timeout_ms = optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(HARD_MAX_TIMEOUT_MS); + let requested_fields = parse_fields(&input)?; let mut current_url = reqwest::Url::parse(&url) .map_err(|e| ToolError::invalid_input(format!("invalid URL: {e}")))?; let mut redirects_followed = 0usize; @@ -202,6 +213,7 @@ impl ToolSpec for FetchUrlTool { .and_then(|v| v.to_str().ok()) .unwrap_or("application/octet-stream") .to_string(); + let headers = response_headers(resp.headers()); let bytes = resp .bytes() @@ -216,6 +228,7 @@ impl ToolSpec for FetchUrlTool { }; let body_text = String::from_utf8_lossy(usable).to_string(); + let fields = project_json_fields(&body_text, &content_type, &requested_fields)?; let processed = match format { Format::Raw => body_text, Format::Text | Format::Markdown => { @@ -230,9 +243,11 @@ impl ToolSpec for FetchUrlTool { let response = FetchResponse { url: final_url, status: status.as_u16(), + headers, content_type, content: processed, truncated, + fields, }; if !status.is_success() { @@ -386,6 +401,66 @@ fn validate_dns_resolved_ip( ))) } +fn parse_fields(input: &Value) -> Result, ToolError> { + let Some(values) = input.get("fields") else { + return Ok(Vec::new()); + }; + let Some(values) = values.as_array() else { + return Err(ToolError::invalid_input("`fields` must be an array")); + }; + let mut fields = Vec::new(); + for value in values { + let Some(field) = value.as_str() else { + return Err(ToolError::invalid_input( + "`fields` entries must be JSONPath strings", + )); + }; + let field = field.trim(); + if !field.is_empty() { + fields.push(field.to_string()); + } + } + Ok(fields) +} + +fn response_headers(headers: &reqwest::header::HeaderMap) -> BTreeMap { + headers + .iter() + .filter_map(|(name, value)| { + value + .to_str() + .ok() + .map(|value| 
(name.as_str().to_ascii_lowercase(), value.to_string())) + }) + .collect() +} + +fn project_json_fields( + body_text: &str, + content_type: &str, + fields: &[String], +) -> Result>>, ToolError> { + if fields.is_empty() { + return Ok(None); + } + if !content_type.to_ascii_lowercase().contains("json") { + return Err(ToolError::invalid_input( + "`fields` can only be used with JSON responses", + )); + } + let body_json: Value = serde_json::from_str(body_text).map_err(|e| { + ToolError::execution_failed(format!("response body is not valid JSON for `fields`: {e}")) + })?; + let mut out = BTreeMap::new(); + for field in fields { + let matches = query_jsonpath(&body_json, field).map_err(|e| { + ToolError::invalid_input(format!("invalid JSONPath `{field}` in `fields`: {e}")) + })?; + out.insert(field.clone(), matches); + } + Ok(Some(out)) +} + /// Strip `