diff --git a/crates/tui/src/repl/runtime.rs b/crates/tui/src/repl/runtime.rs index 626a0e7f..3b5cdfc9 100644 --- a/crates/tui/src/repl/runtime.rs +++ b/crates/tui/src/repl/runtime.rs @@ -495,7 +495,7 @@ def _rpc(req): return {"error": f"unexpected protocol line: {line[:120]!r}"} def llm_query(prompt, model=None, max_tokens=None, system=None): - """One-shot sub-LLM call. Returns the completion text as a string.""" + """One-shot sub-LLM call. The model arg is accepted for compatibility but ignored by Rust.""" resp = _rpc({"type":"llm","prompt":str(prompt),"model":model, "max_tokens":max_tokens,"system":system}) if isinstance(resp, dict) and resp.get("error"): @@ -505,7 +505,7 @@ def llm_query(prompt, model=None, max_tokens=None, system=None): return str(resp) def llm_query_batched(prompts, model=None): - """Run multiple sub-LLM calls concurrently. Returns a list of strings.""" + """Run multiple sub-LLM calls concurrently. The model arg is accepted for compatibility but ignored.""" if not isinstance(prompts, (list, tuple)): return ["[llm_query_batched: prompts must be a list]"] resp = _rpc({"type":"llm_batch","prompts":[str(p) for p in prompts],"model":model}) @@ -523,7 +523,7 @@ def llm_query_batched(prompts, model=None): return out def rlm_query(prompt, model=None): - """Recursive sub-RLM (paper's `sub_RLM`). Each call gets its own REPL.""" + """Recursive sub-RLM. The model arg is accepted for compatibility but ignored by Rust.""" resp = _rpc({"type":"rlm","prompt":str(prompt),"model":model}) if isinstance(resp, dict) and resp.get("error"): return f"[rlm_query error: {resp['error']}]" @@ -532,7 +532,7 @@ def rlm_query(prompt, model=None): return str(resp) def rlm_query_batched(prompts, model=None): - """Run multiple recursive sub-RLMs in parallel.""" + """Run multiple recursive sub-RLMs in parallel. The model arg is accepted for compatibility but ignored.""" if not isinstance(prompts, (list, tuple)): return ["[rlm_query_batched: prompts must be a list]"] resp = _rpc({"type":"rlm_batch","prompts":[str(p) for p in prompts],"model":model}) diff --git a/crates/tui/src/rlm/bridge.rs b/crates/tui/src/rlm/bridge.rs index 36d8a9dc..904ddef7 100644 --- a/crates/tui/src/rlm/bridge.rs +++ b/crates/tui/src/rlm/bridge.rs @@ -86,14 +86,16 @@ impl RlmBridge { async fn dispatch_llm( &self, prompt: String, - model: Option, + _model: Option, max_tokens: Option, system: Option, ) -> SingleResp { let request = MessageRequest { - model: model - .filter(|m| !m.is_empty()) - .unwrap_or_else(|| self.child_model.clone()), + // The Python helper accepts `model=` for older snippets, but it is + // intentionally not authoritative. RLM child calls are pinned to + // the tool's configured child model so model-generated Python + // cannot silently upgrade cheap fanout work to an expensive model. + model: self.child_model.clone(), messages: vec![Message { role: "user".to_string(), content: vec![ContentBlock::Text { @@ -150,16 +152,12 @@ impl RlmBridge { SingleResp { text, error: None } } - async fn dispatch_llm_batch(&self, prompts: Vec, model: Option) -> BatchResp { + async fn dispatch_llm_batch(&self, prompts: Vec, _model: Option) -> BatchResp { if let Some(resp) = batch_guard(prompts.len()) { return resp; } - let model = Arc::new( - model - .filter(|m| !m.is_empty()) - .unwrap_or_else(|| self.child_model.clone()), - ); + let model = Arc::new(self.child_model.clone()); let futures = prompts.into_iter().map(|prompt| { let model = Arc::clone(&model); @@ -174,12 +172,12 @@ impl RlmBridge { } } - async fn dispatch_rlm(&self, prompt: String, model: Option) -> SingleResp { + async fn dispatch_rlm(&self, prompt: String, _model: Option) -> SingleResp { if self.depth_remaining == 0 { // Budget exhausted — fall back to a one-shot child completion // rather than returning an error. Matches the paper's behaviour // ("sub_RLM gracefully degrades to llm_query at depth=0"). - return self.dispatch_llm(prompt, model, None, None).await; + return self.dispatch_llm(prompt, None, None, None).await; } // Build a drain channel to absorb status events from the nested @@ -192,9 +190,7 @@ impl RlmBridge { async move { while rx.recv().await.is_some() {} }, ); - let child_model = model - .filter(|m| !m.is_empty()) - .unwrap_or_else(|| self.child_model.clone()); + let child_model = self.child_model.clone(); // Recursive call. The dyn-erasure on `run_rlm_turn_inner` breaks // the `bridge → turn → bridge` opaque-future cycle. @@ -223,16 +219,14 @@ impl RlmBridge { } } - async fn dispatch_rlm_batch(&self, prompts: Vec, model: Option) -> BatchResp { + async fn dispatch_rlm_batch(&self, prompts: Vec, _model: Option) -> BatchResp { if let Some(resp) = batch_guard(prompts.len()) { return resp; } - let model = Arc::new(model); - let futures = prompts.into_iter().map(|p| { - let model = Arc::clone(&model); - async move { self.dispatch_rlm(p, (*model).clone()).await } - }); + let futures = prompts + .into_iter() + .map(|p| async move { self.dispatch_rlm(p, None).await }); BatchResp { results: join_all(futures).await, } @@ -341,7 +335,7 @@ mod tests { } #[tokio::test] - async fn llm_dispatch_uses_trait_backed_mock_client() { + async fn llm_dispatch_pins_configured_child_model() { let mock = Arc::new(MockLlmClient::new(Vec::new())); mock.push_message_response(mock_response("child answer", 7, 11)); let bridge = bridge_for(Arc::clone(&mock), 1); @@ -365,7 +359,7 @@ mod tests { let captured = mock.captured_requests(); assert_eq!(captured.len(), 1); - assert_eq!(captured[0].model, "override-model"); + assert_eq!(captured[0].model, "child-model"); assert_eq!(captured[0].max_tokens, 123); assert_eq!( captured[0].system, @@ -378,7 +372,7 @@ mod tests { } #[tokio::test] - async fn llm_batch_dispatch_preserves_result_count_and_usage() { + async fn llm_batch_dispatch_pins_configured_child_model() { let mock = Arc::new(MockLlmClient::new(Vec::new())); mock.push_message_response(mock_response("one", 1, 2)); mock.push_message_response(mock_response("two", 3, 4)); @@ -410,7 +404,7 @@ mod tests { assert!( captured .iter() - .all(|request| request.model == "batch-model") + .all(|request| request.model == "child-model") ); let usage = bridge.usage.lock().await; @@ -419,7 +413,7 @@ mod tests { } #[tokio::test] - async fn rlm_dispatch_at_depth_zero_falls_back_to_plain_llm_query() { + async fn rlm_dispatch_at_depth_zero_pins_configured_child_model() { let mock = Arc::new(MockLlmClient::new(Vec::new())); mock.push_message_response(mock_response("fallback answer", 3, 5)); let bridge = bridge_for(Arc::clone(&mock), 0); @@ -445,6 +439,6 @@ mod tests { let captured = mock.captured_requests(); assert_eq!(captured.len(), 1); - assert_eq!(captured[0].model, "override-model"); + assert_eq!(captured[0].model, "child-model"); } } diff --git a/crates/tui/src/rlm/prompt.rs b/crates/tui/src/rlm/prompt.rs index ba003cb9..5553d2cf 100644 --- a/crates/tui/src/rlm/prompt.rs +++ b/crates/tui/src/rlm/prompt.rs @@ -15,10 +15,10 @@ const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Mode The REPL exposes: - `context` (alias `ctx`) — the full input string. Often huge — never `print(context)` in full. -- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. -- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. -- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. -- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. +- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. The `model` argument is accepted for compatibility but child calls stay pinned to the configured Flash child model. +- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. The `model` argument is accepted for compatibility but ignored. +- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. The `model` argument is accepted for compatibility but ignored. +- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. The `model` argument is accepted for compatibility but ignored. - `SHOW_VARS()` — list user variables and their types. - `repl_set(name, value)` / `repl_get(name)` — explicit cross-round storage. - `print(...)` — diagnostic output. The driver feeds you a truncated preview next round. diff --git a/crates/tui/src/rlm/turn.rs b/crates/tui/src/rlm/turn.rs index c97f5ad1..16504fb8 100644 --- a/crates/tui/src/rlm/turn.rs +++ b/crates/tui/src/rlm/turn.rs @@ -588,11 +588,21 @@ fn build_metadata_message( parts.push("**REPL helpers** (use inside ```repl blocks)".to_string()); parts.push("- `context` / `ctx` — the full input string".to_string()); parts.push("- `len(context)` / `context[a:b]` / `context.splitlines()` — slice it".to_string()); - parts.push("- `llm_query(prompt, model=None)` — one-shot child LLM".to_string()); - parts.push("- `llm_query_batched([p1, p2, ...])` — concurrent fan-out".to_string()); - parts.push("- `rlm_query(prompt, model=None)` — recursive sub-RLM".to_string()); parts.push( - "- `rlm_query_batched([p1, p2, ...])` — concurrent recursive sub-RLMs".to_string(), + "- `llm_query(prompt, model=None)` — one-shot child LLM; `model` is ignored and child calls stay pinned to Flash" + .to_string(), + ); + parts.push( + "- `llm_query_batched([p1, p2, ...])` — concurrent fan-out; `model` is ignored" + .to_string(), + ); + parts.push( + "- `rlm_query(prompt, model=None)` — recursive sub-RLM; `model` is ignored" + .to_string(), + ); + parts.push( + "- `rlm_query_batched([p1, p2, ...])` — concurrent recursive sub-RLMs; `model` is ignored" + .to_string(), ); parts.push("- `SHOW_VARS()` — list user variables".to_string()); parts.push("- `repl_set(name, value)` / `repl_get(name)` — explicit store".to_string()); diff --git a/crates/tui/src/tools/rlm.rs b/crates/tui/src/tools/rlm.rs index 5cc641ab..8881c8ce 100644 --- a/crates/tui/src/tools/rlm.rs +++ b/crates/tui/src/tools/rlm.rs @@ -93,10 +93,6 @@ impl ToolSpec for RlmTool { "type": "string", "description": "Inline content to load as PROMPT. Use only when the input isn't a file you can point at. Capped at 200k chars." }, - "child_model": { - "type": "string", - "description": "Model for sub-LLM (`llm_query`) calls inside the REPL. Default: deepseek-v4-flash." - }, "max_depth": { "type": "integer", "description": "Recursion budget for `sub_rlm()` calls. 0 disables recursion; default 1 matches paper experiments." @@ -182,12 +178,10 @@ impl ToolSpec for RlmTool { )); } - let child_model = input - .get("child_model") - .and_then(|v| v.as_str()) - .filter(|s| !s.is_empty()) - .unwrap_or(DEFAULT_CHILD_MODEL) - .to_string(); + // Pin child calls to Flash so model-generated tool args cannot quietly + // turn fanout work into Pro-billed requests. The RLM root still uses + // the session model; child helper calls are the cheap batch layer. + let child_model = DEFAULT_CHILD_MODEL.to_string(); let max_depth = input .get("max_depth") @@ -353,7 +347,6 @@ mod tests { assert!(schema["properties"]["task"].is_object()); assert!(schema["properties"]["file_path"].is_object()); assert!(schema["properties"]["content"].is_object()); - assert!(schema["properties"]["child_model"].is_object()); assert!(schema["properties"]["max_depth"].is_object()); let required = schema["required"].as_array().unwrap(); assert!(required.iter().any(|v| v == "task")); diff --git a/docs/MODES.md b/docs/MODES.md index c7a76135..7b320294 100644 --- a/docs/MODES.md +++ b/docs/MODES.md @@ -16,7 +16,7 @@ Press `Shift+Tab` to cycle reasoning effort. - **Agent**: multi-step tool use. Approvals for shell and paid tools (file writes are allowed without a prompt). - **YOLO**: enables shell + trust mode and auto-approves all tools. Use only in trusted repos. -All three modes have access to the `rlm_query` tool — a structured tool call that fans out 1–16 cheap parallel children on `deepseek-v4-flash`. The model reaches for it when work is decomposable. +All three modes have access to the `rlm` tool. Inside its Python REPL, `llm_query_batched` fans out 1–16 cheap parallel child calls pinned to `deepseek-v4-flash`. The model reaches for it when work is decomposable. ## Compatibility Notes diff --git a/docs/TOOL_SURFACE.md b/docs/TOOL_SURFACE.md index b58470c7..68cece8d 100644 --- a/docs/TOOL_SURFACE.md +++ b/docs/TOOL_SURFACE.md @@ -151,11 +151,11 @@ reflect very different cost classes: | Tool | What each child does | Wall-clock | Token cost | Cap | |---|---|---|---|---| | `agent_spawn` | Full sub-agent loop (planning, tool calls, multi-turn streaming, can spawn children) | minutes | thousands of tokens | 10 in flight by default (`[subagents].max_concurrent`, hard ceiling 20) | -| `rlm_query` | One-shot non-streaming Chat Completions call to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call | +| `rlm` helper `llm_query_batched` | One-shot non-streaming Chat Completions calls pinned to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call | The caps appear in each tool's description and error messages so the model (and the user) can choose the right tool for the job. If one sub-agent is -enough but you need parallel lookups, prefer `rlm_query`; if each task needs +enough but you need parallel lookups, prefer `rlm` with `llm_query_batched`; if each task needs its own tool-carrying agent loop, use `agent_spawn` (and cancel completed ones to free slots).