fix(rlm): pin child calls to flash (#832)

2026-05-06 03:41:47 -05:00
parent 69714819f8
commit 03e59c60ce
7 changed files with 50 additions and 53 deletions
@@ -495,7 +495,7 @@ def _rpc(req):
    return {"error": f"unexpected protocol line: {line[:120]!r}"}

 def llm_query(prompt, model=None, max_tokens=None, system=None):
-    """One-shot sub-LLM call. Returns the completion text as a string."""
+    """One-shot sub-LLM call. Returns the completion text as a string. The model arg is accepted for compatibility but ignored by the Rust bridge."""
    resp = _rpc({"type":"llm","prompt":str(prompt),"model":model,
                 "max_tokens":max_tokens,"system":system})
    if isinstance(resp, dict) and resp.get("error"):
@@ -505,7 +505,7 @@ def llm_query(prompt, model=None, max_tokens=None, system=None):
    return str(resp)

 def llm_query_batched(prompts, model=None):
-    """Run multiple sub-LLM calls concurrently. Returns a list of strings."""
+    """Run multiple sub-LLM calls concurrently. Returns a list of strings. The model arg is accepted for compatibility but ignored."""
    if not isinstance(prompts, (list, tuple)):
        return ["[llm_query_batched: prompts must be a list]"]
    resp = _rpc({"type":"llm_batch","prompts":[str(p) for p in prompts],"model":model})
@@ -523,7 +523,7 @@ def llm_query_batched(prompts, model=None):
    return out

 def rlm_query(prompt, model=None):
-    """Recursive sub-RLM (paper's `sub_RLM`). Each call gets its own REPL."""
+    """Recursive sub-RLM. The model arg is accepted for compatibility but ignored by Rust."""
    resp = _rpc({"type":"rlm","prompt":str(prompt),"model":model})
    if isinstance(resp, dict) and resp.get("error"):
        return f"[rlm_query error: {resp['error']}]"
@@ -532,7 +532,7 @@ def rlm_query(prompt, model=None):
    return str(resp)

 def rlm_query_batched(prompts, model=None):
-    """Run multiple recursive sub-RLMs in parallel."""
+    """Run multiple recursive sub-RLMs in parallel. The model arg is accepted for compatibility but ignored."""
    if not isinstance(prompts, (list, tuple)):
        return ["[rlm_query_batched: prompts must be a list]"]
    resp = _rpc({"type":"rlm_batch","prompts":[str(p) for p in prompts],"model":model})
@@ -86,14 +86,16 @@ impl RlmBridge {
    async fn dispatch_llm(
        &self,
        prompt: String,
-        model: Option<String>,
+        _model: Option<String>,
        max_tokens: Option<u32>,
        system: Option<String>,
    ) -> SingleResp {
        let request = MessageRequest {
-            model: model
-                .filter(|m| !m.is_empty())
-                .unwrap_or_else(|| self.child_model.clone()),
+            // The Python helper accepts `model=` for older snippets, but it is
+            // intentionally not authoritative. RLM child calls are pinned to
+            // the tool's configured child model so model-generated Python
+            // cannot silently upgrade cheap fanout work to an expensive model.
+            model: self.child_model.clone(),
            messages: vec![Message {
                role: "user".to_string(),
                content: vec![ContentBlock::Text {
@@ -150,16 +152,12 @@ impl RlmBridge {
        SingleResp { text, error: None }
    }

-    async fn dispatch_llm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
+    async fn dispatch_llm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
        if let Some(resp) = batch_guard(prompts.len()) {
            return resp;
        }

-        let model = Arc::new(
-            model
-                .filter(|m| !m.is_empty())
-                .unwrap_or_else(|| self.child_model.clone()),
-        );
+        let model = Arc::new(self.child_model.clone());

        let futures = prompts.into_iter().map(|prompt| {
            let model = Arc::clone(&model);
@@ -174,12 +172,12 @@ impl RlmBridge {
        }
    }

-    async fn dispatch_rlm(&self, prompt: String, model: Option<String>) -> SingleResp {
+    async fn dispatch_rlm(&self, prompt: String, _model: Option<String>) -> SingleResp {
        if self.depth_remaining == 0 {
            // Budget exhausted — fall back to a one-shot child completion
            // rather than returning an error. Matches the paper's behaviour
            // ("sub_RLM gracefully degrades to llm_query at depth=0").
-            return self.dispatch_llm(prompt, model, None, None).await;
+            return self.dispatch_llm(prompt, None, None, None).await;
        }

        // Build a drain channel to absorb status events from the nested
@@ -192,9 +190,7 @@ impl RlmBridge {
            async move { while rx.recv().await.is_some() {} },
        );

-        let child_model = model
-            .filter(|m| !m.is_empty())
-            .unwrap_or_else(|| self.child_model.clone());
+        let child_model = self.child_model.clone();

        // Recursive call. The dyn-erasure on `run_rlm_turn_inner` breaks
        // the `bridge → turn → bridge` opaque-future cycle.
@@ -223,16 +219,14 @@ impl RlmBridge {
        }
    }

-    async fn dispatch_rlm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
+    async fn dispatch_rlm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
        if let Some(resp) = batch_guard(prompts.len()) {
            return resp;
        }

-        let model = Arc::new(model);
-        let futures = prompts.into_iter().map(|p| {
-            let model = Arc::clone(&model);
-            async move { self.dispatch_rlm(p, (*model).clone()).await }
-        });
+        let futures = prompts
+            .into_iter()
+            .map(|p| async move { self.dispatch_rlm(p, None).await });
        BatchResp {
            results: join_all(futures).await,
        }
@@ -341,7 +335,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn llm_dispatch_uses_trait_backed_mock_client() {
+    async fn llm_dispatch_pins_configured_child_model() {
        let mock = Arc::new(MockLlmClient::new(Vec::new()));
        mock.push_message_response(mock_response("child answer", 7, 11));
        let bridge = bridge_for(Arc::clone(&mock), 1);
@@ -365,7 +359,7 @@ mod tests {

        let captured = mock.captured_requests();
        assert_eq!(captured.len(), 1);
-        assert_eq!(captured[0].model, "override-model");
+        assert_eq!(captured[0].model, "child-model");
        assert_eq!(captured[0].max_tokens, 123);
        assert_eq!(
            captured[0].system,
@@ -378,7 +372,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn llm_batch_dispatch_preserves_result_count_and_usage() {
+    async fn llm_batch_dispatch_pins_configured_child_model() {
        let mock = Arc::new(MockLlmClient::new(Vec::new()));
        mock.push_message_response(mock_response("one", 1, 2));
        mock.push_message_response(mock_response("two", 3, 4));
@@ -410,7 +404,7 @@ mod tests {
        assert!(
            captured
                .iter()
-                .all(|request| request.model == "batch-model")
+                .all(|request| request.model == "child-model")
        );

        let usage = bridge.usage.lock().await;
@@ -419,7 +413,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn rlm_dispatch_at_depth_zero_falls_back_to_plain_llm_query() {
+    async fn rlm_dispatch_at_depth_zero_pins_configured_child_model() {
        let mock = Arc::new(MockLlmClient::new(Vec::new()));
        mock.push_message_response(mock_response("fallback answer", 3, 5));
        let bridge = bridge_for(Arc::clone(&mock), 0);
@@ -445,6 +439,6 @@ mod tests {

        let captured = mock.captured_requests();
        assert_eq!(captured.len(), 1);
-        assert_eq!(captured[0].model, "override-model");
+        assert_eq!(captured[0].model, "child-model");
    }
 }
@@ -15,10 +15,10 @@ const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Mode

 The REPL exposes:
 - `context` (alias `ctx`) — the full input string. Often huge — never `print(context)` in full.
- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work.
- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order.
- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition.
- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs.
+- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. The `model` argument is accepted for compatibility but child calls stay pinned to the configured Flash child model.
+- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. The `model` argument is accepted for compatibility but ignored.
+- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. The `model` argument is accepted for compatibility but ignored.
+- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. The `model` argument is accepted for compatibility but ignored.
 - `SHOW_VARS()` — list user variables and their types.
 - `repl_set(name, value)` / `repl_get(name)` — explicit cross-round storage.
 - `print(...)` — diagnostic output. The driver feeds you a truncated preview next round.
@@ -588,11 +588,21 @@ fn build_metadata_message(
    parts.push("**REPL helpers** (use inside ```repl blocks)".to_string());
    parts.push("- `context` / `ctx`                       — the full input string".to_string());
    parts.push("- `len(context)` / `context[a:b]` / `context.splitlines()` — slice it".to_string());
-    parts.push("- `llm_query(prompt, model=None)`        — one-shot child LLM".to_string());
-    parts.push("- `llm_query_batched([p1, p2, ...])`     — concurrent fan-out".to_string());
-    parts.push("- `rlm_query(prompt, model=None)`        — recursive sub-RLM".to_string());
    parts.push(
-        "- `rlm_query_batched([p1, p2, ...])`     — concurrent recursive sub-RLMs".to_string(),
+        "- `llm_query(prompt, model=None)`        — one-shot child LLM; `model` is ignored and child calls stay pinned to Flash"
+            .to_string(),
+    );
+    parts.push(
+        "- `llm_query_batched([p1, p2, ...])`     — concurrent fan-out; `model` is ignored"
+            .to_string(),
+    );
+    parts.push(
+        "- `rlm_query(prompt, model=None)`        — recursive sub-RLM; `model` is ignored"
+            .to_string(),
+    );
+    parts.push(
+        "- `rlm_query_batched([p1, p2, ...])`     — concurrent recursive sub-RLMs; `model` is ignored"
+            .to_string(),
    );
    parts.push("- `SHOW_VARS()`                          — list user variables".to_string());
    parts.push("- `repl_set(name, value)` / `repl_get(name)` — explicit store".to_string());
@@ -93,10 +93,6 @@ impl ToolSpec for RlmTool {
                    "type": "string",
                    "description": "Inline content to load as PROMPT. Use only when the input isn't a file you can point at. Capped at 200k chars."
                },
-                "child_model": {
-                    "type": "string",
-                    "description": "Model for sub-LLM (`llm_query`) calls inside the REPL. Default: deepseek-v4-flash."
-                },
                "max_depth": {
                    "type": "integer",
                    "description": "Recursion budget for `sub_rlm()` calls. 0 disables recursion; default 1 matches paper experiments."
@@ -182,12 +178,10 @@ impl ToolSpec for RlmTool {
            ));
        }

-        let child_model = input
-            .get("child_model")
-            .and_then(|v| v.as_str())
-            .filter(|s| !s.is_empty())
-            .unwrap_or(DEFAULT_CHILD_MODEL)
-            .to_string();
+        // Pin child calls to Flash so model-generated tool args cannot quietly
+        // turn fanout work into Pro-billed requests. The RLM root still uses
+        // the session model; child helper calls are the cheap batch layer.
+        let child_model = DEFAULT_CHILD_MODEL.to_string();

        let max_depth = input
            .get("max_depth")
@@ -353,7 +347,6 @@ mod tests {
        assert!(schema["properties"]["task"].is_object());
        assert!(schema["properties"]["file_path"].is_object());
        assert!(schema["properties"]["content"].is_object());
-        assert!(schema["properties"]["child_model"].is_object());
        assert!(schema["properties"]["max_depth"].is_object());
        let required = schema["required"].as_array().unwrap();
        assert!(required.iter().any(|v| v == "task"));
@@ -16,7 +16,7 @@ Press `Shift+Tab` to cycle reasoning effort.
 - **Agent**: multi-step tool use. Approvals for shell and paid tools (file writes are allowed without a prompt).
 - **YOLO**: enables shell + trust mode and auto-approves all tools. Use only in trusted repos.

-All three modes have access to the `rlm_query` tool — a structured tool call that fans out 1–16 cheap parallel children on `deepseek-v4-flash`. The model reaches for it when work is decomposable.
+All three modes have access to the `rlm` tool. Inside its Python REPL, `llm_query_batched` fans out 1–16 cheap parallel child calls pinned to `deepseek-v4-flash`. The model reaches for the `rlm` tool when work is decomposable.

 ## Compatibility Notes

@@ -151,11 +151,11 @@ reflect very different cost classes:
 | Tool | What each child does | Wall-clock | Token cost | Cap |
 |---|---|---|---|---|
 | `agent_spawn` | Full sub-agent loop (planning, tool calls, multi-turn streaming, can spawn children) | minutes | thousands of tokens | 10 in flight by default (`[subagents].max_concurrent`, hard ceiling 20) |
-| `rlm_query` | One-shot non-streaming Chat Completions call to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |
+| `rlm` helper `llm_query_batched` | One-shot non-streaming Chat Completions calls pinned to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |

 The caps appear in each tool's description and error messages so the model
 (and the user) can choose the right tool for the job. If one sub-agent is
-enough but you need parallel lookups, prefer `rlm_query`; if each task needs
+enough but you need parallel lookups, prefer `rlm` with `llm_query_batched`; if each task needs
 its own tool-carrying agent loop, use `agent_spawn` (and cancel completed
 ones to free slots).