fix(rlm): pin child calls to flash (#832)
This commit is contained in:
@@ -495,7 +495,7 @@ def _rpc(req):
|
||||
return {"error": f"unexpected protocol line: {line[:120]!r}"}
|
||||
|
||||
def llm_query(prompt, model=None, max_tokens=None, system=None):
|
||||
"""One-shot sub-LLM call. Returns the completion text as a string."""
|
||||
"""One-shot sub-LLM call. The model arg is accepted for compatibility but ignored by Rust."""
|
||||
resp = _rpc({"type":"llm","prompt":str(prompt),"model":model,
|
||||
"max_tokens":max_tokens,"system":system})
|
||||
if isinstance(resp, dict) and resp.get("error"):
|
||||
@@ -505,7 +505,7 @@ def llm_query(prompt, model=None, max_tokens=None, system=None):
|
||||
return str(resp)
|
||||
|
||||
def llm_query_batched(prompts, model=None):
|
||||
"""Run multiple sub-LLM calls concurrently. Returns a list of strings."""
|
||||
"""Run multiple sub-LLM calls concurrently. The model arg is accepted for compatibility but ignored."""
|
||||
if not isinstance(prompts, (list, tuple)):
|
||||
return ["[llm_query_batched: prompts must be a list]"]
|
||||
resp = _rpc({"type":"llm_batch","prompts":[str(p) for p in prompts],"model":model})
|
||||
@@ -523,7 +523,7 @@ def llm_query_batched(prompts, model=None):
|
||||
return out
|
||||
|
||||
def rlm_query(prompt, model=None):
|
||||
"""Recursive sub-RLM (paper's `sub_RLM`). Each call gets its own REPL."""
|
||||
"""Recursive sub-RLM. The model arg is accepted for compatibility but ignored by Rust."""
|
||||
resp = _rpc({"type":"rlm","prompt":str(prompt),"model":model})
|
||||
if isinstance(resp, dict) and resp.get("error"):
|
||||
return f"[rlm_query error: {resp['error']}]"
|
||||
@@ -532,7 +532,7 @@ def rlm_query(prompt, model=None):
|
||||
return str(resp)
|
||||
|
||||
def rlm_query_batched(prompts, model=None):
|
||||
"""Run multiple recursive sub-RLMs in parallel."""
|
||||
"""Run multiple recursive sub-RLMs in parallel. The model arg is accepted for compatibility but ignored."""
|
||||
if not isinstance(prompts, (list, tuple)):
|
||||
return ["[rlm_query_batched: prompts must be a list]"]
|
||||
resp = _rpc({"type":"rlm_batch","prompts":[str(p) for p in prompts],"model":model})
|
||||
|
||||
@@ -86,14 +86,16 @@ impl RlmBridge {
|
||||
async fn dispatch_llm(
|
||||
&self,
|
||||
prompt: String,
|
||||
model: Option<String>,
|
||||
_model: Option<String>,
|
||||
max_tokens: Option<u32>,
|
||||
system: Option<String>,
|
||||
) -> SingleResp {
|
||||
let request = MessageRequest {
|
||||
model: model
|
||||
.filter(|m| !m.is_empty())
|
||||
.unwrap_or_else(|| self.child_model.clone()),
|
||||
// The Python helper accepts `model=` for older snippets, but it is
|
||||
// intentionally not authoritative. RLM child calls are pinned to
|
||||
// the tool's configured child model so model-generated Python
|
||||
// cannot silently upgrade cheap fanout work to an expensive model.
|
||||
model: self.child_model.clone(),
|
||||
messages: vec![Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
@@ -150,16 +152,12 @@ impl RlmBridge {
|
||||
SingleResp { text, error: None }
|
||||
}
|
||||
|
||||
async fn dispatch_llm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
|
||||
async fn dispatch_llm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
|
||||
if let Some(resp) = batch_guard(prompts.len()) {
|
||||
return resp;
|
||||
}
|
||||
|
||||
let model = Arc::new(
|
||||
model
|
||||
.filter(|m| !m.is_empty())
|
||||
.unwrap_or_else(|| self.child_model.clone()),
|
||||
);
|
||||
let model = Arc::new(self.child_model.clone());
|
||||
|
||||
let futures = prompts.into_iter().map(|prompt| {
|
||||
let model = Arc::clone(&model);
|
||||
@@ -174,12 +172,12 @@ impl RlmBridge {
|
||||
}
|
||||
}
|
||||
|
||||
async fn dispatch_rlm(&self, prompt: String, model: Option<String>) -> SingleResp {
|
||||
async fn dispatch_rlm(&self, prompt: String, _model: Option<String>) -> SingleResp {
|
||||
if self.depth_remaining == 0 {
|
||||
// Budget exhausted — fall back to a one-shot child completion
|
||||
// rather than returning an error. Matches the paper's behaviour
|
||||
// ("sub_RLM gracefully degrades to llm_query at depth=0").
|
||||
return self.dispatch_llm(prompt, model, None, None).await;
|
||||
return self.dispatch_llm(prompt, None, None, None).await;
|
||||
}
|
||||
|
||||
// Build a drain channel to absorb status events from the nested
|
||||
@@ -192,9 +190,7 @@ impl RlmBridge {
|
||||
async move { while rx.recv().await.is_some() {} },
|
||||
);
|
||||
|
||||
let child_model = model
|
||||
.filter(|m| !m.is_empty())
|
||||
.unwrap_or_else(|| self.child_model.clone());
|
||||
let child_model = self.child_model.clone();
|
||||
|
||||
// Recursive call. The dyn-erasure on `run_rlm_turn_inner` breaks
|
||||
// the `bridge → turn → bridge` opaque-future cycle.
|
||||
@@ -223,16 +219,14 @@ impl RlmBridge {
|
||||
}
|
||||
}
|
||||
|
||||
async fn dispatch_rlm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
|
||||
async fn dispatch_rlm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
|
||||
if let Some(resp) = batch_guard(prompts.len()) {
|
||||
return resp;
|
||||
}
|
||||
|
||||
let model = Arc::new(model);
|
||||
let futures = prompts.into_iter().map(|p| {
|
||||
let model = Arc::clone(&model);
|
||||
async move { self.dispatch_rlm(p, (*model).clone()).await }
|
||||
});
|
||||
let futures = prompts
|
||||
.into_iter()
|
||||
.map(|p| async move { self.dispatch_rlm(p, None).await });
|
||||
BatchResp {
|
||||
results: join_all(futures).await,
|
||||
}
|
||||
@@ -341,7 +335,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn llm_dispatch_uses_trait_backed_mock_client() {
|
||||
async fn llm_dispatch_pins_configured_child_model() {
|
||||
let mock = Arc::new(MockLlmClient::new(Vec::new()));
|
||||
mock.push_message_response(mock_response("child answer", 7, 11));
|
||||
let bridge = bridge_for(Arc::clone(&mock), 1);
|
||||
@@ -365,7 +359,7 @@ mod tests {
|
||||
|
||||
let captured = mock.captured_requests();
|
||||
assert_eq!(captured.len(), 1);
|
||||
assert_eq!(captured[0].model, "override-model");
|
||||
assert_eq!(captured[0].model, "child-model");
|
||||
assert_eq!(captured[0].max_tokens, 123);
|
||||
assert_eq!(
|
||||
captured[0].system,
|
||||
@@ -378,7 +372,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn llm_batch_dispatch_preserves_result_count_and_usage() {
|
||||
async fn llm_batch_dispatch_pins_configured_child_model() {
|
||||
let mock = Arc::new(MockLlmClient::new(Vec::new()));
|
||||
mock.push_message_response(mock_response("one", 1, 2));
|
||||
mock.push_message_response(mock_response("two", 3, 4));
|
||||
@@ -410,7 +404,7 @@ mod tests {
|
||||
assert!(
|
||||
captured
|
||||
.iter()
|
||||
.all(|request| request.model == "batch-model")
|
||||
.all(|request| request.model == "child-model")
|
||||
);
|
||||
|
||||
let usage = bridge.usage.lock().await;
|
||||
@@ -419,7 +413,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn rlm_dispatch_at_depth_zero_falls_back_to_plain_llm_query() {
|
||||
async fn rlm_dispatch_at_depth_zero_pins_configured_child_model() {
|
||||
let mock = Arc::new(MockLlmClient::new(Vec::new()));
|
||||
mock.push_message_response(mock_response("fallback answer", 3, 5));
|
||||
let bridge = bridge_for(Arc::clone(&mock), 0);
|
||||
@@ -445,6 +439,6 @@ mod tests {
|
||||
|
||||
let captured = mock.captured_requests();
|
||||
assert_eq!(captured.len(), 1);
|
||||
assert_eq!(captured[0].model, "override-model");
|
||||
assert_eq!(captured[0].model, "child-model");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,10 +15,10 @@ const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Mode
|
||||
|
||||
The REPL exposes:
|
||||
- `context` (alias `ctx`) — the full input string. Often huge — never `print(context)` in full.
|
||||
- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work.
|
||||
- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order.
|
||||
- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition.
|
||||
- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs.
|
||||
- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. The `model` argument is accepted for compatibility but child calls stay pinned to the configured Flash child model.
|
||||
- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. The `model` argument is accepted for compatibility but ignored.
|
||||
- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. The `model` argument is accepted for compatibility but ignored.
|
||||
- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. The `model` argument is accepted for compatibility but ignored.
|
||||
- `SHOW_VARS()` — list user variables and their types.
|
||||
- `repl_set(name, value)` / `repl_get(name)` — explicit cross-round storage.
|
||||
- `print(...)` — diagnostic output. The driver feeds you a truncated preview next round.
|
||||
|
||||
@@ -588,11 +588,21 @@ fn build_metadata_message(
|
||||
parts.push("**REPL helpers** (use inside ```repl blocks)".to_string());
|
||||
parts.push("- `context` / `ctx` — the full input string".to_string());
|
||||
parts.push("- `len(context)` / `context[a:b]` / `context.splitlines()` — slice it".to_string());
|
||||
parts.push("- `llm_query(prompt, model=None)` — one-shot child LLM".to_string());
|
||||
parts.push("- `llm_query_batched([p1, p2, ...])` — concurrent fan-out".to_string());
|
||||
parts.push("- `rlm_query(prompt, model=None)` — recursive sub-RLM".to_string());
|
||||
parts.push(
|
||||
"- `rlm_query_batched([p1, p2, ...])` — concurrent recursive sub-RLMs".to_string(),
|
||||
"- `llm_query(prompt, model=None)` — one-shot child LLM; `model` is ignored and child calls stay pinned to Flash"
|
||||
.to_string(),
|
||||
);
|
||||
parts.push(
|
||||
"- `llm_query_batched([p1, p2, ...])` — concurrent fan-out; `model` is ignored"
|
||||
.to_string(),
|
||||
);
|
||||
parts.push(
|
||||
"- `rlm_query(prompt, model=None)` — recursive sub-RLM; `model` is ignored"
|
||||
.to_string(),
|
||||
);
|
||||
parts.push(
|
||||
"- `rlm_query_batched([p1, p2, ...])` — concurrent recursive sub-RLMs; `model` is ignored"
|
||||
.to_string(),
|
||||
);
|
||||
parts.push("- `SHOW_VARS()` — list user variables".to_string());
|
||||
parts.push("- `repl_set(name, value)` / `repl_get(name)` — explicit store".to_string());
|
||||
|
||||
@@ -93,10 +93,6 @@ impl ToolSpec for RlmTool {
|
||||
"type": "string",
|
||||
"description": "Inline content to load as PROMPT. Use only when the input isn't a file you can point at. Capped at 200k chars."
|
||||
},
|
||||
"child_model": {
|
||||
"type": "string",
|
||||
"description": "Model for sub-LLM (`llm_query`) calls inside the REPL. Default: deepseek-v4-flash."
|
||||
},
|
||||
"max_depth": {
|
||||
"type": "integer",
|
||||
"description": "Recursion budget for `sub_rlm()` calls. 0 disables recursion; default 1 matches paper experiments."
|
||||
@@ -182,12 +178,10 @@ impl ToolSpec for RlmTool {
|
||||
));
|
||||
}
|
||||
|
||||
let child_model = input
|
||||
.get("child_model")
|
||||
.and_then(|v| v.as_str())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or(DEFAULT_CHILD_MODEL)
|
||||
.to_string();
|
||||
// Pin child calls to Flash so model-generated tool args cannot quietly
|
||||
// turn fanout work into Pro-billed requests. The RLM root still uses
|
||||
// the session model; child helper calls are the cheap batch layer.
|
||||
let child_model = DEFAULT_CHILD_MODEL.to_string();
|
||||
|
||||
let max_depth = input
|
||||
.get("max_depth")
|
||||
@@ -353,7 +347,6 @@ mod tests {
|
||||
assert!(schema["properties"]["task"].is_object());
|
||||
assert!(schema["properties"]["file_path"].is_object());
|
||||
assert!(schema["properties"]["content"].is_object());
|
||||
assert!(schema["properties"]["child_model"].is_object());
|
||||
assert!(schema["properties"]["max_depth"].is_object());
|
||||
let required = schema["required"].as_array().unwrap();
|
||||
assert!(required.iter().any(|v| v == "task"));
|
||||
|
||||
+1
-1
@@ -16,7 +16,7 @@ Press `Shift+Tab` to cycle reasoning effort.
|
||||
- **Agent**: multi-step tool use. Approvals for shell and paid tools (file writes are allowed without a prompt).
|
||||
- **YOLO**: enables shell + trust mode and auto-approves all tools. Use only in trusted repos.
|
||||
|
||||
All three modes have access to the `rlm_query` tool — a structured tool call that fans out 1–16 cheap parallel children on `deepseek-v4-flash`. The model reaches for it when work is decomposable.
|
||||
All three modes have access to the `rlm` tool. Inside its Python REPL, `llm_query_batched` fans out 1–16 cheap parallel child calls pinned to `deepseek-v4-flash`. The model reaches for it when work is decomposable.
|
||||
|
||||
## Compatibility Notes
|
||||
|
||||
|
||||
@@ -151,11 +151,11 @@ reflect very different cost classes:
|
||||
| Tool | What each child does | Wall-clock | Token cost | Cap |
|
||||
|---|---|---|---|---|
|
||||
| `agent_spawn` | Full sub-agent loop (planning, tool calls, multi-turn streaming, can spawn children) | minutes | thousands of tokens | 10 in flight by default (`[subagents].max_concurrent`, hard ceiling 20) |
|
||||
| `rlm_query` | One-shot non-streaming Chat Completions call to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |
|
||||
| `rlm` helper `llm_query_batched` | One-shot non-streaming Chat Completions calls pinned to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |
|
||||
|
||||
The caps appear in each tool's description and error messages so the model
|
||||
(and the user) can choose the right tool for the job. If one sub-agent is
|
||||
enough but you need parallel lookups, prefer `rlm_query`; if each task needs
|
||||
enough but you need parallel lookups, prefer `rlm` with `llm_query_batched`; if each task needs
|
||||
its own tool-carrying agent loop, use `agent_spawn` (and cancel completed
|
||||
ones to free slots).
|
||||
|
||||
|
||||
Reference in New Issue
Block a user