diff --git a/crates/tui/src/repl/runtime.rs b/crates/tui/src/repl/runtime.rs
index 626a0e7f..3b5cdfc9 100644
--- a/crates/tui/src/repl/runtime.rs
+++ b/crates/tui/src/repl/runtime.rs
@@ -495,7 +495,7 @@ def _rpc(req):
     return {"error": f"unexpected protocol line: {line[:120]!r}"}
 
 def llm_query(prompt, model=None, max_tokens=None, system=None):
-    """One-shot sub-LLM call. Returns the completion text as a string."""
+    """One-shot sub-LLM call. Returns the completion text as a string. The model arg is accepted for compatibility but ignored by Rust."""
     resp = _rpc({"type":"llm","prompt":str(prompt),"model":model,
                  "max_tokens":max_tokens,"system":system})
     if isinstance(resp, dict) and resp.get("error"):
@@ -505,7 +505,7 @@ def llm_query(prompt, model=None, max_tokens=None, system=None):
     return str(resp)
 
 def llm_query_batched(prompts, model=None):
-    """Run multiple sub-LLM calls concurrently. Returns a list of strings."""
+    """Run multiple sub-LLM calls concurrently. Returns a list of strings. The model arg is accepted for compatibility but ignored."""
     if not isinstance(prompts, (list, tuple)):
         return ["[llm_query_batched: prompts must be a list]"]
     resp = _rpc({"type":"llm_batch","prompts":[str(p) for p in prompts],"model":model})
@@ -523,7 +523,7 @@ def llm_query_batched(prompts, model=None):
     return out
 
 def rlm_query(prompt, model=None):
-    """Recursive sub-RLM (paper's `sub_RLM`). Each call gets its own REPL."""
+    """Recursive sub-RLM (paper's `sub_RLM`). Each call gets its own REPL. The model arg is accepted for compatibility but ignored by Rust."""
     resp = _rpc({"type":"rlm","prompt":str(prompt),"model":model})
     if isinstance(resp, dict) and resp.get("error"):
         return f"[rlm_query error: {resp['error']}]"
@@ -532,7 +532,7 @@ def rlm_query(prompt, model=None):
     return str(resp)
 
 def rlm_query_batched(prompts, model=None):
-    """Run multiple recursive sub-RLMs in parallel."""
+    """Run multiple recursive sub-RLMs in parallel. The model arg is accepted for compatibility but ignored."""
     if not isinstance(prompts, (list, tuple)):
         return ["[rlm_query_batched: prompts must be a list]"]
     resp = _rpc({"type":"rlm_batch","prompts":[str(p) for p in prompts],"model":model})
diff --git a/crates/tui/src/rlm/bridge.rs b/crates/tui/src/rlm/bridge.rs
index 36d8a9dc..904ddef7 100644
--- a/crates/tui/src/rlm/bridge.rs
+++ b/crates/tui/src/rlm/bridge.rs
@@ -86,14 +86,16 @@ impl RlmBridge {
     async fn dispatch_llm(
         &self,
         prompt: String,
-        model: Option<String>,
+        _model: Option<String>,
         max_tokens: Option<u32>,
         system: Option<String>,
     ) -> SingleResp {
         let request = MessageRequest {
-            model: model
-                .filter(|m| !m.is_empty())
-                .unwrap_or_else(|| self.child_model.clone()),
+            // The Python helper accepts `model=` for older snippets, but it is
+            // intentionally not authoritative. RLM child calls are pinned to
+            // the tool's configured child model so model-generated Python
+            // cannot silently upgrade cheap fanout work to an expensive model.
+            model: self.child_model.clone(),
             messages: vec![Message {
                 role: "user".to_string(),
                 content: vec![ContentBlock::Text {
@@ -150,16 +152,12 @@ impl RlmBridge {
         SingleResp { text, error: None }
     }
 
-    async fn dispatch_llm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
+    async fn dispatch_llm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
         if let Some(resp) = batch_guard(prompts.len()) {
             return resp;
         }
 
-        let model = Arc::new(
-            model
-                .filter(|m| !m.is_empty())
-                .unwrap_or_else(|| self.child_model.clone()),
-        );
+        let model = Arc::new(self.child_model.clone());
 
         let futures = prompts.into_iter().map(|prompt| {
             let model = Arc::clone(&model);
@@ -174,12 +172,12 @@ impl RlmBridge {
         }
     }
 
-    async fn dispatch_rlm(&self, prompt: String, model: Option<String>) -> SingleResp {
+    async fn dispatch_rlm(&self, prompt: String, _model: Option<String>) -> SingleResp {
         if self.depth_remaining == 0 {
             // Budget exhausted — fall back to a one-shot child completion
             // rather than returning an error. Matches the paper's behaviour
             // ("sub_RLM gracefully degrades to llm_query at depth=0").
-            return self.dispatch_llm(prompt, model, None, None).await;
+            return self.dispatch_llm(prompt, None, None, None).await;
         }
 
         // Build a drain channel to absorb status events from the nested
@@ -192,9 +190,7 @@ impl RlmBridge {
             async move { while rx.recv().await.is_some() {} },
         );
 
-        let child_model = model
-            .filter(|m| !m.is_empty())
-            .unwrap_or_else(|| self.child_model.clone());
+        let child_model = self.child_model.clone();
 
         // Recursive call. The dyn-erasure on `run_rlm_turn_inner` breaks
         // the `bridge → turn → bridge` opaque-future cycle.
@@ -223,16 +219,14 @@ impl RlmBridge {
         }
     }
 
-    async fn dispatch_rlm_batch(&self, prompts: Vec<String>, model: Option<String>) -> BatchResp {
+    async fn dispatch_rlm_batch(&self, prompts: Vec<String>, _model: Option<String>) -> BatchResp {
         if let Some(resp) = batch_guard(prompts.len()) {
             return resp;
         }
 
-        let model = Arc::new(model);
-        let futures = prompts.into_iter().map(|p| {
-            let model = Arc::clone(&model);
-            async move { self.dispatch_rlm(p, (*model).clone()).await }
-        });
+        let futures = prompts
+            .into_iter()
+            .map(|p| async move { self.dispatch_rlm(p, None).await });
         BatchResp {
             results: join_all(futures).await,
         }
@@ -341,7 +335,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn llm_dispatch_uses_trait_backed_mock_client() {
+    async fn llm_dispatch_pins_configured_child_model() {
         let mock = Arc::new(MockLlmClient::new(Vec::new()));
         mock.push_message_response(mock_response("child answer", 7, 11));
         let bridge = bridge_for(Arc::clone(&mock), 1);
@@ -365,7 +359,7 @@ mod tests {
 
         let captured = mock.captured_requests();
         assert_eq!(captured.len(), 1);
-        assert_eq!(captured[0].model, "override-model");
+        assert_eq!(captured[0].model, "child-model");
         assert_eq!(captured[0].max_tokens, 123);
         assert_eq!(
             captured[0].system,
@@ -378,7 +372,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn llm_batch_dispatch_preserves_result_count_and_usage() {
+    async fn llm_batch_dispatch_pins_configured_child_model() {
         let mock = Arc::new(MockLlmClient::new(Vec::new()));
         mock.push_message_response(mock_response("one", 1, 2));
         mock.push_message_response(mock_response("two", 3, 4));
@@ -410,7 +404,7 @@ mod tests {
         assert!(
             captured
                 .iter()
-                .all(|request| request.model == "batch-model")
+                .all(|request| request.model == "child-model")
         );
 
         let usage = bridge.usage.lock().await;
@@ -419,7 +413,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn rlm_dispatch_at_depth_zero_falls_back_to_plain_llm_query() {
+    async fn rlm_dispatch_at_depth_zero_pins_configured_child_model() {
         let mock = Arc::new(MockLlmClient::new(Vec::new()));
         mock.push_message_response(mock_response("fallback answer", 3, 5));
         let bridge = bridge_for(Arc::clone(&mock), 0);
@@ -445,6 +439,6 @@ mod tests {
 
         let captured = mock.captured_requests();
         assert_eq!(captured.len(), 1);
-        assert_eq!(captured[0].model, "override-model");
+        assert_eq!(captured[0].model, "child-model");
     }
 }
diff --git a/crates/tui/src/rlm/prompt.rs b/crates/tui/src/rlm/prompt.rs
index ba003cb9..5553d2cf 100644
--- a/crates/tui/src/rlm/prompt.rs
+++ b/crates/tui/src/rlm/prompt.rs
@@ -15,10 +15,10 @@ const RLM_SYSTEM_PROMPT: &str = r#"You are the root of a Recursive Language Mode
 
 The REPL exposes:
 - `context` (alias `ctx`) — the full input string. Often huge — never `print(context)` in full.
-- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work.
-- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order.
-- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition.
-- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs.
+- `llm_query(prompt, model=None, max_tokens=None, system=None)` — one-shot child LLM. Cheap. Use for chunk-level work. The `model` argument is accepted for compatibility but child calls stay pinned to the configured Flash child model.
+- `llm_query_batched(prompts, model=None)` — concurrent fan-out. Returns `list[str]` in input order. The `model` argument is accepted for compatibility but ignored.
+- `rlm_query(prompt, model=None)` — recursive sub-RLM. Use when a sub-task itself needs decomposition. The `model` argument is accepted for compatibility but ignored.
+- `rlm_query_batched(prompts, model=None)` — concurrent recursive sub-RLMs. The `model` argument is accepted for compatibility but ignored.
 - `SHOW_VARS()` — list user variables and their types.
 - `repl_set(name, value)` / `repl_get(name)` — explicit cross-round storage.
 - `print(...)` — diagnostic output. The driver feeds you a truncated preview next round.
diff --git a/crates/tui/src/rlm/turn.rs b/crates/tui/src/rlm/turn.rs
index c97f5ad1..16504fb8 100644
--- a/crates/tui/src/rlm/turn.rs
+++ b/crates/tui/src/rlm/turn.rs
@@ -588,11 +588,21 @@ fn build_metadata_message(
     parts.push("**REPL helpers** (use inside ```repl blocks)".to_string());
     parts.push("- `context` / `ctx`                       — the full input string".to_string());
     parts.push("- `len(context)` / `context[a:b]` / `context.splitlines()` — slice it".to_string());
-    parts.push("- `llm_query(prompt, model=None)`        — one-shot child LLM".to_string());
-    parts.push("- `llm_query_batched([p1, p2, ...])`     — concurrent fan-out".to_string());
-    parts.push("- `rlm_query(prompt, model=None)`        — recursive sub-RLM".to_string());
     parts.push(
-        "- `rlm_query_batched([p1, p2, ...])`     — concurrent recursive sub-RLMs".to_string(),
+        "- `llm_query(prompt, model=None)`        — one-shot child LLM; `model` is ignored and child calls stay pinned to Flash"
+            .to_string(),
+    );
+    parts.push(
+        "- `llm_query_batched([p1, p2, ...])`     — concurrent fan-out; `model` is ignored"
+            .to_string(),
+    );
+    parts.push(
+        "- `rlm_query(prompt, model=None)`        — recursive sub-RLM; `model` is ignored"
+            .to_string(),
+    );
+    parts.push(
+        "- `rlm_query_batched([p1, p2, ...])`     — concurrent recursive sub-RLMs; `model` is ignored"
+            .to_string(),
     );
     parts.push("- `SHOW_VARS()`                          — list user variables".to_string());
     parts.push("- `repl_set(name, value)` / `repl_get(name)` — explicit store".to_string());
diff --git a/crates/tui/src/tools/rlm.rs b/crates/tui/src/tools/rlm.rs
index 5cc641ab..8881c8ce 100644
--- a/crates/tui/src/tools/rlm.rs
+++ b/crates/tui/src/tools/rlm.rs
@@ -93,10 +93,6 @@ impl ToolSpec for RlmTool {
                     "type": "string",
                     "description": "Inline content to load as PROMPT. Use only when the input isn't a file you can point at. Capped at 200k chars."
                 },
-                "child_model": {
-                    "type": "string",
-                    "description": "Model for sub-LLM (`llm_query`) calls inside the REPL. Default: deepseek-v4-flash."
-                },
                 "max_depth": {
                     "type": "integer",
                     "description": "Recursion budget for `sub_rlm()` calls. 0 disables recursion; default 1 matches paper experiments."
@@ -182,12 +178,10 @@ impl ToolSpec for RlmTool {
             ));
         }
 
-        let child_model = input
-            .get("child_model")
-            .and_then(|v| v.as_str())
-            .filter(|s| !s.is_empty())
-            .unwrap_or(DEFAULT_CHILD_MODEL)
-            .to_string();
+        // Pin child calls to Flash so model-generated tool args cannot quietly
+        // turn fanout work into Pro-billed requests. The RLM root still uses
+        // the session model; child helper calls are the cheap batch layer.
+        let child_model = DEFAULT_CHILD_MODEL.to_string();
 
         let max_depth = input
             .get("max_depth")
@@ -353,7 +347,6 @@ mod tests {
         assert!(schema["properties"]["task"].is_object());
         assert!(schema["properties"]["file_path"].is_object());
         assert!(schema["properties"]["content"].is_object());
-        assert!(schema["properties"]["child_model"].is_object());
         assert!(schema["properties"]["max_depth"].is_object());
         let required = schema["required"].as_array().unwrap();
         assert!(required.iter().any(|v| v == "task"));
diff --git a/docs/MODES.md b/docs/MODES.md
index c7a76135..7b320294 100644
--- a/docs/MODES.md
+++ b/docs/MODES.md
@@ -16,7 +16,7 @@ Press `Shift+Tab` to cycle reasoning effort.
 - **Agent**: multi-step tool use. Approvals for shell and paid tools (file writes are allowed without a prompt).
 - **YOLO**: enables shell + trust mode and auto-approves all tools. Use only in trusted repos.
 
-All three modes have access to the `rlm_query` tool — a structured tool call that fans out 1–16 cheap parallel children on `deepseek-v4-flash`. The model reaches for it when work is decomposable.
+All three modes have access to the `rlm` tool. Inside its Python REPL, `llm_query_batched` fans out 1–16 cheap parallel child calls pinned to `deepseek-v4-flash`. The model reaches for the `rlm` tool when work is decomposable.
 
 ## Compatibility Notes
 
diff --git a/docs/TOOL_SURFACE.md b/docs/TOOL_SURFACE.md
index b58470c7..68cece8d 100644
--- a/docs/TOOL_SURFACE.md
+++ b/docs/TOOL_SURFACE.md
@@ -151,11 +151,11 @@ reflect very different cost classes:
 | Tool | What each child does | Wall-clock | Token cost | Cap |
 |---|---|---|---|---|
 | `agent_spawn` | Full sub-agent loop (planning, tool calls, multi-turn streaming, can spawn children) | minutes | thousands of tokens | 10 in flight by default (`[subagents].max_concurrent`, hard ceiling 20) |
-| `rlm_query` | One-shot non-streaming Chat Completions call to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |
+| `rlm` helper `llm_query_batched` | One-shot non-streaming Chat Completions calls pinned to `deepseek-v4-flash` | seconds | ~hundreds of tokens | 16 per call |
 
 The caps appear in each tool's description and error messages so the model
 (and the user) can choose the right tool for the job. If one sub-agent is
-enough but you need parallel lookups, prefer `rlm_query`; if each task needs
+enough but you need parallel lookups, prefer `rlm` with `llm_query_batched`; if each task needs
 its own tool-carrying agent loop, use `agent_spawn` (and cancel completed
 ones to free slots).