feat(test): #69 mock LLM client + eval --record fixture flag

Adds a queue-driven `MockLlmClient` that implements the `LlmClient` trait by replaying canned per-turn `StreamEvent` vectors and capturing every outgoing `MessageRequest`. The mock lives at the trait boundary so it stays decoupled from the concrete reqwest plumbing inside `DeepSeekClient`, and surfaces builders (`canned::*`) for the common event shapes (text delta, thinking delta, tool_use start, input JSON delta, message delta). Wires a new `--record <DIR>` flag into `deepseek eval` that appends one JSON Lines fixture line per step to `<DIR>/<scenario>.jsonl`. The format is documented at the top of `eval.rs` and is the storage shape the mock will replay from. `crates/tui/src/llm_client.rs` becomes `crates/tui/src/llm_client/mod.rs` to host the new submodule cleanly. The trait shape is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 00:03:00 -05:00
parent 693fbca4ea
commit 0626bc80c0
4 changed files with 744 additions and 3 deletions
@@ -6,9 +6,10 @@
 use anyhow::{Context, Result, anyhow};
 use ignore::WalkBuilder;
 use regex::Regex;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 use std::fs;
+use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -102,6 +103,14 @@ pub struct EvalHarnessConfig {
    pub shell_expect_token: String,
    /// Maximum characters stored for step output summaries.
    pub max_output_chars: usize,
+    /// When set, every step is appended as a JSON Lines fixture to a file
+    /// inside this directory. The fixture file is named after the scenario
+    /// (e.g. `offline-tool-loop.jsonl`). Each line follows the schema:
+    /// `{ "request": <step descriptor>, "response_events": [<events>] }`.
+    /// The mock LLM client (`crate::llm_client::mock`) can replay these
+    /// fixtures for deterministic offline tests. See
+    /// `crates/tui/tests/README.md` for the full record/replay flow.
+    pub record_dir: Option<PathBuf>,
 }

 impl Default for EvalHarnessConfig {
@@ -117,6 +126,7 @@ impl Default for EvalHarnessConfig {
            shell_command,
            shell_expect_token: "eval-harness".to_string(),
            max_output_chars: 240,
+            record_dir: None,
        }
    }
 }
@@ -273,26 +283,122 @@ impl EvalHarness {
                    success: true,
                    duration,
                    error: None,
-                    output: Some(output),
+                    output: Some(output.clone()),
                });
+                if let Some(dir) = self.config.record_dir.as_deref() {
+                    let _ = record_fixture(
+                        dir,
+                        &self.config.scenario_name,
+                        FixtureRecord::ok(kind, &output),
+                    );
+                }
                Some(value)
            }
            Err(err) => {
                stats.errors += 1;
+                let err_str = err.to_string();
                steps.push(EvalStep {
                    kind,
                    tool_name: kind.tool_name(),
                    success: false,
                    duration,
-                    error: Some(err.to_string()),
+                    error: Some(err_str.clone()),
                    output: None,
                });
+                if let Some(dir) = self.config.record_dir.as_deref() {
+                    let _ = record_fixture(
+                        dir,
+                        &self.config.scenario_name,
+                        FixtureRecord::err(kind, &err_str),
+                    );
+                }
                None
            }
        }
    }
 }

+// === Fixture record/replay format ===========================================
+//
+// The `--record` flag writes one JSON object per line to a `.jsonl` file:
+//
+//     { "request": { "step": "list_dir", "kind": "List" },
+//       "response_events": [{ "type": "ok", "output": "…" }] }
+//
+// The mock LLM client replays these fixtures via
+// `MockLlmClient::push_message_response` (or the streaming variant) by mapping
+// each `response_events` array onto a canned `Vec<StreamEvent>`.
+//
+// This format is intentionally minimal — additional fields (timing, model,
+// usage) can be added without breaking older fixtures because each line is a
+// self-contained JSON object.
+
+/// Schema for one line of a `--record` JSONL fixture file.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FixtureRecord {
+    /// Step descriptor (`{ step, kind }`).
+    pub request: serde_json::Value,
+    /// One or more synthetic response events.
+    pub response_events: Vec<serde_json::Value>,
+}
+
+impl FixtureRecord {
+    fn ok(kind: ScenarioStepKind, output: &str) -> Self {
+        Self {
+            request: serde_json::json!({
+                "step": kind.tool_name(),
+                "kind": format!("{kind:?}"),
+            }),
+            response_events: vec![serde_json::json!({
+                "type": "ok",
+                "output": output,
+            })],
+        }
+    }
+
+    fn err(kind: ScenarioStepKind, error: &str) -> Self {
+        Self {
+            request: serde_json::json!({
+                "step": kind.tool_name(),
+                "kind": format!("{kind:?}"),
+            }),
+            response_events: vec![serde_json::json!({
+                "type": "error",
+                "error": error,
+            })],
+        }
+    }
+}
+
+/// Append one fixture record to `<dir>/<scenario>.jsonl` (creating dir + file
+/// if missing). Best-effort: I/O errors are returned but generally ignored by
+/// the harness so a recording failure does not mask the run's primary result.
+pub fn record_fixture(dir: &Path, scenario_name: &str, record: FixtureRecord) -> Result<PathBuf> {
+    fs::create_dir_all(dir)
+        .with_context(|| format!("failed to create fixture dir: {}", dir.display()))?;
+    let safe_scenario = scenario_name
+        .chars()
+        .map(|c| {
+            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
+                c
+            } else {
+                '_'
+            }
+        })
+        .collect::<String>();
+    let path = dir.join(format!("{safe_scenario}.jsonl"));
+    let line = serde_json::to_string(&record).context("failed to serialize fixture record")?;
+
+    let mut file = fs::OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&path)
+        .with_context(|| format!("failed to open fixture file: {}", path.display()))?;
+    writeln!(file, "{line}")
+        .with_context(|| format!("failed to write fixture line to {}", path.display()))?;
+    Ok(path)
+}
+
 impl Default for EvalHarness {
    fn default() -> Self {
        Self::new(EvalHarnessConfig::default())
@@ -0,0 +1,627 @@
+//! `MockLlmClient` — a queue-driven `LlmClient` implementation for tests.
+//!
+//! This client implements the [`LlmClient`](super::LlmClient) trait by replaying a
+//! pre-loaded queue of canned responses (one per turn). It captures every
+//! request the runtime sends so tests can assert on the outgoing payload —
+//! e.g. confirming that prior `reasoning_content` is replayed in DeepSeek V4
+//! thinking-mode tool-calling turns (V4 §5.1.1; the bug that broke
+//! v0.4.9-v0.5.1).
+//!
+//! # Mocking strategy
+//!
+//! Tests mock at the **trait boundary** (`LlmClient`), never at the `reqwest`
+//! HTTP layer. The trait is the durable abstraction — internal HTTP plumbing
+//! changes frequently and is not part of the public engine contract.
+//!
+//! # Example
+//!
+//! ```ignore
+//! use crate::llm_client::mock::{MockLlmClient, canned};
+//! use crate::llm_client::LlmClient;
+//!
+//! // One canned turn that emits "hello world" as two text deltas, then
+//! // finishes with stop_reason = "end_turn".
+//! let turn = vec![
+//!     canned::message_start("msg_1"),
+//!     canned::text_delta(0, "hello "),
+//!     canned::text_delta(0, "world"),
+//!     canned::message_stop(),
+//! ];
+//!
+//! let mock = MockLlmClient::new(vec![turn]);
+//! let stream = mock.create_message_stream(/* ... */).await.unwrap();
+//! // ... drain the stream, assert deltas ...
+//! assert_eq!(mock.call_count(), 1);
+//! assert_eq!(mock.captured_requests().len(), 1);
+//! ```
+
+// This module ships methods + builder helpers that integration tests rely on
+// individually. Not every helper is exercised by unit tests — that's expected
+// (the goal is a usable mock surface for downstream tests), so we silence
+// per-item dead-code warnings at the module level.
+#![allow(dead_code)]
+
+use std::collections::VecDeque;
+use std::pin::Pin;
+use std::sync::Mutex;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use anyhow::{Result, anyhow};
+use async_stream::try_stream;
+use futures_util::Stream;
+
+use crate::models::{
+    ContentBlock, MessageDelta, MessageRequest, MessageResponse, StreamEvent, Usage,
+};
+
+use super::{LlmClient, StreamEventBox};
+
+/// A pre-recorded "turn" the mock will replay on the next streaming call.
+///
+/// `MessageStop` does *not* need to be the final element — the mock will
+/// auto-emit one if missing, mirroring the real client's behaviour. Likewise
+/// the mock does not require `MessageStart` to be present.
+pub type CannedTurn = Vec<StreamEvent>;
+
+/// A queue-driven mock LLM client.
+///
+/// The mock holds a FIFO queue of canned response turns. Each call to
+/// [`LlmClient::create_message_stream`] dequeues the next turn and replays its
+/// events as a stream. If the queue is exhausted, the call returns an error
+/// — tests should ensure they push exactly as many turns as the runtime will
+/// consume.
+///
+/// The mock also captures the [`MessageRequest`] passed to every call so tests
+/// can assert on the outgoing payload (e.g. that prior `reasoning_content` is
+/// preserved across turns).
+pub struct MockLlmClient {
+    canned: Mutex<VecDeque<CannedTurn>>,
+    captured_requests: Mutex<Vec<MessageRequest>>,
+    calls: AtomicUsize,
+    provider_name: &'static str,
+    model: String,
+    /// If set, [`LlmClient::create_message`] returns this verbatim. Otherwise
+    /// it falls back to streaming + collection. Useful for non-streaming
+    /// compaction-style calls.
+    canned_messages: Mutex<VecDeque<MessageResponse>>,
+}
+
+impl MockLlmClient {
+    /// Construct a mock that will replay the given canned turns in order.
+    #[must_use]
+    pub fn new(canned: Vec<CannedTurn>) -> Self {
+        Self {
+            canned: Mutex::new(canned.into()),
+            captured_requests: Mutex::new(Vec::new()),
+            calls: AtomicUsize::new(0),
+            provider_name: "mock",
+            model: "mock-model".to_string(),
+            canned_messages: Mutex::new(VecDeque::new()),
+        }
+    }
+
+    /// Set the provider-name string returned by [`LlmClient::provider_name`].
+    #[must_use]
+    pub fn with_provider(mut self, name: &'static str) -> Self {
+        self.provider_name = name;
+        self
+    }
+
+    /// Set the model identifier returned by [`LlmClient::model`].
+    #[must_use]
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Push a canned turn onto the back of the queue.
+    pub fn push_turn(&self, turn: CannedTurn) {
+        self.canned
+            .lock()
+            .expect("MockLlmClient.canned mutex poisoned")
+            .push_back(turn);
+    }
+
+    /// Push a canned non-streaming `MessageResponse`. Consumed by
+    /// [`LlmClient::create_message`] (FIFO).
+    pub fn push_message_response(&self, response: MessageResponse) {
+        self.canned_messages
+            .lock()
+            .expect("MockLlmClient.canned_messages mutex poisoned")
+            .push_back(response);
+    }
+
+    /// Number of completed calls to either `create_message` or
+    /// `create_message_stream`.
+    #[must_use]
+    pub fn call_count(&self) -> usize {
+        self.calls.load(Ordering::SeqCst)
+    }
+
+    /// Number of canned turns still queued.
+    #[must_use]
+    pub fn remaining_turns(&self) -> usize {
+        self.canned
+            .lock()
+            .expect("MockLlmClient.canned mutex poisoned")
+            .len()
+    }
+
+    /// Snapshot of every request the mock has been asked to handle, in order.
+    #[must_use]
+    pub fn captured_requests(&self) -> Vec<MessageRequest> {
+        self.captured_requests
+            .lock()
+            .expect("MockLlmClient.captured_requests mutex poisoned")
+            .clone()
+    }
+
+    /// Convenience: return the most recently captured request, or `None` if
+    /// the mock has not been called yet.
+    #[must_use]
+    pub fn last_request(&self) -> Option<MessageRequest> {
+        self.captured_requests
+            .lock()
+            .expect("MockLlmClient.captured_requests mutex poisoned")
+            .last()
+            .cloned()
+    }
+
+    fn record_request(&self, request: &MessageRequest) {
+        self.captured_requests
+            .lock()
+            .expect("MockLlmClient.captured_requests mutex poisoned")
+            .push(request.clone());
+        self.calls.fetch_add(1, Ordering::SeqCst);
+    }
+
+    fn pop_turn(&self) -> Option<CannedTurn> {
+        self.canned
+            .lock()
+            .expect("MockLlmClient.canned mutex poisoned")
+            .pop_front()
+    }
+
+    fn pop_message(&self) -> Option<MessageResponse> {
+        self.canned_messages
+            .lock()
+            .expect("MockLlmClient.canned_messages mutex poisoned")
+            .pop_front()
+    }
+}
+
+impl LlmClient for MockLlmClient {
+    fn provider_name(&self) -> &'static str {
+        self.provider_name
+    }
+
+    fn model(&self) -> &str {
+        &self.model
+    }
+
+    async fn create_message(&self, request: MessageRequest) -> Result<MessageResponse> {
+        self.record_request(&request);
+
+        if let Some(canned) = self.pop_message() {
+            return Ok(canned);
+        }
+
+        // Fallback: synthesize a MessageResponse from the next streaming turn.
+        let Some(turn) = self.pop_turn() else {
+            return Err(anyhow!(
+                "MockLlmClient: create_message called but no canned response queued (request #{})",
+                self.calls.load(Ordering::SeqCst)
+            ));
+        };
+
+        Ok(synthesize_message_response(turn, &self.model))
+    }
+
+    async fn create_message_stream(&self, request: MessageRequest) -> Result<StreamEventBox> {
+        self.record_request(&request);
+
+        let Some(turn) = self.pop_turn() else {
+            return Err(anyhow!(
+                "MockLlmClient: create_message_stream called but no canned turn queued (call #{})",
+                self.calls.load(Ordering::SeqCst)
+            ));
+        };
+
+        Ok(stream_from_canned(turn))
+    }
+
+    async fn health_check(&self) -> Result<bool> {
+        Ok(true)
+    }
+}
+
+/// Wrap a canned event vector as a stream that yields each event in order and
+/// auto-appends `MessageStop` if the trailing event is not already one.
+fn stream_from_canned(turn: CannedTurn) -> StreamEventBox {
+    let s = try_stream! {
+        let has_stop = matches!(turn.last(), Some(StreamEvent::MessageStop));
+        for ev in turn {
+            yield ev;
+        }
+        if !has_stop {
+            yield StreamEvent::MessageStop;
+        }
+    };
+    Box::pin(s) as Pin<Box<dyn Stream<Item = Result<StreamEvent>> + Send + 'static>>
+}
+
+/// Best-effort: collapse a streaming turn into a non-streaming
+/// `MessageResponse` by concatenating text deltas. Used only as a fallback
+/// when callers `create_message` without a queued `MessageResponse`.
+fn synthesize_message_response(turn: CannedTurn, model: &str) -> MessageResponse {
+    use crate::models::Delta;
+
+    let mut text = String::new();
+    let mut stop_reason: Option<String> = None;
+
+    for ev in turn {
+        match ev {
+            StreamEvent::ContentBlockDelta {
+                delta: Delta::TextDelta { text: t },
+                ..
+            } => text.push_str(&t),
+            StreamEvent::MessageDelta {
+                delta: MessageDelta {
+                    stop_reason: sr, ..
+                },
+                ..
+            } => stop_reason = sr,
+            _ => {}
+        }
+    }
+
+    MessageResponse {
+        id: "mock_msg".to_string(),
+        r#type: "message".to_string(),
+        role: "assistant".to_string(),
+        content: vec![ContentBlock::Text {
+            text,
+            cache_control: None,
+        }],
+        model: model.to_string(),
+        stop_reason: stop_reason.or_else(|| Some("end_turn".to_string())),
+        stop_sequence: None,
+        container: None,
+        usage: Usage::default(),
+    }
+}
+
+/// Builders for common canned-event patterns. Re-exported so tests can build
+/// realistic streams without wiring `StreamEvent` shapes by hand.
+pub mod canned {
+    use serde_json::Value;
+
+    use crate::models::{
+        ContentBlockStart, Delta, MessageDelta, MessageResponse, StreamEvent, Usage,
+    };
+
+    /// `MessageStart` event with a synthetic message envelope.
+    #[must_use]
+    pub fn message_start(id: &str) -> StreamEvent {
+        StreamEvent::MessageStart {
+            message: MessageResponse {
+                id: id.to_string(),
+                r#type: "message".to_string(),
+                role: "assistant".to_string(),
+                content: vec![],
+                model: "mock-model".to_string(),
+                stop_reason: None,
+                stop_sequence: None,
+                container: None,
+                usage: Usage::default(),
+            },
+        }
+    }
+
+    /// Open a text content block at `index`.
+    #[must_use]
+    pub fn text_block_start(index: u32) -> StreamEvent {
+        StreamEvent::ContentBlockStart {
+            index,
+            content_block: ContentBlockStart::Text {
+                text: String::new(),
+            },
+        }
+    }
+
+    /// Append `text` to the content block at `index`.
+    #[must_use]
+    pub fn text_delta(index: u32, text: &str) -> StreamEvent {
+        StreamEvent::ContentBlockDelta {
+            index,
+            delta: Delta::TextDelta {
+                text: text.to_string(),
+            },
+        }
+    }
+
+    /// Append a thinking-content delta at `index`.
+    #[must_use]
+    pub fn thinking_delta(index: u32, thinking: &str) -> StreamEvent {
+        StreamEvent::ContentBlockDelta {
+            index,
+            delta: Delta::ThinkingDelta {
+                thinking: thinking.to_string(),
+            },
+        }
+    }
+
+    /// Open a tool_use content block at `index`.
+    #[must_use]
+    pub fn tool_use_block_start(index: u32, id: &str, name: &str) -> StreamEvent {
+        StreamEvent::ContentBlockStart {
+            index,
+            content_block: ContentBlockStart::ToolUse {
+                id: id.to_string(),
+                name: name.to_string(),
+                input: Value::Null,
+                caller: None,
+            },
+        }
+    }
+
+    /// Stream partial JSON for a tool's input arguments.
+    #[must_use]
+    pub fn tool_input_delta(index: u32, partial_json: &str) -> StreamEvent {
+        StreamEvent::ContentBlockDelta {
+            index,
+            delta: Delta::InputJsonDelta {
+                partial_json: partial_json.to_string(),
+            },
+        }
+    }
+
+    /// Close the content block at `index`.
+    #[must_use]
+    pub fn block_stop(index: u32) -> StreamEvent {
+        StreamEvent::ContentBlockStop { index }
+    }
+
+    /// Emit a `message_delta` carrying `stop_reason` and optional `usage`.
+    #[must_use]
+    pub fn message_delta(stop_reason: &str, usage: Option<Usage>) -> StreamEvent {
+        StreamEvent::MessageDelta {
+            delta: MessageDelta {
+                stop_reason: Some(stop_reason.to_string()),
+                stop_sequence: None,
+            },
+            usage,
+        }
+    }
+
+    /// Final `message_stop` sentinel.
+    #[must_use]
+    pub fn message_stop() -> StreamEvent {
+        StreamEvent::MessageStop
+    }
+
+    /// Convenience: a complete "assistant emits this text" turn ending with
+    /// `stop_reason = "end_turn"`.
+    #[must_use]
+    pub fn simple_text_turn(text: &str) -> Vec<StreamEvent> {
+        vec![
+            message_start("mock_msg_1"),
+            text_block_start(0),
+            text_delta(0, text),
+            block_stop(0),
+            message_delta("end_turn", None),
+            message_stop(),
+        ]
+    }
+
+    /// Convenience: a turn that emits one assistant tool_call and stops.
+    #[must_use]
+    pub fn tool_call_turn(call_id: &str, tool_name: &str, args_json: &str) -> Vec<StreamEvent> {
+        vec![
+            message_start("mock_msg_tool"),
+            tool_use_block_start(0, call_id, tool_name),
+            tool_input_delta(0, args_json),
+            block_stop(0),
+            message_delta("tool_use", None),
+            message_stop(),
+        ]
+    }
+}
+
+// === Tests ===
+
+#[cfg(test)]
+mod tests {
+    use futures_util::StreamExt;
+
+    use super::*;
+    use crate::llm_client::LlmClient;
+    use crate::models::{Delta, Message, MessageRequest, StreamEvent};
+
+    fn empty_request() -> MessageRequest {
+        MessageRequest {
+            model: "mock-model".to_string(),
+            messages: vec![Message {
+                role: "user".to_string(),
+                content: vec![],
+            }],
+            max_tokens: 1024,
+            system: None,
+            tools: None,
+            tool_choice: None,
+            metadata: None,
+            thinking: None,
+            reasoning_effort: None,
+            stream: Some(true),
+            temperature: None,
+            top_p: None,
+        }
+    }
+
+    #[tokio::test]
+    async fn replays_canned_turn_via_stream() {
+        let mock = MockLlmClient::new(vec![canned::simple_text_turn("hello world")]);
+
+        let mut stream = mock
+            .create_message_stream(empty_request())
+            .await
+            .expect("stream should open");
+
+        let mut text = String::new();
+        let mut saw_stop = false;
+        while let Some(ev) = stream.next().await {
+            match ev.expect("event") {
+                StreamEvent::ContentBlockDelta {
+                    delta: Delta::TextDelta { text: t },
+                    ..
+                } => text.push_str(&t),
+                StreamEvent::MessageStop => {
+                    saw_stop = true;
+                    break;
+                }
+                _ => {}
+            }
+        }
+
+        assert_eq!(text, "hello world");
+        assert!(saw_stop);
+        assert_eq!(mock.call_count(), 1);
+        assert_eq!(mock.captured_requests().len(), 1);
+        assert_eq!(mock.remaining_turns(), 0);
+    }
+
+    #[tokio::test]
+    async fn errors_when_queue_exhausted() {
+        let mock = MockLlmClient::new(Vec::new());
+        let result = mock.create_message_stream(empty_request()).await;
+        match result {
+            Ok(_) => panic!("should error on empty queue"),
+            Err(err) => assert!(format!("{err}").contains("no canned")),
+        }
+    }
+
+    #[tokio::test]
+    async fn captures_request_payload_for_assertions() {
+        let mock = MockLlmClient::new(vec![canned::simple_text_turn("ok")]);
+        let mut req = empty_request();
+        req.temperature = Some(0.42);
+        let _ = mock.create_message_stream(req).await.unwrap();
+
+        let captured = mock.last_request().expect("should have captured");
+        assert_eq!(captured.temperature, Some(0.42));
+    }
+
+    #[tokio::test]
+    async fn stream_auto_appends_message_stop() {
+        // Queue a turn missing MessageStop — mock should append one.
+        let turn = vec![canned::text_block_start(0), canned::text_delta(0, "x")];
+        let mock = MockLlmClient::new(vec![turn]);
+
+        let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
+        let mut saw_stop = false;
+        while let Some(ev) = stream.next().await {
+            if matches!(ev.expect("event"), StreamEvent::MessageStop) {
+                saw_stop = true;
+            }
+        }
+        assert!(saw_stop, "auto MessageStop missing");
+    }
+
+    #[tokio::test]
+    async fn create_message_uses_canned_message_response_first() {
+        let mock = MockLlmClient::new(vec![canned::simple_text_turn("from stream")]);
+        mock.push_message_response(MessageResponse {
+            id: "preset".to_string(),
+            r#type: "message".to_string(),
+            role: "assistant".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "from preset".to_string(),
+                cache_control: None,
+            }],
+            model: "mock-model".to_string(),
+            stop_reason: Some("end_turn".to_string()),
+            stop_sequence: None,
+            container: None,
+            usage: Usage::default(),
+        });
+
+        let resp = mock.create_message(empty_request()).await.unwrap();
+        assert_eq!(resp.id, "preset");
+    }
+
+    #[tokio::test]
+    async fn create_message_synthesizes_from_streaming_turn_when_no_message_queued() {
+        let mock = MockLlmClient::new(vec![canned::simple_text_turn("synthesized")]);
+        let resp = mock.create_message(empty_request()).await.unwrap();
+        let text = match &resp.content[0] {
+            ContentBlock::Text { text, .. } => text.clone(),
+            _ => panic!("expected text"),
+        };
+        assert_eq!(text, "synthesized");
+        assert_eq!(resp.stop_reason.as_deref(), Some("end_turn"));
+    }
+
+    #[tokio::test]
+    async fn provider_and_model_are_overridable() {
+        let mock = MockLlmClient::new(vec![canned::simple_text_turn("x")])
+            .with_provider("test-provider")
+            .with_model("test-model");
+        assert_eq!(mock.provider_name(), "test-provider");
+        assert_eq!(mock.model(), "test-model");
+    }
+
+    #[tokio::test]
+    async fn tool_call_turn_serializes_correctly() {
+        let mock = MockLlmClient::new(vec![canned::tool_call_turn(
+            "call_1",
+            "list_dir",
+            r#"{"path":"/tmp"}"#,
+        )]);
+        let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
+
+        let mut saw_tool_use = false;
+        let mut json_seen = String::new();
+        while let Some(ev) = stream.next().await {
+            match ev.unwrap() {
+                StreamEvent::ContentBlockStart { content_block, .. } => {
+                    use crate::models::ContentBlockStart;
+                    if let ContentBlockStart::ToolUse { name, .. } = content_block {
+                        assert_eq!(name, "list_dir");
+                        saw_tool_use = true;
+                    }
+                }
+                StreamEvent::ContentBlockDelta {
+                    delta: Delta::InputJsonDelta { partial_json },
+                    ..
+                } => json_seen.push_str(&partial_json),
+                _ => {}
+            }
+        }
+        assert!(saw_tool_use, "expected tool_use start event");
+        assert!(json_seen.contains("/tmp"));
+    }
+
+    #[tokio::test]
+    async fn multiple_turns_consumed_in_order() {
+        let mock = MockLlmClient::new(vec![
+            canned::simple_text_turn("turn-one"),
+            canned::simple_text_turn("turn-two"),
+        ]);
+        for expected in ["turn-one", "turn-two"] {
+            let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
+            let mut text = String::new();
+            while let Some(ev) = stream.next().await {
+                if let StreamEvent::ContentBlockDelta {
+                    delta: Delta::TextDelta { text: t },
+                    ..
+                } = ev.unwrap()
+                {
+                    text.push_str(&t);
+                }
+            }
+            assert_eq!(text, expected);
+        }
+        assert_eq!(mock.call_count(), 2);
+        assert_eq!(mock.remaining_turns(), 0);
+    }
+}
@@ -30,6 +30,9 @@ use std::pin::Pin;
 use std::time::{Duration, Instant};
 use uuid::Uuid;

+#[cfg(test)]
+pub mod mock;
+
 // === LlmClient Trait ===

 /// Type alias for boxed stream of SSE events
@@ -282,6 +282,10 @@ struct EvalArgs {
    /// Emit machine-readable JSON output
    #[arg(long, default_value_t = false)]
    json: bool,
+    /// Append one JSONL fixture line per step to `<DIR>/<scenario>.jsonl`.
+    /// Mock LLM tests can later replay these fixtures.
+    #[arg(long, value_name = "DIR")]
+    record: Option<PathBuf>,
 }

 #[derive(Args, Debug, Clone, Default)]
@@ -672,6 +676,7 @@ fn run_eval(args: EvalArgs) -> Result<()> {
        shell_command: args.shell_command,
        shell_expect_token: args.shell_expect_token,
        max_output_chars: args.max_output_chars,
+        record_dir: args.record.clone(),
        ..EvalHarnessConfig::default()
    };