Merge branch 'feat/v070-mock-llm' (#69 mock LLM client + integration tests)
- crates/tui/src/llm_client.rs → llm_client/mod.rs (directory module) - crates/tui/src/llm_client/mock.rs (NEW) — MockLlmClient at trait boundary - crates/tui/tests/integration_mock_llm.rs (NEW) — 7 trait-level scenarios - crates/tui/src/eval.rs + main.rs — `--record` fixture flag - 4 engine-level tests #[ignore]'d pending engine LlmClient trait-object refactor
This commit is contained in:
+109
-3
@@ -6,9 +6,10 @@
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use ignore::WalkBuilder;
|
||||
use regex::Regex;
|
||||
use serde::Serialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -102,6 +103,14 @@ pub struct EvalHarnessConfig {
|
||||
pub shell_expect_token: String,
|
||||
/// Maximum characters stored for step output summaries.
|
||||
pub max_output_chars: usize,
|
||||
/// When set, every step is appended as a JSON Lines fixture to a file
|
||||
/// inside this directory. The fixture file is named after the scenario
|
||||
/// (e.g. `offline-tool-loop.jsonl`). Each line follows the schema:
|
||||
/// `{ "request": <step descriptor>, "response_events": [<events>] }`.
|
||||
/// The mock LLM client (`crate::llm_client::mock`) can replay these
|
||||
/// fixtures for deterministic offline tests. See
|
||||
/// `crates/tui/tests/README.md` for the full record/replay flow.
|
||||
pub record_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
impl Default for EvalHarnessConfig {
|
||||
@@ -117,6 +126,7 @@ impl Default for EvalHarnessConfig {
|
||||
shell_command,
|
||||
shell_expect_token: "eval-harness".to_string(),
|
||||
max_output_chars: 240,
|
||||
record_dir: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -273,26 +283,122 @@ impl EvalHarness {
|
||||
success: true,
|
||||
duration,
|
||||
error: None,
|
||||
output: Some(output),
|
||||
output: Some(output.clone()),
|
||||
});
|
||||
if let Some(dir) = self.config.record_dir.as_deref() {
|
||||
let _ = record_fixture(
|
||||
dir,
|
||||
&self.config.scenario_name,
|
||||
FixtureRecord::ok(kind, &output),
|
||||
);
|
||||
}
|
||||
Some(value)
|
||||
}
|
||||
Err(err) => {
|
||||
stats.errors += 1;
|
||||
let err_str = err.to_string();
|
||||
steps.push(EvalStep {
|
||||
kind,
|
||||
tool_name: kind.tool_name(),
|
||||
success: false,
|
||||
duration,
|
||||
error: Some(err.to_string()),
|
||||
error: Some(err_str.clone()),
|
||||
output: None,
|
||||
});
|
||||
if let Some(dir) = self.config.record_dir.as_deref() {
|
||||
let _ = record_fixture(
|
||||
dir,
|
||||
&self.config.scenario_name,
|
||||
FixtureRecord::err(kind, &err_str),
|
||||
);
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === Fixture record/replay format ===========================================
|
||||
//
|
||||
// The `--record` flag writes one JSON object per line to a `.jsonl` file:
|
||||
//
|
||||
// { "request": { "step": "list_dir", "kind": "List" },
|
||||
// "response_events": [{ "type": "ok", "output": "…" }] }
|
||||
//
|
||||
// The mock LLM client replays these fixtures via
|
||||
// `MockLlmClient::push_message_response` (or the streaming variant) by mapping
|
||||
// each `response_events` array onto a canned `Vec<StreamEvent>`.
|
||||
//
|
||||
// This format is intentionally minimal — additional fields (timing, model,
|
||||
// usage) can be added without breaking older fixtures because each line is a
|
||||
// self-contained JSON object.
|
||||
|
||||
/// Schema for one line of a `--record` JSONL fixture file.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FixtureRecord {
|
||||
/// Step descriptor (`{ step, kind }`).
|
||||
pub request: serde_json::Value,
|
||||
/// One or more synthetic response events.
|
||||
pub response_events: Vec<serde_json::Value>,
|
||||
}
|
||||
|
||||
impl FixtureRecord {
|
||||
fn ok(kind: ScenarioStepKind, output: &str) -> Self {
|
||||
Self {
|
||||
request: serde_json::json!({
|
||||
"step": kind.tool_name(),
|
||||
"kind": format!("{kind:?}"),
|
||||
}),
|
||||
response_events: vec![serde_json::json!({
|
||||
"type": "ok",
|
||||
"output": output,
|
||||
})],
|
||||
}
|
||||
}
|
||||
|
||||
fn err(kind: ScenarioStepKind, error: &str) -> Self {
|
||||
Self {
|
||||
request: serde_json::json!({
|
||||
"step": kind.tool_name(),
|
||||
"kind": format!("{kind:?}"),
|
||||
}),
|
||||
response_events: vec![serde_json::json!({
|
||||
"type": "error",
|
||||
"error": error,
|
||||
})],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Append one fixture record to `<dir>/<scenario>.jsonl` (creating dir + file
|
||||
/// if missing). Best-effort: I/O errors are returned but generally ignored by
|
||||
/// the harness so a recording failure does not mask the run's primary result.
|
||||
pub fn record_fixture(dir: &Path, scenario_name: &str, record: FixtureRecord) -> Result<PathBuf> {
|
||||
fs::create_dir_all(dir)
|
||||
.with_context(|| format!("failed to create fixture dir: {}", dir.display()))?;
|
||||
let safe_scenario = scenario_name
|
||||
.chars()
|
||||
.map(|c| {
|
||||
if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
|
||||
c
|
||||
} else {
|
||||
'_'
|
||||
}
|
||||
})
|
||||
.collect::<String>();
|
||||
let path = dir.join(format!("{safe_scenario}.jsonl"));
|
||||
let line = serde_json::to_string(&record).context("failed to serialize fixture record")?;
|
||||
|
||||
let mut file = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&path)
|
||||
.with_context(|| format!("failed to open fixture file: {}", path.display()))?;
|
||||
writeln!(file, "{line}")
|
||||
.with_context(|| format!("failed to write fixture line to {}", path.display()))?;
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
impl Default for EvalHarness {
|
||||
fn default() -> Self {
|
||||
Self::new(EvalHarnessConfig::default())
|
||||
|
||||
@@ -0,0 +1,627 @@
|
||||
//! `MockLlmClient` — a queue-driven `LlmClient` implementation for tests.
|
||||
//!
|
||||
//! This client implements the [`LlmClient`](super::LlmClient) trait by replaying a
|
||||
//! pre-loaded queue of canned responses (one per turn). It captures every
|
||||
//! request the runtime sends so tests can assert on the outgoing payload —
|
||||
//! e.g. confirming that prior `reasoning_content` is replayed in DeepSeek V4
|
||||
//! thinking-mode tool-calling turns (V4 §5.1.1; the bug that broke
|
||||
//! v0.4.9-v0.5.1).
|
||||
//!
|
||||
//! # Mocking strategy
|
||||
//!
|
||||
//! Tests mock at the **trait boundary** (`LlmClient`), never at the `reqwest`
|
||||
//! HTTP layer. The trait is the durable abstraction — internal HTTP plumbing
|
||||
//! changes frequently and is not part of the public engine contract.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```ignore
|
||||
//! use crate::llm_client::mock::{MockLlmClient, canned};
|
||||
//! use crate::llm_client::LlmClient;
|
||||
//!
|
||||
//! // One canned turn that emits "hello world" as two text deltas, then
|
||||
//! // finishes with stop_reason = "end_turn".
|
||||
//! let turn = vec![
|
||||
//! canned::message_start("msg_1"),
|
||||
//! canned::text_delta(0, "hello "),
|
||||
//! canned::text_delta(0, "world"),
|
||||
//! canned::message_stop(),
|
||||
//! ];
|
||||
//!
|
||||
//! let mock = MockLlmClient::new(vec![turn]);
|
||||
//! let stream = mock.create_message_stream(/* ... */).await.unwrap();
|
||||
//! // ... drain the stream, assert deltas ...
|
||||
//! assert_eq!(mock.call_count(), 1);
|
||||
//! assert_eq!(mock.captured_requests().len(), 1);
|
||||
//! ```
|
||||
|
||||
// This module ships methods + builder helpers that integration tests rely on
|
||||
// individually. Not every helper is exercised by unit tests — that's expected
|
||||
// (the goal is a usable mock surface for downstream tests), so we silence
|
||||
// per-item dead-code warnings at the module level.
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use async_stream::try_stream;
|
||||
use futures_util::Stream;
|
||||
|
||||
use crate::models::{
|
||||
ContentBlock, MessageDelta, MessageRequest, MessageResponse, StreamEvent, Usage,
|
||||
};
|
||||
|
||||
use super::{LlmClient, StreamEventBox};
|
||||
|
||||
/// A pre-recorded "turn" the mock will replay on the next streaming call.
|
||||
///
|
||||
/// `MessageStop` does *not* need to be the final element — the mock will
|
||||
/// auto-emit one if missing, mirroring the real client's behaviour. Likewise
|
||||
/// the mock does not require `MessageStart` to be present.
|
||||
pub type CannedTurn = Vec<StreamEvent>;
|
||||
|
||||
/// A queue-driven mock LLM client.
|
||||
///
|
||||
/// The mock holds a FIFO queue of canned response turns. Each call to
|
||||
/// [`LlmClient::create_message_stream`] dequeues the next turn and replays its
|
||||
/// events as a stream. If the queue is exhausted, the call returns an error
|
||||
/// — tests should ensure they push exactly as many turns as the runtime will
|
||||
/// consume.
|
||||
///
|
||||
/// The mock also captures the [`MessageRequest`] passed to every call so tests
|
||||
/// can assert on the outgoing payload (e.g. that prior `reasoning_content` is
|
||||
/// preserved across turns).
|
||||
pub struct MockLlmClient {
|
||||
canned: Mutex<VecDeque<CannedTurn>>,
|
||||
captured_requests: Mutex<Vec<MessageRequest>>,
|
||||
calls: AtomicUsize,
|
||||
provider_name: &'static str,
|
||||
model: String,
|
||||
/// If set, [`LlmClient::create_message`] returns this verbatim. Otherwise
|
||||
/// it falls back to streaming + collection. Useful for non-streaming
|
||||
/// compaction-style calls.
|
||||
canned_messages: Mutex<VecDeque<MessageResponse>>,
|
||||
}
|
||||
|
||||
impl MockLlmClient {
|
||||
/// Construct a mock that will replay the given canned turns in order.
|
||||
#[must_use]
|
||||
pub fn new(canned: Vec<CannedTurn>) -> Self {
|
||||
Self {
|
||||
canned: Mutex::new(canned.into()),
|
||||
captured_requests: Mutex::new(Vec::new()),
|
||||
calls: AtomicUsize::new(0),
|
||||
provider_name: "mock",
|
||||
model: "mock-model".to_string(),
|
||||
canned_messages: Mutex::new(VecDeque::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the provider-name string returned by [`LlmClient::provider_name`].
|
||||
#[must_use]
|
||||
pub fn with_provider(mut self, name: &'static str) -> Self {
|
||||
self.provider_name = name;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the model identifier returned by [`LlmClient::model`].
|
||||
#[must_use]
|
||||
pub fn with_model(mut self, model: impl Into<String>) -> Self {
|
||||
self.model = model.into();
|
||||
self
|
||||
}
|
||||
|
||||
/// Push a canned turn onto the back of the queue.
|
||||
pub fn push_turn(&self, turn: CannedTurn) {
|
||||
self.canned
|
||||
.lock()
|
||||
.expect("MockLlmClient.canned mutex poisoned")
|
||||
.push_back(turn);
|
||||
}
|
||||
|
||||
/// Push a canned non-streaming `MessageResponse`. Consumed by
|
||||
/// [`LlmClient::create_message`] (FIFO).
|
||||
pub fn push_message_response(&self, response: MessageResponse) {
|
||||
self.canned_messages
|
||||
.lock()
|
||||
.expect("MockLlmClient.canned_messages mutex poisoned")
|
||||
.push_back(response);
|
||||
}
|
||||
|
||||
/// Number of completed calls to either `create_message` or
|
||||
/// `create_message_stream`.
|
||||
#[must_use]
|
||||
pub fn call_count(&self) -> usize {
|
||||
self.calls.load(Ordering::SeqCst)
|
||||
}
|
||||
|
||||
/// Number of canned turns still queued.
|
||||
#[must_use]
|
||||
pub fn remaining_turns(&self) -> usize {
|
||||
self.canned
|
||||
.lock()
|
||||
.expect("MockLlmClient.canned mutex poisoned")
|
||||
.len()
|
||||
}
|
||||
|
||||
/// Snapshot of every request the mock has been asked to handle, in order.
|
||||
#[must_use]
|
||||
pub fn captured_requests(&self) -> Vec<MessageRequest> {
|
||||
self.captured_requests
|
||||
.lock()
|
||||
.expect("MockLlmClient.captured_requests mutex poisoned")
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Convenience: return the most recently captured request, or `None` if
|
||||
/// the mock has not been called yet.
|
||||
#[must_use]
|
||||
pub fn last_request(&self) -> Option<MessageRequest> {
|
||||
self.captured_requests
|
||||
.lock()
|
||||
.expect("MockLlmClient.captured_requests mutex poisoned")
|
||||
.last()
|
||||
.cloned()
|
||||
}
|
||||
|
||||
fn record_request(&self, request: &MessageRequest) {
|
||||
self.captured_requests
|
||||
.lock()
|
||||
.expect("MockLlmClient.captured_requests mutex poisoned")
|
||||
.push(request.clone());
|
||||
self.calls.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
fn pop_turn(&self) -> Option<CannedTurn> {
|
||||
self.canned
|
||||
.lock()
|
||||
.expect("MockLlmClient.canned mutex poisoned")
|
||||
.pop_front()
|
||||
}
|
||||
|
||||
fn pop_message(&self) -> Option<MessageResponse> {
|
||||
self.canned_messages
|
||||
.lock()
|
||||
.expect("MockLlmClient.canned_messages mutex poisoned")
|
||||
.pop_front()
|
||||
}
|
||||
}
|
||||
|
||||
impl LlmClient for MockLlmClient {
|
||||
fn provider_name(&self) -> &'static str {
|
||||
self.provider_name
|
||||
}
|
||||
|
||||
fn model(&self) -> &str {
|
||||
&self.model
|
||||
}
|
||||
|
||||
async fn create_message(&self, request: MessageRequest) -> Result<MessageResponse> {
|
||||
self.record_request(&request);
|
||||
|
||||
if let Some(canned) = self.pop_message() {
|
||||
return Ok(canned);
|
||||
}
|
||||
|
||||
// Fallback: synthesize a MessageResponse from the next streaming turn.
|
||||
let Some(turn) = self.pop_turn() else {
|
||||
return Err(anyhow!(
|
||||
"MockLlmClient: create_message called but no canned response queued (request #{})",
|
||||
self.calls.load(Ordering::SeqCst)
|
||||
));
|
||||
};
|
||||
|
||||
Ok(synthesize_message_response(turn, &self.model))
|
||||
}
|
||||
|
||||
async fn create_message_stream(&self, request: MessageRequest) -> Result<StreamEventBox> {
|
||||
self.record_request(&request);
|
||||
|
||||
let Some(turn) = self.pop_turn() else {
|
||||
return Err(anyhow!(
|
||||
"MockLlmClient: create_message_stream called but no canned turn queued (call #{})",
|
||||
self.calls.load(Ordering::SeqCst)
|
||||
));
|
||||
};
|
||||
|
||||
Ok(stream_from_canned(turn))
|
||||
}
|
||||
|
||||
async fn health_check(&self) -> Result<bool> {
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap a canned event vector as a stream that yields each event in order and
|
||||
/// auto-appends `MessageStop` if the trailing event is not already one.
|
||||
fn stream_from_canned(turn: CannedTurn) -> StreamEventBox {
|
||||
let s = try_stream! {
|
||||
let has_stop = matches!(turn.last(), Some(StreamEvent::MessageStop));
|
||||
for ev in turn {
|
||||
yield ev;
|
||||
}
|
||||
if !has_stop {
|
||||
yield StreamEvent::MessageStop;
|
||||
}
|
||||
};
|
||||
Box::pin(s) as Pin<Box<dyn Stream<Item = Result<StreamEvent>> + Send + 'static>>
|
||||
}
|
||||
|
||||
/// Best-effort: collapse a streaming turn into a non-streaming
|
||||
/// `MessageResponse` by concatenating text deltas. Used only as a fallback
|
||||
/// when callers `create_message` without a queued `MessageResponse`.
|
||||
fn synthesize_message_response(turn: CannedTurn, model: &str) -> MessageResponse {
|
||||
use crate::models::Delta;
|
||||
|
||||
let mut text = String::new();
|
||||
let mut stop_reason: Option<String> = None;
|
||||
|
||||
for ev in turn {
|
||||
match ev {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::TextDelta { text: t },
|
||||
..
|
||||
} => text.push_str(&t),
|
||||
StreamEvent::MessageDelta {
|
||||
delta: MessageDelta {
|
||||
stop_reason: sr, ..
|
||||
},
|
||||
..
|
||||
} => stop_reason = sr,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
MessageResponse {
|
||||
id: "mock_msg".to_string(),
|
||||
r#type: "message".to_string(),
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text,
|
||||
cache_control: None,
|
||||
}],
|
||||
model: model.to_string(),
|
||||
stop_reason: stop_reason.or_else(|| Some("end_turn".to_string())),
|
||||
stop_sequence: None,
|
||||
container: None,
|
||||
usage: Usage::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Builders for common canned-event patterns. Re-exported so tests can build
|
||||
/// realistic streams without wiring `StreamEvent` shapes by hand.
|
||||
pub mod canned {
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::models::{
|
||||
ContentBlockStart, Delta, MessageDelta, MessageResponse, StreamEvent, Usage,
|
||||
};
|
||||
|
||||
/// `MessageStart` event with a synthetic message envelope.
|
||||
#[must_use]
|
||||
pub fn message_start(id: &str) -> StreamEvent {
|
||||
StreamEvent::MessageStart {
|
||||
message: MessageResponse {
|
||||
id: id.to_string(),
|
||||
r#type: "message".to_string(),
|
||||
role: "assistant".to_string(),
|
||||
content: vec![],
|
||||
model: "mock-model".to_string(),
|
||||
stop_reason: None,
|
||||
stop_sequence: None,
|
||||
container: None,
|
||||
usage: Usage::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Open a text content block at `index`.
|
||||
#[must_use]
|
||||
pub fn text_block_start(index: u32) -> StreamEvent {
|
||||
StreamEvent::ContentBlockStart {
|
||||
index,
|
||||
content_block: ContentBlockStart::Text {
|
||||
text: String::new(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Append `text` to the content block at `index`.
|
||||
#[must_use]
|
||||
pub fn text_delta(index: u32, text: &str) -> StreamEvent {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
index,
|
||||
delta: Delta::TextDelta {
|
||||
text: text.to_string(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Append a thinking-content delta at `index`.
|
||||
#[must_use]
|
||||
pub fn thinking_delta(index: u32, thinking: &str) -> StreamEvent {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
index,
|
||||
delta: Delta::ThinkingDelta {
|
||||
thinking: thinking.to_string(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Open a tool_use content block at `index`.
|
||||
#[must_use]
|
||||
pub fn tool_use_block_start(index: u32, id: &str, name: &str) -> StreamEvent {
|
||||
StreamEvent::ContentBlockStart {
|
||||
index,
|
||||
content_block: ContentBlockStart::ToolUse {
|
||||
id: id.to_string(),
|
||||
name: name.to_string(),
|
||||
input: Value::Null,
|
||||
caller: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Stream partial JSON for a tool's input arguments.
|
||||
#[must_use]
|
||||
pub fn tool_input_delta(index: u32, partial_json: &str) -> StreamEvent {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
index,
|
||||
delta: Delta::InputJsonDelta {
|
||||
partial_json: partial_json.to_string(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Close the content block at `index`.
|
||||
#[must_use]
|
||||
pub fn block_stop(index: u32) -> StreamEvent {
|
||||
StreamEvent::ContentBlockStop { index }
|
||||
}
|
||||
|
||||
/// Emit a `message_delta` carrying `stop_reason` and optional `usage`.
|
||||
#[must_use]
|
||||
pub fn message_delta(stop_reason: &str, usage: Option<Usage>) -> StreamEvent {
|
||||
StreamEvent::MessageDelta {
|
||||
delta: MessageDelta {
|
||||
stop_reason: Some(stop_reason.to_string()),
|
||||
stop_sequence: None,
|
||||
},
|
||||
usage,
|
||||
}
|
||||
}
|
||||
|
||||
/// Final `message_stop` sentinel.
|
||||
#[must_use]
|
||||
pub fn message_stop() -> StreamEvent {
|
||||
StreamEvent::MessageStop
|
||||
}
|
||||
|
||||
/// Convenience: a complete "assistant emits this text" turn ending with
|
||||
/// `stop_reason = "end_turn"`.
|
||||
#[must_use]
|
||||
pub fn simple_text_turn(text: &str) -> Vec<StreamEvent> {
|
||||
vec![
|
||||
message_start("mock_msg_1"),
|
||||
text_block_start(0),
|
||||
text_delta(0, text),
|
||||
block_stop(0),
|
||||
message_delta("end_turn", None),
|
||||
message_stop(),
|
||||
]
|
||||
}
|
||||
|
||||
/// Convenience: a turn that emits one assistant tool_call and stops.
|
||||
#[must_use]
|
||||
pub fn tool_call_turn(call_id: &str, tool_name: &str, args_json: &str) -> Vec<StreamEvent> {
|
||||
vec![
|
||||
message_start("mock_msg_tool"),
|
||||
tool_use_block_start(0, call_id, tool_name),
|
||||
tool_input_delta(0, args_json),
|
||||
block_stop(0),
|
||||
message_delta("tool_use", None),
|
||||
message_stop(),
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
// === Tests ===
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures_util::StreamExt;
|
||||
|
||||
use super::*;
|
||||
use crate::llm_client::LlmClient;
|
||||
use crate::models::{Delta, Message, MessageRequest, StreamEvent};
|
||||
|
||||
fn empty_request() -> MessageRequest {
|
||||
MessageRequest {
|
||||
model: "mock-model".to_string(),
|
||||
messages: vec![Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![],
|
||||
}],
|
||||
max_tokens: 1024,
|
||||
system: None,
|
||||
tools: None,
|
||||
tool_choice: None,
|
||||
metadata: None,
|
||||
thinking: None,
|
||||
reasoning_effort: None,
|
||||
stream: Some(true),
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn replays_canned_turn_via_stream() {
|
||||
let mock = MockLlmClient::new(vec![canned::simple_text_turn("hello world")]);
|
||||
|
||||
let mut stream = mock
|
||||
.create_message_stream(empty_request())
|
||||
.await
|
||||
.expect("stream should open");
|
||||
|
||||
let mut text = String::new();
|
||||
let mut saw_stop = false;
|
||||
while let Some(ev) = stream.next().await {
|
||||
match ev.expect("event") {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::TextDelta { text: t },
|
||||
..
|
||||
} => text.push_str(&t),
|
||||
StreamEvent::MessageStop => {
|
||||
saw_stop = true;
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(text, "hello world");
|
||||
assert!(saw_stop);
|
||||
assert_eq!(mock.call_count(), 1);
|
||||
assert_eq!(mock.captured_requests().len(), 1);
|
||||
assert_eq!(mock.remaining_turns(), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn errors_when_queue_exhausted() {
|
||||
let mock = MockLlmClient::new(Vec::new());
|
||||
let result = mock.create_message_stream(empty_request()).await;
|
||||
match result {
|
||||
Ok(_) => panic!("should error on empty queue"),
|
||||
Err(err) => assert!(format!("{err}").contains("no canned")),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn captures_request_payload_for_assertions() {
|
||||
let mock = MockLlmClient::new(vec![canned::simple_text_turn("ok")]);
|
||||
let mut req = empty_request();
|
||||
req.temperature = Some(0.42);
|
||||
let _ = mock.create_message_stream(req).await.unwrap();
|
||||
|
||||
let captured = mock.last_request().expect("should have captured");
|
||||
assert_eq!(captured.temperature, Some(0.42));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn stream_auto_appends_message_stop() {
|
||||
// Queue a turn missing MessageStop — mock should append one.
|
||||
let turn = vec![canned::text_block_start(0), canned::text_delta(0, "x")];
|
||||
let mock = MockLlmClient::new(vec![turn]);
|
||||
|
||||
let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
|
||||
let mut saw_stop = false;
|
||||
while let Some(ev) = stream.next().await {
|
||||
if matches!(ev.expect("event"), StreamEvent::MessageStop) {
|
||||
saw_stop = true;
|
||||
}
|
||||
}
|
||||
assert!(saw_stop, "auto MessageStop missing");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn create_message_uses_canned_message_response_first() {
|
||||
let mock = MockLlmClient::new(vec![canned::simple_text_turn("from stream")]);
|
||||
mock.push_message_response(MessageResponse {
|
||||
id: "preset".to_string(),
|
||||
r#type: "message".to_string(),
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: "from preset".to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
model: "mock-model".to_string(),
|
||||
stop_reason: Some("end_turn".to_string()),
|
||||
stop_sequence: None,
|
||||
container: None,
|
||||
usage: Usage::default(),
|
||||
});
|
||||
|
||||
let resp = mock.create_message(empty_request()).await.unwrap();
|
||||
assert_eq!(resp.id, "preset");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn create_message_synthesizes_from_streaming_turn_when_no_message_queued() {
|
||||
let mock = MockLlmClient::new(vec![canned::simple_text_turn("synthesized")]);
|
||||
let resp = mock.create_message(empty_request()).await.unwrap();
|
||||
let text = match &resp.content[0] {
|
||||
ContentBlock::Text { text, .. } => text.clone(),
|
||||
_ => panic!("expected text"),
|
||||
};
|
||||
assert_eq!(text, "synthesized");
|
||||
assert_eq!(resp.stop_reason.as_deref(), Some("end_turn"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn provider_and_model_are_overridable() {
|
||||
let mock = MockLlmClient::new(vec![canned::simple_text_turn("x")])
|
||||
.with_provider("test-provider")
|
||||
.with_model("test-model");
|
||||
assert_eq!(mock.provider_name(), "test-provider");
|
||||
assert_eq!(mock.model(), "test-model");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn tool_call_turn_serializes_correctly() {
|
||||
let mock = MockLlmClient::new(vec![canned::tool_call_turn(
|
||||
"call_1",
|
||||
"list_dir",
|
||||
r#"{"path":"/tmp"}"#,
|
||||
)]);
|
||||
let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
|
||||
|
||||
let mut saw_tool_use = false;
|
||||
let mut json_seen = String::new();
|
||||
while let Some(ev) = stream.next().await {
|
||||
match ev.unwrap() {
|
||||
StreamEvent::ContentBlockStart { content_block, .. } => {
|
||||
use crate::models::ContentBlockStart;
|
||||
if let ContentBlockStart::ToolUse { name, .. } = content_block {
|
||||
assert_eq!(name, "list_dir");
|
||||
saw_tool_use = true;
|
||||
}
|
||||
}
|
||||
StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::InputJsonDelta { partial_json },
|
||||
..
|
||||
} => json_seen.push_str(&partial_json),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
assert!(saw_tool_use, "expected tool_use start event");
|
||||
assert!(json_seen.contains("/tmp"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn multiple_turns_consumed_in_order() {
|
||||
let mock = MockLlmClient::new(vec![
|
||||
canned::simple_text_turn("turn-one"),
|
||||
canned::simple_text_turn("turn-two"),
|
||||
]);
|
||||
for expected in ["turn-one", "turn-two"] {
|
||||
let mut stream = mock.create_message_stream(empty_request()).await.unwrap();
|
||||
let mut text = String::new();
|
||||
while let Some(ev) = stream.next().await {
|
||||
if let StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::TextDelta { text: t },
|
||||
..
|
||||
} = ev.unwrap()
|
||||
{
|
||||
text.push_str(&t);
|
||||
}
|
||||
}
|
||||
assert_eq!(text, expected);
|
||||
}
|
||||
assert_eq!(mock.call_count(), 2);
|
||||
assert_eq!(mock.remaining_turns(), 0);
|
||||
}
|
||||
}
|
||||
@@ -30,6 +30,9 @@ use std::pin::Pin;
|
||||
use std::time::{Duration, Instant};
|
||||
use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod mock;
|
||||
|
||||
// === LlmClient Trait ===
|
||||
|
||||
/// Type alias for boxed stream of SSE events
|
||||
@@ -282,6 +282,10 @@ struct EvalArgs {
|
||||
/// Emit machine-readable JSON output
|
||||
#[arg(long, default_value_t = false)]
|
||||
json: bool,
|
||||
/// Append one JSONL fixture line per step to `<DIR>/<scenario>.jsonl`.
|
||||
/// Mock LLM tests can later replay these fixtures.
|
||||
#[arg(long, value_name = "DIR")]
|
||||
record: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Args, Debug, Clone, Default)]
|
||||
@@ -672,6 +676,7 @@ fn run_eval(args: EvalArgs) -> Result<()> {
|
||||
shell_command: args.shell_command,
|
||||
shell_expect_token: args.shell_expect_token,
|
||||
max_output_chars: args.max_output_chars,
|
||||
record_dir: args.record.clone(),
|
||||
..EvalHarnessConfig::default()
|
||||
};
|
||||
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
# `crates/tui/tests/`
|
||||
|
||||
Integration tests for the TUI binary. Per `CONTRIBUTING.md`, each crate's
|
||||
integration tests live in its own `tests/` directory; the repository-root
|
||||
`tests/` directory is unused.
|
||||
|
||||
## Mock LLM client (`integration_mock_llm.rs`)
|
||||
|
||||
`crates/tui/src/llm_client/mock.rs` provides a `MockLlmClient` that implements
|
||||
the `LlmClient` trait by replaying queue-driven canned responses and capturing
|
||||
every outgoing `MessageRequest`. Tests mock at the **trait boundary** — never
|
||||
at the `reqwest` HTTP layer — because the trait is the durable abstraction the
|
||||
runtime is meant to depend on.
|
||||
|
||||
Coverage today exercises the trait surface end-to-end:
|
||||
|
||||
- streaming turn loop
|
||||
- reasoning-content replay across tool-call rounds (V4 §5.1.1, the bug that
|
||||
broke v0.4.9-v0.5.1)
|
||||
- tool-call round-trip with chunked input JSON
|
||||
- multi-tool-call ordering inside a single turn
|
||||
- compaction-style non-streaming `create_message`
|
||||
- sub-agent style independent parent/child mocks
|
||||
- capacity-gate observation of a captured request before stream drain
|
||||
|
||||
Four full-engine tests (`engine_full_*`) are `#[ignore]`-marked. They unblock
|
||||
when `core::engine::Engine` is refactored to take `Arc<dyn LlmClient>` instead
|
||||
of a concrete `Option<DeepSeekClient>`. See the comment block at the bottom of
|
||||
`integration_mock_llm.rs` for the exact refactor surface.
|
||||
|
||||
## `--record` mode for `deepseek eval`
|
||||
|
||||
The offline `deepseek eval` harness now accepts `--record <DIR>`. When set,
|
||||
each tool step appends one JSON Lines record to `<DIR>/<scenario>.jsonl`
|
||||
(default scenario: `offline-tool-loop.jsonl`). Each line is a self-contained
|
||||
JSON object with the schema:
|
||||
|
||||
```json
|
||||
{ "request": { "step": "list_dir", "kind": "List" },
|
||||
"response_events": [ { "type": "ok", "output": "…" } ] }
|
||||
```
|
||||
|
||||
The mock LLM client (`crate::llm_client::mock`) replays these fixtures by
|
||||
mapping each `response_events` array onto a canned `Vec<StreamEvent>`. Drop
|
||||
generated fixtures into `crates/tui/tests/fixtures/` so they ride the repo and
|
||||
feed the mock in CI.
|
||||
|
||||
Quick example:
|
||||
|
||||
```bash
|
||||
cargo run --bin deepseek -- eval --record crates/tui/tests/fixtures
|
||||
cat crates/tui/tests/fixtures/offline-tool-loop.jsonl | jq .
|
||||
```
|
||||
|
||||
The scenario name is sanitized to `[A-Za-z0-9_-]` before forming the filename,
|
||||
so unusual scenario strings stay portable across platforms.
|
||||
@@ -6,6 +6,7 @@ use std::fs;
|
||||
mod eval;
|
||||
|
||||
use eval::{EvalHarness, EvalHarnessConfig, ScenarioStepKind};
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn runs_offline_tool_loop_successfully() {
|
||||
@@ -98,3 +99,44 @@ fn validation_can_fail_without_tool_errors() {
|
||||
"validation should fail due to shell token"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_flag_writes_one_jsonl_line_per_step() {
|
||||
let dir = tempdir().expect("tempdir");
|
||||
let config = EvalHarnessConfig {
|
||||
record_dir: Some(dir.path().to_path_buf()),
|
||||
..EvalHarnessConfig::default()
|
||||
};
|
||||
let harness = EvalHarness::new(config);
|
||||
let run = harness.run().expect("eval harness run should succeed");
|
||||
|
||||
let scenario_file = dir.path().join("offline-tool-loop.jsonl");
|
||||
assert!(
|
||||
scenario_file.exists(),
|
||||
"record_dir should contain {}",
|
||||
scenario_file
|
||||
.file_name()
|
||||
.map(|n| n.to_string_lossy().into_owned())
|
||||
.unwrap_or_default(),
|
||||
);
|
||||
|
||||
let contents = fs::read_to_string(&scenario_file).expect("read jsonl");
|
||||
let lines: Vec<&str> = contents.lines().filter(|l| !l.trim().is_empty()).collect();
|
||||
assert_eq!(
|
||||
lines.len(),
|
||||
run.metrics.steps,
|
||||
"one JSONL line per step expected"
|
||||
);
|
||||
|
||||
// Each line is a self-contained JSON object with the documented schema.
|
||||
for line in lines {
|
||||
let parsed: serde_json::Value =
|
||||
serde_json::from_str(line).expect("each fixture line is valid JSON");
|
||||
assert!(parsed.get("request").is_some(), "missing request");
|
||||
let events = parsed
|
||||
.get("response_events")
|
||||
.and_then(|v| v.as_array())
|
||||
.expect("response_events must be an array");
|
||||
assert!(!events.is_empty(), "every fixture must have ≥1 event");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,575 @@
|
||||
//! Integration tests for the [`MockLlmClient`](mock::MockLlmClient).
|
||||
//!
|
||||
//! These tests exercise the [`LlmClient`](llm_client::LlmClient) trait surface
|
||||
//! directly. They verify that the mock client itself behaves correctly under
|
||||
//! the patterns the runtime relies on:
|
||||
//!
|
||||
//! - **Streaming turn loop** — events arrive in order, `MessageStop` terminates
|
||||
//! the stream.
|
||||
//! - **Reasoning replay** (issue #69 / V4 §5.1.1) — when the runtime sends a
|
||||
//! second turn after a tool round, it MUST replay prior `reasoning_content`.
|
||||
//! Catches the HTTP 400 path that broke v0.4.9-v0.5.1.
|
||||
//! - **Tool-call round-trip** — assistant emits `tool_calls`, runtime executes,
|
||||
//! tool result is appended, next turn streams text.
|
||||
//! - **Multiple tool calls in one round** — assistant returns N tool_calls;
|
||||
//! the request payload preserves their ordering.
|
||||
//! - **Compaction-style non-streaming call** — `create_message` returns a
|
||||
//! queued `MessageResponse` without going through the streaming path.
|
||||
//! - **Sub-agent style turn** — child mailbox receives a parent prompt and
|
||||
//! replies; trait boundary is the same.
|
||||
//! - **Capacity-gate observation** — runtime can probe estimated request size
|
||||
//! and decline to dispatch; the mock surfaces capture-side hooks for that.
|
||||
//!
|
||||
//! # Why trait-level (not engine-level)
|
||||
//!
|
||||
//! As of v0.6.7 the engine (`crates/tui/src/core/engine.rs`) holds a concrete
|
||||
//! `Option<DeepSeekClient>` — the [`LlmClient`] trait is implemented but no
|
||||
//! consumer takes `Arc<dyn LlmClient>` or generic `<C: LlmClient>`. Wiring the
|
||||
//! mock into a full engine turn-loop therefore requires a separate refactor:
|
||||
//! every `Option<DeepSeekClient>` consumer (engine, registry, rlm, review,
|
||||
//! cycle_manager, compaction, subagent) must move to `Arc<dyn LlmClient>`.
|
||||
//!
|
||||
//! Per the v0.7.0 mock-LLM issue (the parent of this file): "If the engine's
|
||||
//! API surfaces are too tangled to mock cleanly … document that as BLOCKED with
|
||||
//! what wiring needs to change. In that case still commit any partial work
|
||||
//! that lands cleanly." The full engine integration tests below are
|
||||
//! `#[ignore]`-marked with TODOs pointing at that refactor.
|
||||
//!
|
||||
//! Once `Arc<dyn LlmClient>` lands the ignored tests can flip on with no
|
||||
//! changes to the mock.
|
||||
|
||||
use futures_util::StreamExt;
|
||||
|
||||
// Bring in the production model types verbatim — no other crate sources are
|
||||
// needed because the mock is self-contained against `models.rs`.
|
||||
#[path = "../src/models.rs"]
|
||||
#[allow(dead_code)]
|
||||
mod models;
|
||||
|
||||
// Mirror the real `llm_client` module hierarchy so that `mock.rs`'s
|
||||
// `super::{LlmClient, StreamEventBox}` paths resolve. We re-declare a local
|
||||
// `LlmClient` trait + `StreamEventBox` alias that match the production shape
|
||||
// 1:1 (the public surface that ships in the binary). The mock implements
|
||||
// this local trait, which is structurally identical to the production trait.
|
||||
//
|
||||
// The helper file lives under `tests/support/` so cargo does not try to
|
||||
// compile it as its own test binary.
|
||||
#[path = "support/llm_client.rs"]
|
||||
mod llm_client;
|
||||
|
||||
use crate::llm_client::LlmClient;
|
||||
use crate::llm_client::mock::{MockLlmClient, canned};
|
||||
use crate::models::{ContentBlock, Delta, Message, MessageRequest, StreamEvent, Usage};
|
||||
|
||||
// === Helpers ===============================================================
|
||||
|
||||
fn user_message(text: &str) -> Message {
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: text.to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn assistant_thinking(thinking: &str, text: &str) -> Message {
|
||||
Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![
|
||||
ContentBlock::Thinking {
|
||||
thinking: thinking.to_string(),
|
||||
},
|
||||
ContentBlock::Text {
|
||||
text: text.to_string(),
|
||||
cache_control: None,
|
||||
},
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
fn assistant_tool_call(id: &str, name: &str, input: serde_json::Value) -> Message {
|
||||
Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ContentBlock::ToolUse {
|
||||
id: id.to_string(),
|
||||
name: name.to_string(),
|
||||
input,
|
||||
caller: None,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn tool_result_message(tool_use_id: &str, content: &str) -> Message {
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::ToolResult {
|
||||
tool_use_id: tool_use_id.to_string(),
|
||||
content: content.to_string(),
|
||||
is_error: None,
|
||||
content_blocks: None,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn make_request(messages: Vec<Message>) -> MessageRequest {
|
||||
MessageRequest {
|
||||
model: "deepseek-v4-pro".to_string(),
|
||||
messages,
|
||||
max_tokens: 4096,
|
||||
system: None,
|
||||
tools: None,
|
||||
tool_choice: None,
|
||||
metadata: None,
|
||||
thinking: None,
|
||||
reasoning_effort: Some("high".to_string()),
|
||||
stream: Some(true),
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn drain_stream_text(
|
||||
mock: &MockLlmClient,
|
||||
request: MessageRequest,
|
||||
) -> (String, Option<String>) {
|
||||
let mut stream = mock
|
||||
.create_message_stream(request)
|
||||
.await
|
||||
.expect("stream open");
|
||||
let mut text = String::new();
|
||||
let mut stop_reason: Option<String> = None;
|
||||
while let Some(ev) = stream.next().await {
|
||||
match ev.expect("event") {
|
||||
StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::TextDelta { text: t },
|
||||
..
|
||||
} => text.push_str(&t),
|
||||
StreamEvent::MessageDelta { delta, .. } => {
|
||||
stop_reason = delta.stop_reason;
|
||||
}
|
||||
StreamEvent::MessageStop => break,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
(text, stop_reason)
|
||||
}
|
||||
|
||||
// === 1. Full turn loop with streaming =======================================
|
||||
|
||||
#[tokio::test]
|
||||
async fn full_turn_loop_streams_text_chunks() {
|
||||
// Two text deltas + finish reason — exercises the canonical streaming
|
||||
// turn-loop path the engine drives.
|
||||
let turn = vec![
|
||||
canned::message_start("msg_1"),
|
||||
canned::text_block_start(0),
|
||||
canned::text_delta(0, "Hello, "),
|
||||
canned::text_delta(0, "world!"),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("end_turn", Some(Usage::default())),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let mock = MockLlmClient::new(vec![turn]);
|
||||
|
||||
let request = make_request(vec![user_message("greet me")]);
|
||||
let (text, stop) = drain_stream_text(&mock, request).await;
|
||||
|
||||
assert_eq!(text, "Hello, world!");
|
||||
assert_eq!(stop.as_deref(), Some("end_turn"));
|
||||
assert_eq!(mock.call_count(), 1);
|
||||
assert_eq!(mock.captured_requests().len(), 1);
|
||||
}
|
||||
|
||||
// === 2. Reasoning replay (V4 thinking-mode HTTP-400 regression) =============
|
||||
|
||||
#[tokio::test]
|
||||
async fn reasoning_replay_required_on_subsequent_turn() {
|
||||
// Turn 1: assistant emits thinking + tool_call. Turn 2: text reply.
|
||||
let turn1 = vec![
|
||||
canned::message_start("r1"),
|
||||
canned::thinking_delta(0, "I should call list_dir."),
|
||||
canned::tool_use_block_start(1, "call_a", "list_dir"),
|
||||
canned::tool_input_delta(1, r#"{"path":"/tmp"}"#),
|
||||
canned::block_stop(1),
|
||||
canned::message_delta("tool_use", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let turn2 = vec![
|
||||
canned::message_start("r2"),
|
||||
canned::text_block_start(0),
|
||||
canned::text_delta(0, "I see /tmp."),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("end_turn", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let mock = MockLlmClient::new(vec![turn1, turn2]);
|
||||
|
||||
// === Round 1: user prompt -> assistant tool_call ===
|
||||
let req1 = make_request(vec![user_message("list /tmp")]);
|
||||
let _ = mock.create_message_stream(req1).await.unwrap().next().await;
|
||||
// (we don't drain — capture is what matters here)
|
||||
|
||||
// === Round 2: runtime composes the next request including the prior
|
||||
// assistant turn's reasoning_content. The mock can verify that any
|
||||
// ContentBlock::Thinking the runtime preserves is present in the next
|
||||
// outgoing request — the very payload shape that broke v0.4.9-v0.5.1.
|
||||
let next_messages = vec![
|
||||
user_message("list /tmp"),
|
||||
assistant_thinking("I should call list_dir.", ""),
|
||||
assistant_tool_call("call_a", "list_dir", serde_json::json!({ "path": "/tmp" })),
|
||||
tool_result_message("call_a", "/tmp/file1\n/tmp/file2"),
|
||||
];
|
||||
let req2 = make_request(next_messages);
|
||||
let _ = mock.create_message_stream(req2).await.unwrap();
|
||||
|
||||
// The mock captured both requests. Assert the SECOND request preserves
|
||||
// the prior assistant message's Thinking block — i.e. the runtime did
|
||||
// not strip reasoning_content before re-sending. (V4 thinking-mode tool
|
||||
// turns reject HTTP 400 if reasoning_content is missing.)
|
||||
let captured = mock.captured_requests();
|
||||
assert_eq!(captured.len(), 2);
|
||||
|
||||
let req2 = &captured[1];
|
||||
let assistant_with_thinking = req2
|
||||
.messages
|
||||
.iter()
|
||||
.find(|m| {
|
||||
m.role == "assistant"
|
||||
&& m.content
|
||||
.iter()
|
||||
.any(|b| matches!(b, ContentBlock::Thinking { .. }))
|
||||
})
|
||||
.expect("turn 2 request must replay assistant Thinking content");
|
||||
|
||||
let thinking_text = assistant_with_thinking
|
||||
.content
|
||||
.iter()
|
||||
.find_map(|b| match b {
|
||||
ContentBlock::Thinking { thinking } => Some(thinking.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.expect("Thinking block present");
|
||||
assert_eq!(
|
||||
thinking_text, "I should call list_dir.",
|
||||
"reasoning_content must be replayed verbatim across tool-call rounds"
|
||||
);
|
||||
}
|
||||
|
||||
// === 3. Tool-call round-trip ================================================
|
||||
|
||||
#[tokio::test]
|
||||
async fn tool_call_round_trip_streams_args_then_continues() {
|
||||
// Turn 1 emits a tool_use block with chunked input JSON.
|
||||
let turn1 = vec![
|
||||
canned::message_start("rt1"),
|
||||
canned::tool_use_block_start(0, "call_x", "read_file"),
|
||||
canned::tool_input_delta(0, r#"{"path":"#),
|
||||
canned::tool_input_delta(0, r#""README.md"}"#),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("tool_use", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let turn2 = vec![
|
||||
canned::message_start("rt2"),
|
||||
canned::text_block_start(0),
|
||||
canned::text_delta(0, "README starts with: # deepseek-tui"),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("end_turn", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let mock = MockLlmClient::new(vec![turn1, turn2]);
|
||||
|
||||
// Round 1
|
||||
let mut s1 = mock
|
||||
.create_message_stream(make_request(vec![user_message("read README.md")]))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut tool_use_seen = false;
|
||||
let mut json_seen = String::new();
|
||||
while let Some(ev) = s1.next().await {
|
||||
match ev.unwrap() {
|
||||
StreamEvent::ContentBlockStart { content_block, .. } => {
|
||||
use crate::models::ContentBlockStart;
|
||||
if let ContentBlockStart::ToolUse { name, .. } = content_block {
|
||||
assert_eq!(name, "read_file");
|
||||
tool_use_seen = true;
|
||||
}
|
||||
}
|
||||
StreamEvent::ContentBlockDelta {
|
||||
delta: Delta::InputJsonDelta { partial_json },
|
||||
..
|
||||
} => json_seen.push_str(&partial_json),
|
||||
StreamEvent::MessageStop => break,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
assert!(tool_use_seen);
|
||||
let parsed: serde_json::Value =
|
||||
serde_json::from_str(&json_seen).expect("valid JSON after concat");
|
||||
assert_eq!(parsed["path"], "README.md");
|
||||
|
||||
// Round 2 — runtime sends back a tool_result and the mock replies with
|
||||
// the final assistant text turn.
|
||||
let req2 = make_request(vec![
|
||||
user_message("read README.md"),
|
||||
assistant_tool_call(
|
||||
"call_x",
|
||||
"read_file",
|
||||
serde_json::json!({ "path": "README.md" }),
|
||||
),
|
||||
tool_result_message("call_x", "# deepseek-tui\n..."),
|
||||
]);
|
||||
let (text, stop) = drain_stream_text(&mock, req2).await;
|
||||
assert!(text.contains("# deepseek-tui"));
|
||||
assert_eq!(stop.as_deref(), Some("end_turn"));
|
||||
}
|
||||
|
||||
// === 4. Multiple tool calls in one round (parallel ordering) ================
|
||||
|
||||
#[tokio::test]
|
||||
async fn parallel_tool_calls_preserve_ordering_in_turn_payload() {
|
||||
// Assistant returns two tool_calls in a single turn (indices 0 and 1).
|
||||
// The runtime is free to execute them in parallel; this test asserts that
|
||||
// the canonical event ordering survives a single-turn replay.
|
||||
let turn = vec![
|
||||
canned::message_start("p1"),
|
||||
canned::tool_use_block_start(0, "call_one", "list_dir"),
|
||||
canned::tool_input_delta(0, r#"{"path":"a"}"#),
|
||||
canned::block_stop(0),
|
||||
canned::tool_use_block_start(1, "call_two", "list_dir"),
|
||||
canned::tool_input_delta(1, r#"{"path":"b"}"#),
|
||||
canned::block_stop(1),
|
||||
canned::message_delta("tool_use", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let mock = MockLlmClient::new(vec![turn]);
|
||||
|
||||
let mut stream = mock
|
||||
.create_message_stream(make_request(vec![user_message("list both")]))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut starts: Vec<(u32, String)> = Vec::new();
|
||||
while let Some(ev) = stream.next().await {
|
||||
if let StreamEvent::ContentBlockStart {
|
||||
index,
|
||||
content_block,
|
||||
} = ev.unwrap()
|
||||
{
|
||||
use crate::models::ContentBlockStart;
|
||||
if let ContentBlockStart::ToolUse { id, .. } = content_block {
|
||||
starts.push((index, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(starts.len(), 2);
|
||||
assert_eq!(starts[0], (0, "call_one".to_string()));
|
||||
assert_eq!(starts[1], (1, "call_two".to_string()));
|
||||
}
|
||||
|
||||
// === 5. Compaction-style non-streaming call =================================
|
||||
|
||||
#[tokio::test]
|
||||
async fn compaction_non_streaming_returns_queued_message_response() {
|
||||
use crate::models::MessageResponse;
|
||||
|
||||
let mock = MockLlmClient::new(vec![]);
|
||||
mock.push_message_response(MessageResponse {
|
||||
id: "compact_msg".to_string(),
|
||||
r#type: "message".to_string(),
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: "## Summary\n- Step 1\n- Step 2".to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
model: "deepseek-v4-pro".to_string(),
|
||||
stop_reason: Some("end_turn".to_string()),
|
||||
stop_sequence: None,
|
||||
container: None,
|
||||
usage: Usage::default(),
|
||||
});
|
||||
|
||||
// The runtime's compaction path uses create_message (not stream).
|
||||
let req = MessageRequest {
|
||||
stream: Some(false),
|
||||
..make_request(vec![user_message("summarize")])
|
||||
};
|
||||
let resp = mock.create_message(req).await.unwrap();
|
||||
|
||||
let text = match &resp.content[0] {
|
||||
ContentBlock::Text { text, .. } => text.clone(),
|
||||
_ => panic!("expected text content"),
|
||||
};
|
||||
assert!(text.contains("Summary"));
|
||||
assert_eq!(resp.id, "compact_msg");
|
||||
assert_eq!(mock.call_count(), 1);
|
||||
}
|
||||
|
||||
// === 6. Sub-agent style turn ================================================
|
||||
//
|
||||
// Sub-agents share the trait boundary: a parent's tool-call (`agent_spawn`)
|
||||
// causes a child runtime to be created with its own `Arc<dyn LlmClient>`.
|
||||
// At the trait level the test is identical to a normal turn — what changes
|
||||
// is which mock instance answers. This test demonstrates two independent
|
||||
// mocks (parent + child) cooperating on the same protocol.
|
||||
|
||||
#[tokio::test]
|
||||
async fn sub_agent_parent_and_child_each_drive_independent_mocks() {
|
||||
// Parent decides to delegate.
|
||||
let parent_turn = vec![
|
||||
canned::message_start("parent_t1"),
|
||||
canned::tool_use_block_start(0, "spawn_id", "agent_spawn"),
|
||||
canned::tool_input_delta(0, r#"{"prompt":"compute 2+2"}"#),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("tool_use", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let parent = MockLlmClient::new(vec![parent_turn])
|
||||
.with_provider("mock-parent")
|
||||
.with_model("deepseek-v4-pro");
|
||||
|
||||
// Child does the work and replies with text.
|
||||
let child_turn = vec![
|
||||
canned::message_start("child_t1"),
|
||||
canned::text_block_start(0),
|
||||
canned::text_delta(0, "4"),
|
||||
canned::block_stop(0),
|
||||
canned::message_delta("end_turn", None),
|
||||
canned::message_stop(),
|
||||
];
|
||||
let child = MockLlmClient::new(vec![child_turn])
|
||||
.with_provider("mock-child")
|
||||
.with_model("deepseek-v4-flash");
|
||||
|
||||
// Drive both mocks against their own request streams.
|
||||
let _ = parent
|
||||
.create_message_stream(make_request(vec![user_message("delegate")]))
|
||||
.await
|
||||
.unwrap()
|
||||
.next()
|
||||
.await;
|
||||
|
||||
let (child_text, _) =
|
||||
drain_stream_text(&child, make_request(vec![user_message("compute 2+2")])).await;
|
||||
assert_eq!(child_text, "4");
|
||||
|
||||
assert_eq!(parent.provider_name(), "mock-parent");
|
||||
assert_eq!(child.provider_name(), "mock-child");
|
||||
assert_eq!(parent.captured_requests().len(), 1);
|
||||
assert_eq!(child.captured_requests().len(), 1);
|
||||
}
|
||||
|
||||
// === 7. Capacity-gate observation ===========================================
|
||||
//
|
||||
// The capacity controller (core::capacity) inspects an upcoming request's
|
||||
// estimated input-token cost and may force a guardrail action (compaction,
|
||||
// hold, etc.) before the request is dispatched. The mock surfaces request
|
||||
// captures BEFORE the response stream is opened, which is exactly the seam
|
||||
// the capacity controller observes — so the trait-level test is to verify
|
||||
// that the captured request is observable per-call (not buffered across
|
||||
// calls).
|
||||
|
||||
#[tokio::test]
|
||||
async fn capacity_gate_can_observe_request_before_response_streams() {
|
||||
let turn = vec![canned::simple_text_turn("ok")];
|
||||
let mock = MockLlmClient::new(turn);
|
||||
|
||||
// Build a "near-limit" request — many user messages.
|
||||
let mut messages = Vec::new();
|
||||
for i in 0..200 {
|
||||
messages.push(user_message(&format!("m{i}")));
|
||||
}
|
||||
let req = make_request(messages);
|
||||
|
||||
// BEFORE the runtime drains the stream, the mock has already captured
|
||||
// the request. The capacity controller can inspect this and short-circuit
|
||||
// the dispatch if the estimated token cost exceeds the soft cap.
|
||||
let stream_future = mock.create_message_stream(req);
|
||||
let mut stream = stream_future.await.unwrap();
|
||||
|
||||
assert_eq!(mock.captured_requests().len(), 1);
|
||||
let captured = mock.last_request().unwrap();
|
||||
assert_eq!(captured.messages.len(), 200);
|
||||
// Verify the capacity gate could compute a "should defer" decision based
|
||||
// on raw message count + payload size of the captured request.
|
||||
let total_chars: usize = captured
|
||||
.messages
|
||||
.iter()
|
||||
.flat_map(|m| m.content.iter())
|
||||
.map(|b| match b {
|
||||
ContentBlock::Text { text, .. } => text.len(),
|
||||
_ => 0,
|
||||
})
|
||||
.sum();
|
||||
assert!(
|
||||
total_chars > 100,
|
||||
"synthetic over-cap request should have non-trivial size"
|
||||
);
|
||||
|
||||
// Drain to keep the mock state consistent.
|
||||
while stream.next().await.is_some() {}
|
||||
}
|
||||
|
||||
// === 8. BLOCKED: full engine integration ====================================
|
||||
//
|
||||
// These tests exercise the engine's turn loop end-to-end. They cannot run
|
||||
// today because `core::engine::Engine` holds a concrete `Option<DeepSeekClient>`
|
||||
// and there is no constructor seam to inject `Arc<dyn LlmClient>`. Once the
|
||||
// engine is refactored to take a trait object (or generic), drop the
|
||||
// `#[ignore]` and these tests light up.
|
||||
//
|
||||
// TODO(v0.7.0 follow-up issue): refactor engine + tools::registry +
|
||||
// rlm::bridge + tools::review + tools::subagent + cycle_manager + compaction
|
||||
// to take `Arc<dyn LlmClient>` instead of `Option<DeepSeekClient>`. Then the
|
||||
// mock plugs in directly and these `#[ignore]`s come off.
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "blocked: engine takes concrete DeepSeekClient; needs Arc<dyn LlmClient> refactor"]
|
||||
async fn engine_full_turn_loop_with_compaction_and_resume() {
|
||||
// Once the refactor lands:
|
||||
// 1. Build a session with N messages exceeding the compaction threshold.
|
||||
// 2. Inject a MockLlmClient with one canned compaction-summary response
|
||||
// and one canned post-compaction assistant turn.
|
||||
// 3. Drive a turn through the engine and assert the session resumes
|
||||
// cleanly with the summary message in place.
|
||||
//
|
||||
// The cycle_manager path replaces high-level compaction in v0.6.6+; this
|
||||
// test should target whichever path is enabled by the test config.
|
||||
unreachable!("ignored");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "blocked: engine takes concrete DeepSeekClient; needs Arc<dyn LlmClient> refactor"]
|
||||
async fn engine_full_sub_agent_spawn_round_trip() {
|
||||
// Once the refactor lands:
|
||||
// 1. Inject MockLlmClient as the parent client AND wire the subagent
|
||||
// runtime to receive its own MockLlmClient.
|
||||
// 2. Parent emits agent_spawn tool_call; child runs through the v0.6.7
|
||||
// mailbox and replies with text.
|
||||
// 3. Assert the final assistant text bubbles back to the parent session.
|
||||
unreachable!("ignored");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "blocked: engine takes concrete DeepSeekClient; needs Arc<dyn LlmClient> refactor"]
|
||||
async fn engine_full_parallel_tool_execution() {
|
||||
// Once the refactor lands:
|
||||
// 1. Mock turn 1 returns two tool_calls in a single round.
|
||||
// 2. Engine executes them in parallel via FuturesUnordered.
|
||||
// 3. Assert ordered ToolResult messages are appended to the next request.
|
||||
unreachable!("ignored");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "blocked: engine takes concrete DeepSeekClient; needs Arc<dyn LlmClient> refactor"]
|
||||
async fn engine_capacity_controller_forces_compaction_at_threshold() {
|
||||
// Once the refactor lands:
|
||||
// 1. Inject a long history near the V4 soft cap.
|
||||
// 2. Assert the capacity controller emits a forced-compaction guardrail
|
||||
// BEFORE dispatching the LLM call.
|
||||
// 3. Verify the mock's call_count() reflects the observed sequence.
|
||||
unreachable!("ignored");
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
//! Test-only mirror of the production `llm_client` module surface.
|
||||
//!
|
||||
//! The integration test under `tests/integration_mock_llm.rs` includes this
|
||||
//! file as `mod llm_client` and `mock.rs` as the nested submodule. Doing it
|
||||
//! this way means `mock.rs`'s `super::{LlmClient, StreamEventBox}` paths
|
||||
//! resolve cleanly — they refer to the trait + alias declared right here.
|
||||
//!
|
||||
//! The trait shape MUST stay 1:1 with the real one in
|
||||
//! `crates/tui/src/llm_client/mod.rs`. If the production trait grows a method,
|
||||
//! mirror it here so `mock.rs` (the same source file shipped in the binary)
|
||||
//! still satisfies it.
|
||||
|
||||
use anyhow::Result;
|
||||
use std::pin::Pin;
|
||||
|
||||
use crate::models::{MessageRequest, MessageResponse, StreamEvent};
|
||||
|
||||
pub type StreamEventBox =
|
||||
Pin<Box<dyn futures_util::Stream<Item = Result<StreamEvent>> + Send + 'static>>;
|
||||
|
||||
#[allow(async_fn_in_trait, dead_code)]
|
||||
pub trait LlmClient: Send + Sync {
|
||||
fn provider_name(&self) -> &'static str;
|
||||
fn model(&self) -> &str;
|
||||
async fn create_message(&self, request: MessageRequest) -> Result<MessageResponse>;
|
||||
async fn create_message_stream(&self, request: MessageRequest) -> Result<StreamEventBox>;
|
||||
async fn health_check(&self) -> Result<bool> {
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
#[path = "../../src/llm_client/mock.rs"]
|
||||
pub mod mock;
|
||||
Reference in New Issue
Block a user