220 lines
7.2 KiB
Rust
220 lines
7.2 KiB
Rust
//! Integration tests for the offline evaluation harness.
|
|
|
|
use std::fs;
|
|
|
|
use tempfile::tempdir;
|
|
|
|
#[path = "../src/eval.rs"]
|
|
mod eval;
|
|
#[path = "../src/shell_dispatcher.rs"]
|
|
mod shell_dispatcher;
|
|
|
|
use eval::{EvalHarness, EvalHarnessConfig, FixtureRecord, ScenarioStepKind};
|
|
|
|
const HAPPY_PATH_TOOL_LOOP: [ScenarioStepKind; 6] = [
|
|
ScenarioStepKind::List,
|
|
ScenarioStepKind::Read,
|
|
ScenarioStepKind::Search,
|
|
ScenarioStepKind::Edit,
|
|
ScenarioStepKind::ApplyPatch,
|
|
ScenarioStepKind::ExecShell,
|
|
];
|
|
|
|
#[test]
|
|
fn runs_offline_tool_loop_successfully() {
|
|
let harness = EvalHarness::default();
|
|
let run = harness.run().expect("eval harness run should succeed");
|
|
assert_eq!(
|
|
ScenarioStepKind::parse("patch"),
|
|
Some(ScenarioStepKind::ApplyPatch)
|
|
);
|
|
|
|
assert!(run.metrics.success, "expected success metrics: {run:#?}");
|
|
assert_eq!(run.metrics.tool_errors, 0);
|
|
assert_eq!(run.metrics.steps, 6);
|
|
assert!(run.metrics.duration.as_millis() > 0);
|
|
assert!(!run.scenario_name.is_empty());
|
|
assert!(run.workspace_summary.file_count >= 3);
|
|
|
|
for kind in HAPPY_PATH_TOOL_LOOP {
|
|
let stats = run
|
|
.metrics
|
|
.per_tool
|
|
.get(&kind)
|
|
.expect("missing per-tool stats");
|
|
assert_eq!(stats.invocations, 1, "unexpected invocations for {kind:?}");
|
|
assert_eq!(stats.errors, 0, "unexpected errors for {kind:?}");
|
|
assert!(stats.total_duration.as_nanos() > 0);
|
|
}
|
|
|
|
let notes_path = run.workspace_root().join("notes.txt");
|
|
let notes = fs::read_to_string(¬es_path).expect("notes.txt should exist");
|
|
assert!(notes.contains("edited = true"));
|
|
assert!(notes.contains("todo: offline metrics (patched)"));
|
|
|
|
let report = run.to_report();
|
|
assert_eq!(report.metrics.success, run.metrics.success);
|
|
}
|
|
|
|
#[test]
|
|
fn acceptance_happy_path_records_simulated_llm_tool_plan() {
|
|
let record_dir = tempdir().expect("tempdir");
|
|
let scenario_name = "issue-2791-happy-path-tool-loop";
|
|
let config = EvalHarnessConfig {
|
|
scenario_name: scenario_name.to_string(),
|
|
record_dir: Some(record_dir.path().to_path_buf()),
|
|
..EvalHarnessConfig::default()
|
|
};
|
|
let harness = EvalHarness::new(config);
|
|
|
|
let run = harness.run().expect("happy-path acceptance run");
|
|
|
|
assert!(run.metrics.success, "expected success metrics: {run:#?}");
|
|
assert_eq!(run.metrics.tool_errors, 0);
|
|
assert_eq!(run.metrics.steps, HAPPY_PATH_TOOL_LOOP.len());
|
|
|
|
let actual_tool_names: Vec<&str> = run.steps.iter().map(|step| step.tool_name).collect();
|
|
let expected_tool_names: Vec<&str> = HAPPY_PATH_TOOL_LOOP
|
|
.iter()
|
|
.map(|kind| kind.tool_name())
|
|
.collect();
|
|
assert_eq!(actual_tool_names, expected_tool_names);
|
|
|
|
let scenario_file = record_dir.path().join(format!("{scenario_name}.jsonl"));
|
|
let records = read_fixture_records(&scenario_file);
|
|
assert_eq!(records.len(), HAPPY_PATH_TOOL_LOOP.len());
|
|
|
|
for (record, kind) in records.iter().zip(HAPPY_PATH_TOOL_LOOP) {
|
|
assert_eq!(
|
|
record.request.get("step").and_then(|value| value.as_str()),
|
|
Some(kind.tool_name())
|
|
);
|
|
|
|
let expected_kind = format!("{kind:?}");
|
|
assert_eq!(
|
|
record.request.get("kind").and_then(|value| value.as_str()),
|
|
Some(expected_kind.as_str())
|
|
);
|
|
|
|
let event = record
|
|
.response_events
|
|
.first()
|
|
.expect("simulated LLM fixture should include a response event");
|
|
assert_eq!(
|
|
event.get("type").and_then(|value| value.as_str()),
|
|
Some("ok")
|
|
);
|
|
assert!(
|
|
event
|
|
.get("output")
|
|
.and_then(|value| value.as_str())
|
|
.is_some_and(|output| !output.is_empty()),
|
|
"fixture event should include non-empty tool output"
|
|
);
|
|
}
|
|
|
|
let notes_path = run.workspace_root().join("notes.txt");
|
|
let notes = fs::read_to_string(¬es_path).expect("notes.txt should exist");
|
|
assert!(notes.contains("edited = true"));
|
|
assert!(notes.contains("todo: offline metrics (patched)"));
|
|
}
|
|
|
|
fn read_fixture_records(path: &std::path::Path) -> Vec<FixtureRecord> {
|
|
fs::read_to_string(path)
|
|
.expect("read fixture records")
|
|
.lines()
|
|
.filter(|line| !line.trim().is_empty())
|
|
.map(|line| serde_json::from_str(line).expect("fixture line should parse"))
|
|
.collect()
|
|
}
|
|
|
|
#[test]
|
|
fn records_tool_errors_when_step_fails() {
|
|
let config = EvalHarnessConfig {
|
|
fail_step: Some(ScenarioStepKind::ApplyPatch),
|
|
..EvalHarnessConfig::default()
|
|
};
|
|
let harness = EvalHarness::new(config);
|
|
|
|
let run = harness
|
|
.run()
|
|
.expect("eval harness should return metrics even when a step fails");
|
|
|
|
assert!(!run.metrics.success);
|
|
assert!(run.metrics.tool_errors >= 1);
|
|
|
|
let patch_stats = run
|
|
.metrics
|
|
.per_tool
|
|
.get(&ScenarioStepKind::ApplyPatch)
|
|
.expect("missing apply_patch stats");
|
|
assert_eq!(patch_stats.invocations, 1);
|
|
assert_eq!(patch_stats.errors, 1);
|
|
|
|
let patch_step = run
|
|
.steps
|
|
.iter()
|
|
.find(|step| step.kind == ScenarioStepKind::ApplyPatch)
|
|
.expect("missing apply_patch step");
|
|
assert!(!patch_step.success);
|
|
assert!(patch_step.error.as_deref().is_some_and(|e| !e.is_empty()));
|
|
}
|
|
|
|
#[test]
|
|
fn validation_can_fail_without_tool_errors() {
|
|
let config = EvalHarnessConfig {
|
|
shell_expect_token: "definitely-not-in-output".to_string(),
|
|
..EvalHarnessConfig::default()
|
|
};
|
|
let harness = EvalHarness::new(config);
|
|
|
|
let run = harness.run().expect("eval harness run should complete");
|
|
|
|
assert_eq!(run.metrics.tool_errors, 0);
|
|
assert!(
|
|
!run.metrics.success,
|
|
"validation should fail due to shell token"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn record_flag_writes_one_jsonl_line_per_step() {
|
|
let dir = tempdir().expect("tempdir");
|
|
let config = EvalHarnessConfig {
|
|
record_dir: Some(dir.path().to_path_buf()),
|
|
..EvalHarnessConfig::default()
|
|
};
|
|
let harness = EvalHarness::new(config);
|
|
let run = harness.run().expect("eval harness run should succeed");
|
|
|
|
let scenario_file = dir.path().join("offline-tool-loop.jsonl");
|
|
assert!(
|
|
scenario_file.exists(),
|
|
"record_dir should contain {}",
|
|
scenario_file
|
|
.file_name()
|
|
.map(|n| n.to_string_lossy().into_owned())
|
|
.unwrap_or_default(),
|
|
);
|
|
|
|
let contents = fs::read_to_string(&scenario_file).expect("read jsonl");
|
|
let lines: Vec<&str> = contents.lines().filter(|l| !l.trim().is_empty()).collect();
|
|
assert_eq!(
|
|
lines.len(),
|
|
run.metrics.steps,
|
|
"one JSONL line per step expected"
|
|
);
|
|
|
|
// Each line is a self-contained JSON object with the documented schema.
|
|
for line in lines {
|
|
let parsed: serde_json::Value =
|
|
serde_json::from_str(line).expect("each fixture line is valid JSON");
|
|
assert!(parsed.get("request").is_some(), "missing request");
|
|
let events = parsed
|
|
.get("response_events")
|
|
.and_then(|v| v.as_array())
|
|
.expect("response_events must be an array");
|
|
assert!(!events.is_empty(), "every fixture must have ≥1 event");
|
|
}
|
|
}
|