codewhale/crates/tui/src/eval.rs

//! Offline evaluation harness for exercising representative tool loops.
//!
//! This module is intentionally self-contained so it can be wired into a CLI
//! command later without calling the network or any LLM endpoints.

use anyhow::{Context, Result, anyhow};
use ignore::WalkBuilder;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
use tempfile::TempDir;

#[cfg(test)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EvalShellPlatform {
    Windows,
    Unix,
}

#[cfg(test)]
#[derive(Debug, Clone, PartialEq, Eq)]
struct EvalShellInvocation {
    program: &'static str,
    args: Vec<String>,
    raw_payload_on_windows: bool,
}

#[cfg(test)]
fn eval_shell_invocation_for_platform(
    command: &str,
    platform: EvalShellPlatform,
) -> EvalShellInvocation {
    match platform {
        EvalShellPlatform::Windows => EvalShellInvocation {
            program: "cmd",
            args: vec!["/C".to_string(), command.to_string()],
            raw_payload_on_windows: true,
        },
        EvalShellPlatform::Unix => EvalShellInvocation {
            program: "sh",
            args: vec!["-c".to_string(), command.to_string()],
            raw_payload_on_windows: false,
        },
    }
}

/// Representative tool steps covered by the evaluation harness.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
pub enum ScenarioStepKind {
    List,
    Read,
    Search,
    Edit,
    ApplyPatch,
    ExecShell,
}

impl ScenarioStepKind {
    /// Tool name associated with this step.
    pub fn tool_name(self) -> &'static str {
        match self {
            ScenarioStepKind::List => "list_dir",
            ScenarioStepKind::Read => "read_file",
            ScenarioStepKind::Search => "search",
            ScenarioStepKind::Edit => "edit_file",
            ScenarioStepKind::ApplyPatch => "apply_patch",
            ScenarioStepKind::ExecShell => "exec_shell",
        }
    }

    /// Parse a step kind from CLI-friendly strings.
    pub fn parse(value: &str) -> Option<Self> {
        match value.trim().to_lowercase().as_str() {
            "list" | "list_dir" => Some(Self::List),
            "read" | "read_file" => Some(Self::Read),
            "search" | "grep" | "grep_files" => Some(Self::Search),
            "edit" | "edit_file" => Some(Self::Edit),
            "patch" | "apply_patch" => Some(Self::ApplyPatch),
            "shell" | "exec_shell" | "exec" => Some(Self::ExecShell),
            _ => None,
        }
    }
}

/// Aggregate statistics for a single tool kind.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)]
pub struct ToolStats {
    pub invocations: usize,
    pub errors: usize,
    pub total_duration: Duration,
}

/// Top-level metrics produced by an evaluation run.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct EvalMetrics {
    pub success: bool,
    pub tool_errors: usize,
    pub steps: usize,
    pub duration: Duration,
    pub per_tool: BTreeMap<ScenarioStepKind, ToolStats>,
}

/// One tool invocation recorded by the harness.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct EvalStep {
    pub kind: ScenarioStepKind,
    pub tool_name: &'static str,
    pub success: bool,
    pub duration: Duration,
    pub error: Option<String>,
    pub output: Option<String>,
}

/// Summary of the generated temporary workspace.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct WorkspaceSummary {
    pub root: PathBuf,
    pub file_count: usize,
    pub files: Vec<PathBuf>,
}

/// Configuration for the offline evaluation harness.
#[derive(Debug, Clone)]
pub struct EvalHarnessConfig {
    /// Human-readable scenario name for reporting.
    pub scenario_name: String,
    /// If set, the harness will intentionally fail this step to test metrics.
    pub fail_step: Option<ScenarioStepKind>,
    /// Shell command executed during the `exec_shell` step.
    pub shell_command: String,
    /// Token that must appear in shell output for validation.
    pub shell_expect_token: String,
    /// Maximum characters stored for step output summaries.
    pub max_output_chars: usize,
    /// When set, every step is appended as a JSON Lines fixture to a file
    /// inside this directory. The fixture file is named after the scenario
    /// (e.g. `offline-tool-loop.jsonl`). Each line follows the schema:
    /// `{ "request": <step descriptor>, "response_events": [<events>] }`.
    /// The mock LLM client (`crate::llm_client::mock`) can replay these
    /// fixtures for deterministic offline tests. See
    /// `crates/tui/tests/README.md` for the full record/replay flow.
    pub record_dir: Option<PathBuf>,
}

impl Default for EvalHarnessConfig {
    fn default() -> Self {
        let shell_command = if cfg!(windows) {
            "echo eval-harness".to_string()
        } else {
            "printf eval-harness".to_string()
        };
        Self {
            scenario_name: "offline-tool-loop".to_string(),
            fail_step: None,
            shell_command,
            shell_expect_token: "eval-harness".to_string(),
            max_output_chars: 240,
            record_dir: None,
        }
    }
}

/// Offline harness that exercises representative tool loops in a temp workspace.
#[derive(Debug, Clone)]
pub struct EvalHarness {
    config: EvalHarnessConfig,
}

impl EvalHarness {
    /// Create a new harness with the provided configuration.
    pub fn new(config: EvalHarnessConfig) -> Self {
        Self { config }
    }

    /// Execute the offline evaluation scenario and return detailed results.
    pub fn run(&self) -> Result<EvalRun> {
        let started_at = Instant::now();
        let workspace = tempfile::Builder::new()
            .prefix("deepseek-eval-")
            .tempdir()
            .context("failed to create evaluation workspace")?;

        let seed = seed_workspace(workspace.path())?;

        let mut steps = Vec::new();
        let mut per_tool: BTreeMap<ScenarioStepKind, ToolStats> = BTreeMap::new();

        let list_output = self.run_step(ScenarioStepKind::List, &mut steps, &mut per_tool, || {
            let entries = list_dir(workspace.path())?;
            Ok(entries.join(", "))
        });

        let _read_output = self.run_step(ScenarioStepKind::Read, &mut steps, &mut per_tool, || {
            let path = if self.config.fail_step == Some(ScenarioStepKind::Read) {
                workspace.path().join("missing.txt")
            } else {
                seed.notes_path.clone()
            };
            read_file(&path)
        });

        let search_output =
            self.run_step(ScenarioStepKind::Search, &mut steps, &mut per_tool, || {
                let root = if self.config.fail_step == Some(ScenarioStepKind::Search) {
                    workspace.path().join("missing-dir")
                } else {
                    workspace.path().to_path_buf()
                };
                let result = search_files(&root, "offline")?;
                Ok(format!("matches={}", result.matches.len()))
            });

        let edit_output = self.run_step(ScenarioStepKind::Edit, &mut steps, &mut per_tool, || {
            let path = if self.config.fail_step == Some(ScenarioStepKind::Edit) {
                workspace.path().join("missing.txt")
            } else {
                seed.notes_path.clone()
            };
            edit_file_append(&path, "edited = true")?;
            Ok("appended line".to_string())
        });

        let patch_output = self.run_step(
            ScenarioStepKind::ApplyPatch,
            &mut steps,
            &mut per_tool,
            || {
                let patch = if self.config.fail_step == Some(ScenarioStepKind::ApplyPatch) {
                    "*** Begin Patch\n*** Update File: notes.txt\n@@\n-THIS LINE DOES NOT EXIST\n+broken\n*** End Patch\n"
                        .to_string()
                } else {
                    "*** Begin Patch\n*** Update File: notes.txt\n@@\n status = \"draft\"\n-todo: offline metrics\n+todo: offline metrics (patched)\n*** End Patch\n"
                        .to_string()
                };
                apply_patch(workspace.path(), &patch)?;
                Ok("patch applied".to_string())
            },
        );

        let shell_output = self.run_step(
            ScenarioStepKind::ExecShell,
            &mut steps,
            &mut per_tool,
            || {
                let command = if self.config.fail_step == Some(ScenarioStepKind::ExecShell) {
                    "command_that_does_not_exist".to_string()
                } else {
                    self.config.shell_command.clone()
                };
                exec_shell(workspace.path(), &command)
            },
        );

        let duration = started_at.elapsed();

        let workspace_summary = summarize_workspace(workspace.path(), list_output.as_deref())?;

        let validation_success = validate_outputs(
            workspace.path(),
            &self.config.shell_expect_token,
            search_output.as_deref(),
            edit_output.as_deref(),
            patch_output.as_deref(),
            shell_output.as_deref(),
        );

        let tool_errors = steps.iter().filter(|s| !s.success).count();
        let success = tool_errors == 0 && validation_success;

        let metrics = EvalMetrics {
            success,
            tool_errors,
            steps: steps.len(),
            duration,
            per_tool,
        };

        Ok(EvalRun {
            scenario_name: self.config.scenario_name.clone(),
            workspace,
            workspace_summary,
            metrics,
            steps,
        })
    }

    fn run_step<T, F>(
        &self,
        kind: ScenarioStepKind,
        steps: &mut Vec<EvalStep>,
        per_tool: &mut BTreeMap<ScenarioStepKind, ToolStats>,
        f: F,
    ) -> Option<T>
    where
        F: FnOnce() -> Result<T>,
        T: ToString,
    {
        let started_at = Instant::now();
        let result = f();
        let duration = started_at.elapsed();

        let stats = per_tool.entry(kind).or_default();
        stats.invocations += 1;
        stats.total_duration += duration;

        match result {
            Ok(value) => {
                let output = truncate_output(&value.to_string(), self.config.max_output_chars);
                steps.push(EvalStep {
                    kind,
                    tool_name: kind.tool_name(),
                    success: true,
                    duration,
                    error: None,
                    output: Some(output.clone()),
                });
                if let Some(dir) = self.config.record_dir.as_deref() {
                    let _ = record_fixture(
                        dir,
                        &self.config.scenario_name,
                        FixtureRecord::ok(kind, &output),
                    );
                }
                Some(value)
            }
            Err(err) => {
                stats.errors += 1;
                let err_str = err.to_string();
                steps.push(EvalStep {
                    kind,
                    tool_name: kind.tool_name(),
                    success: false,
                    duration,
                    error: Some(err_str.clone()),
                    output: None,
                });
                if let Some(dir) = self.config.record_dir.as_deref() {
                    let _ = record_fixture(
                        dir,
                        &self.config.scenario_name,
                        FixtureRecord::err(kind, &err_str),
                    );
                }
                None
            }
        }
    }
}

// === Fixture record/replay format ===========================================
//
// The `--record` flag writes one JSON object per line to a `.jsonl` file:
//
//     { "request": { "step": "list_dir", "kind": "List" },
//       "response_events": [{ "type": "ok", "output": "…" }] }
//
// The mock LLM client replays these fixtures via
// `MockLlmClient::push_message_response` (or the streaming variant) by mapping
// each `response_events` array onto a canned `Vec<StreamEvent>`.
//
// This format is intentionally minimal — additional fields (timing, model,
// usage) can be added without breaking older fixtures because each line is a
// self-contained JSON object.

/// Schema for one line of a `--record` JSONL fixture file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FixtureRecord {
    /// Step descriptor (`{ step, kind }`).
    pub request: serde_json::Value,
    /// One or more synthetic response events.
    pub response_events: Vec<serde_json::Value>,
}

impl FixtureRecord {
    fn ok(kind: ScenarioStepKind, output: &str) -> Self {
        Self {
            request: serde_json::json!({
                "step": kind.tool_name(),
                "kind": format!("{kind:?}"),
            }),
            response_events: vec![serde_json::json!({
                "type": "ok",
                "output": output,
            })],
        }
    }

    fn err(kind: ScenarioStepKind, error: &str) -> Self {
        Self {
            request: serde_json::json!({
                "step": kind.tool_name(),
                "kind": format!("{kind:?}"),
            }),
            response_events: vec![serde_json::json!({
                "type": "error",
                "error": error,
            })],
        }
    }
}

/// Append one fixture record to `<dir>/<scenario>.jsonl` (creating dir + file
/// if missing). Best-effort: I/O errors are returned but generally ignored by
/// the harness so a recording failure does not mask the run's primary result.
pub fn record_fixture(dir: &Path, scenario_name: &str, record: FixtureRecord) -> Result<PathBuf> {
    fs::create_dir_all(dir)
        .with_context(|| format!("failed to create fixture dir: {}", dir.display()))?;
    let safe_scenario = scenario_name
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '_'
            }
        })
        .collect::<String>();
    let path = dir.join(format!("{safe_scenario}.jsonl"));
    let line = serde_json::to_string(&record).context("failed to serialize fixture record")?;

    let mut file = fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&path)
        .with_context(|| format!("failed to open fixture file: {}", path.display()))?;
    writeln!(file, "{line}")
        .with_context(|| format!("failed to write fixture line to {}", path.display()))?;
    Ok(path)
}

impl Default for EvalHarness {
    fn default() -> Self {
        Self::new(EvalHarnessConfig::default())
    }
}

/// Result of running the evaluation harness.
#[derive(Debug)]
pub struct EvalRun {
    pub scenario_name: String,
    workspace: TempDir,
    pub workspace_summary: WorkspaceSummary,
    pub metrics: EvalMetrics,
    pub steps: Vec<EvalStep>,
}

impl EvalRun {
    /// Get the root of the temporary workspace.
    pub fn workspace_root(&self) -> &Path {
        self.workspace.path()
    }

    /// Convert the run into a serializable report for CLI output.
    pub fn to_report(&self) -> EvalReport {
        EvalReport {
            scenario_name: self.scenario_name.clone(),
            workspace_root: self.workspace_root().to_path_buf(),
            workspace_summary: self.workspace_summary.clone(),
            metrics: self.metrics.clone(),
            steps: self.steps.clone(),
        }
    }
}

/// Serializable report derived from an `EvalRun`.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EvalReport {
    pub scenario_name: String,
    pub workspace_root: PathBuf,
    pub workspace_summary: WorkspaceSummary,
    pub metrics: EvalMetrics,
    pub steps: Vec<EvalStep>,
}

#[derive(Debug, Clone)]
struct SeedWorkspace {
    notes_path: PathBuf,
}

fn seed_workspace(root: &Path) -> Result<SeedWorkspace> {
    let src_dir = root.join("src");
    fs::create_dir_all(&src_dir)
        .with_context(|| format!("failed to create seed directory: {}", src_dir.display()))?;

    let readme_path = root.join("README.md");
    fs::write(
        &readme_path,
        "# Eval Harness Workspace\n\nThis workspace is offline.\n",
    )
    .with_context(|| format!("failed to write {}", readme_path.display()))?;

    let notes_path = root.join("notes.txt");
    fs::write(
        &notes_path,
        "# Eval Harness\nstatus = \"draft\"\ntodo: offline metrics\n",
    )
    .with_context(|| format!("failed to write {}", notes_path.display()))?;

    let lib_path = src_dir.join("lib.rs");
    fs::write(
        &lib_path,
        "pub fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n",
    )
    .with_context(|| format!("failed to write {}", lib_path.display()))?;

    Ok(SeedWorkspace { notes_path })
}

fn summarize_workspace(root: &Path, list_output: Option<&str>) -> Result<WorkspaceSummary> {
    let mut files = Vec::new();

    let walker = WalkBuilder::new(root)
        .hidden(false)
        .git_ignore(false)
        .git_global(false)
        .git_exclude(false)
        .build();

    for entry in walker {
        let entry = entry.with_context(|| format!("failed to walk {}", root.display()))?;
        if entry.file_type().is_some_and(|t| t.is_file()) {
            files.push(entry.into_path());
        }
    }

    if files.is_empty()
        && let Some(output) = list_output
        && !output.trim().is_empty()
    {
        return Err(anyhow!(
            "workspace appears empty after list_dir: {}",
            output.trim()
        ));
    }

    files.sort();

    Ok(WorkspaceSummary {
        root: root.to_path_buf(),
        file_count: files.len(),
        files,
    })
}

fn validate_outputs(
    root: &Path,
    shell_expect_token: &str,
    search_output: Option<&str>,
    edit_output: Option<&str>,
    patch_output: Option<&str>,
    shell_output: Option<&str>,
) -> bool {
    let notes_path = root.join("notes.txt");
    let notes = match fs::read_to_string(&notes_path) {
        Ok(content) => content,
        Err(_) => return false,
    };

    let search_ok = search_output.is_some_and(|s| s.contains("matches="));
    let edit_ok = edit_output.is_some_and(|s| !s.is_empty()) && notes.contains("edited = true");
    let patch_ok = patch_output.is_some_and(|s| !s.is_empty())
        && notes.contains("todo: offline metrics (patched)");
    let shell_ok = shell_output
        .map(str::trim)
        .is_some_and(|s| s.contains(shell_expect_token));

    search_ok && edit_ok && patch_ok && shell_ok
}

fn list_dir(path: &Path) -> Result<Vec<String>> {
    let mut entries = Vec::new();
    let dir = fs::read_dir(path)
        .with_context(|| format!("failed to read directory: {}", path.display()))?;

    for entry in dir {
        let entry = entry.with_context(|| format!("failed to list {}", path.display()))?;
        entries.push(entry.file_name().to_string_lossy().to_string());
    }

    entries.sort();
    Ok(entries)
}

fn read_file(path: &Path) -> Result<String> {
    fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))
}

#[derive(Debug, Clone, PartialEq, Eq)]
struct SearchMatch {
    path: PathBuf,
    line: usize,
    content: String,
}

#[derive(Debug, Clone, PartialEq, Eq)]
struct SearchResult {
    matches: Vec<SearchMatch>,
}

fn search_files(root: &Path, pattern: &str) -> Result<SearchResult> {
    if !root.exists() {
        return Err(anyhow!("search root does not exist: {}", root.display()));
    }

    let regex = Regex::new(pattern).context("failed to compile search regex")?;
    let mut matches = Vec::new();

    let walker = WalkBuilder::new(root)
        .hidden(false)
        .git_ignore(false)
        .git_global(false)
        .git_exclude(false)
        .build();

    for entry in walker {
        let entry = entry.with_context(|| format!("failed to walk {}", root.display()))?;
        if !entry.file_type().is_some_and(|t| t.is_file()) {
            continue;
        }

        let path = entry.path();
        let content = match fs::read_to_string(path) {
            Ok(c) => c,
            Err(_) => continue,
        };

        for (idx, line) in content.lines().enumerate() {
            if regex.is_match(line) {
                matches.push(SearchMatch {
                    path: path.to_path_buf(),
                    line: idx + 1,
                    content: line.to_string(),
                });
            }
            if matches.len() >= 64 {
                break;
            }
        }
        if matches.len() >= 64 {
            break;
        }
    }

    Ok(SearchResult { matches })
}

fn edit_file_append(path: &Path, line: &str) -> Result<()> {
    let mut content = read_file(path)?;
    if !content.ends_with('\n') {
        content.push('\n');
    }
    content.push_str(line);
    content.push('\n');
    fs::write(path, content).with_context(|| format!("failed to write {}", path.display()))
}

fn apply_patch(root: &Path, patch: &str) -> Result<()> {
    let mut lines = patch.lines();

    let begin = lines.next().unwrap_or_default();
    if begin != "*** Begin Patch" {
        return Err(anyhow!("patch missing *** Begin Patch header"));
    }

    let header = lines.next().unwrap_or_default();
    let file_rel = header
        .strip_prefix("*** Update File: ")
        .ok_or_else(|| anyhow!("only *** Update File patches are supported"))?;
    if file_rel.contains("..") {
        return Err(anyhow!("patch path must be workspace-relative"));
    }

    let file_path = root.join(file_rel);
    let original = read_file(&file_path)?;
    let had_trailing_newline = original.ends_with('\n');
    let mut file_lines: Vec<String> = original.lines().map(|l| l.to_string()).collect();

    let mut cursor = 0usize;
    for raw_line in lines {
        if raw_line == "*** End Patch" {
            break;
        }
        if raw_line.starts_with("*** ") {
            return Err(anyhow!("unexpected patch directive: {raw_line}"));
        }
        if raw_line.starts_with("@@") {
            continue;
        }

        let (kind, rest) = raw_line.split_at(1);
        let content = rest.to_string();

        match kind {
            " " => {
                let Some(found) = file_lines[cursor..]
                    .iter()
                    .position(|line| line == &content)
                    .map(|offset| cursor + offset)
                else {
                    return Err(anyhow!(
                        "patch context not found in {}: {}",
                        file_path.display(),
                        content
                    ));
                };
                cursor = found + 1;
            }
            "-" => {
                if cursor >= file_lines.len() || file_lines[cursor] != content {
                    return Err(anyhow!(
                        "patch removal mismatch in {}: expected '{}'",
                        file_path.display(),
                        content
                    ));
                }
                file_lines.remove(cursor);
            }
            "+" => {
                file_lines.insert(cursor, content);
                cursor += 1;
            }
            _ => return Err(anyhow!("unsupported patch line: {raw_line}")),
        }
    }

    let mut updated = file_lines.join("\n");
    if had_trailing_newline {
        updated.push('\n');
    }

    fs::write(&file_path, updated)
        .with_context(|| format!("failed to write patched file {}", file_path.display()))
}

fn exec_shell(root: &Path, command: &str) -> Result<String> {
    crate::shell_dispatcher::global_dispatcher().run_foreground(command, root)
}

fn truncate_output(value: &str, max_chars: usize) -> String {
    if value.chars().count() <= max_chars {
        return value.to_string();
    }

    let truncated: String = value.chars().take(max_chars).collect();
    format!("{truncated}...")
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn eval_shell_invocation_preserves_quoted_payload_as_single_arg() {
        let command = r#"git commit -m "feat: complete sub-pages""#;

        let windows = eval_shell_invocation_for_platform(command, EvalShellPlatform::Windows);
        assert_eq!(windows.program, "cmd");
        assert_eq!(windows.args, vec!["/C".to_string(), command.to_string()]);
        assert!(windows.raw_payload_on_windows);

        let unix = eval_shell_invocation_for_platform(command, EvalShellPlatform::Unix);
        assert_eq!(unix.program, "sh");
        assert_eq!(unix.args, vec!["-c".to_string(), command.to_string()]);
        assert!(!unix.raw_payload_on_windows);
    }
}