From 0cd8bcde1b268d7bb9bb829fe01a50496034732d Mon Sep 17 00:00:00 2001
From: idling11 <8055620+idling11@users.noreply.github.com>
Date: Fri, 12 Jun 2026 01:15:00 -0700
Subject: [PATCH] feat(bench): add CLI comparison harness

Harvest #3009 for the v0.8.59 release lane. Adds a paired Terminal-Bench harness for CodeWhale and Codex, a Codex Harbor adapter, generated-result ignore protection, and benchmark docs.

Maintainer amendments keep explicit zero-valued metrics, regenerate parent task names, write refreshed summaries in regenerate mode, and allow transcript paths outside the repo.

Fixes #2952.
---
 .gitignore                               |   2 +
 benchmark_results/.gitkeep               |   0
 docs/BENCHMARKS.md                       |  17 +
 scripts/benchmarks/README.md             |   7 +
 scripts/benchmarks/cli-compare.py        | 602 +++++++++++++++++++++++
 scripts/benchmarks/harbor/codex_agent.py | 126 +++++
 6 files changed, 754 insertions(+)
 create mode 100644 benchmark_results/.gitkeep
 create mode 100755 scripts/benchmarks/cli-compare.py
 create mode 100755 scripts/benchmarks/harbor/codex_agent.py

diff --git a/.gitignore b/.gitignore
index ab1cf906..50ba4f49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -123,4 +123,6 @@ scripts/run_deep_swe.py
 
 # Benchmark artifacts and caches re-included by !scripts/**
 results/
+benchmark_results/*
+!benchmark_results/.gitkeep
 scripts/**/__pycache__/
diff --git a/benchmark_results/.gitkeep b/benchmark_results/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
index 390e2a02..41c1ccb9 100644
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@@ -103,6 +103,23 @@ harbor run \
   --model deepseek/deepseek-chat
 ```
 
+### Compare CodeWhale and Codex
+
+Use the paired comparison harness when you need one normalized row per CLI for
+the same task, model, timeout, and environment:
+
+```bash
+python scripts/benchmarks/cli-compare.py \
+  --task prove-plus-comm \
+  --model deepseek/deepseek-chat \
+  --runs 3
+```
+
+The harness writes raw Harbor logs plus `summary.json`, `summary.md`, and
+`metadata.json` under `benchmark_results/cli-compare-*`. Missing metrics are
+reported as JSON `null`, and generated run directories are intentionally ignored
+by git; keep only curated summaries in docs or release notes.
+
 ## PinchBench
 
 PinchBench measures agent performance on real-world tasks — scheduling, email
diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md
index 600e4741..df0bd92e 100644
--- a/scripts/benchmarks/README.md
+++ b/scripts/benchmarks/README.md
@@ -17,6 +17,11 @@ export DEEPSEEK_API_KEY="sk-..."
 ./scripts/benchmarks/run-terminal-bench.sh \
   --model deepseek/deepseek-chat
 
+# CodeWhale vs Codex comparison rows
+python scripts/benchmarks/cli-compare.py \
+  --task prove-plus-comm \
+  --model deepseek/deepseek-chat
+
 # PinchBench (auto-install + run)
 ./scripts/benchmarks/run-pinchbench.sh \
   --install \
@@ -28,8 +33,10 @@ export DEEPSEEK_API_KEY="sk-..."
 - `run-swebench.sh` — SWE-bench batch driver and evaluator
 - `run-terminal-bench.sh` — Terminal-Bench runner via Harbor
 - `run-pinchbench.sh` — PinchBench runner with auto-install
+- `cli-compare.py` — CodeWhale/Codex Terminal-Bench comparison harness
 - `harbor/__init__.py` — Harbor adapter for CodeWhale (Python)
 - `harbor/codewhale_agent.py` — Adapter entry point
+- `harbor/codex_agent.py` — Codex adapter for paired CLI comparisons
 
 ## Documentation
 
diff --git a/scripts/benchmarks/cli-compare.py b/scripts/benchmarks/cli-compare.py
new file mode 100755
index 00000000..27ddace5
--- /dev/null
+++ b/scripts/benchmarks/cli-compare.py
@@ -0,0 +1,602 @@
+#!/usr/bin/env python3
+"""
+cli-compare.py - Run Terminal-Bench tasks through CodeWhale and Codex CLIs,
+emit normalized token/performance comparison rows.
+
+Usage:
+    # Run default tasks
+    python scripts/benchmarks/cli-compare.py
+
+    # Specific task and model
+    python scripts/benchmarks/cli-compare.py --task prove-plus-comm \\
+        --model deepseek/deepseek-chat --runs 3
+
+    # Regenerate from existing run artifacts
+    python scripts/benchmarks/cli-compare.py \\
+        --regenerate benchmark_results/cli-compare-20260609
+
+Output (per run date):
+    benchmark_results/cli-compare-YYYYMMDD/
+        summary.json         - one row per agent, all fields normalized
+        summary.md           - Markdown table suitable for release notes
+        metadata.json        - versions, model, timestamp, platform
+        codewhale/<task>/    - raw Harbor output
+        codex/<task>/        - raw Harbor output
+
+Prerequisites:
+    pip install harbor
+    Docker running
+    DEEPSEEK_API_KEY set (for CodeWhale)
+    CODEX_API_KEY or equivalent set (for Codex)
+
+Field semantics (summary.json rows):
+    task              str    - Terminal-Bench task name
+    agent             str    - "codewhale" or "codex"
+    run_idx           int    - 0-based run index
+    reward            float  - pass/fail score (1.0 = pass)
+    runtime_s         float  - wall-clock seconds (null if not available)
+    exception         str    - raised exception text (null = clean finish)
+    input_tokens      int    - provider-reported input tokens
+    cached_tokens     int    - provider-reported cached input tokens (null if N/A)
+    output_tokens     int    - provider-reported output tokens
+    reasoning_tokens  int    - provider-reported reasoning tokens (null if N/A)
+    answer_len        int    - locally-derived visible final-answer character count
+    transcript_path   str    - repo-relative path to raw agent output file (left as-is if outside the repo)
+
+All missing metrics are serialized as JSON ``null`` - never silently zeroed.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory containing this script
+REPO_ROOT = SCRIPT_DIR.parent.parent  # two levels up; used as Harbor's cwd and as the root for benchmark_results/
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+
+DEFAULT_TASKS = [  # task names used when --task is not given
+    "prove-plus-comm",
+    "cancel-async-tasks",
+    "configure-git-webserver",
+    "fix-code-vulnerability",
+]
+DEFAULT_MODEL = "deepseek/deepseek-chat"  # provider/name format
+DEFAULT_TIMEOUT_PER_RUN = 900  # seconds; wall-clock cap passed to subprocess.run for each `harbor run`
+DEFAULT_RUNS = 1  # runs per agent per task
+HARBOR_DATASET = "terminal-bench@2.0"  # combined with a task name as "<dataset>:<task>"
+CODEWHALE_AGENT = "scripts.benchmarks.harbor:CodeWhaleAgent"  # default for --codewhale-agent
+CODEX_AGENT = "scripts.benchmarks.harbor.codex_agent:CodexAgent"  # default for --codex-agent
+
+# ---------------------------------------------------------------------------
+# Harbor integration
+# ---------------------------------------------------------------------------
+
+
+def check_harbor() -> None:
+    """Exit with an error message unless `which harbor` and `docker info` both succeed."""
+    if subprocess.run(["which", "harbor"], capture_output=True).returncode != 0:  # relies on a `which` binary on PATH
+        sys.exit("Error: 'harbor' not found. Install with: pip install harbor")
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:  # NOTE(review): a missing docker binary raises FileNotFoundError here rather than reaching sys.exit
+        sys.exit("Error: Docker not running. Harbor requires Docker.")
+
+
+def run_harbor_single_task(
+    task: str,
+    model: str,
+    agent_path: str,
+    results_dir: Path,
+    timeout: int,
+) -> dict[str, Any]:
+    """Run one Terminal-Bench task via ``harbor run`` (cwd = REPO_ROOT) and time it.
+
+    Returns a dict with task, model, agent, runtime_s, exit_code, exception, stdout, stderr and results_dir; on timeout exit_code is -1 and stdout/stderr are empty.
+    """
+    dataset = f"{HARBOR_DATASET}:{task}"  # Harbor colon-syntax for single task
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    cmd = [
+        "harbor", "run",
+        "--dataset", dataset,
+        "--agent", agent_path,
+        "--model", model,
+        "--n-concurrent", "1",
+        "--results-dir", str(results_dir),
+    ]
+
+    start = time.time()  # wall-clock start; runtime_s is measured around the subprocess
+    try:
+        proc = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            cwd=REPO_ROOT,
+        )
+        runtime_s = round(time.time() - start, 2)
+    except subprocess.TimeoutExpired:
+        runtime_s = round(time.time() - start, 2)
+        return {  # timeout row: any partial output captured before the kill is not kept
+            "task": task, "model": model, "agent": agent_path,
+            "runtime_s": runtime_s, "exit_code": -1,
+            "exception": f"Timeout after {timeout}s",
+            "stdout": "", "stderr": "", "results_dir": str(results_dir),
+        }
+
+    return {
+        "task": task, "model": model, "agent": agent_path,
+        "runtime_s": runtime_s,
+        "exit_code": proc.returncode,
+        "exception": None,  # a non-zero exit code is reported via exit_code only
+        "stdout": proc.stdout,
+        "stderr": proc.stderr,
+        "results_dir": str(results_dir),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Result parsing
+# ---------------------------------------------------------------------------
+
+
+def _try_int(val: Any) -> Optional[int]:
+    """Coerce *val* to ``int``; ``None`` or unconvertible input yields ``None``."""
+    try:
+        # None is mapped explicitly instead of relying on int(None) raising TypeError.
+        return None if val is None else int(val)
+    except (TypeError, ValueError):
+        return None
+
+
+def _try_float(val: Any) -> Optional[float]:
+    """Coerce *val* to ``float``; ``None`` or unconvertible input yields ``None``."""
+    try:
+        # None is mapped explicitly instead of relying on float(None) raising TypeError.
+        return None if val is None else float(val)
+    except (TypeError, ValueError):
+        return None
+
+
+def _first_present(mapping: dict[str, Any], *keys: str) -> Any:
+    """Return the first non-``None`` value stored under *keys*, else ``None``."""
+    # An absent key and a key mapped to None are treated alike; falsy but
+    # real values such as 0 or "" are returned.
+    return next((mapping[k] for k in keys if mapping.get(k) is not None), None)
+
+
+def _stable_path(path: Path) -> str:  # string form of *path* for summary rows
+    try:  # repo-relative when path is located under REPO_ROOT
+        return str(path.relative_to(REPO_ROOT))
+    except ValueError:  # raised by relative_to when path is not under REPO_ROOT
+        return str(path)  # keep the path exactly as given
+
+
+def parse_token_jsonl(lines: list[str]) -> dict[str, Optional[int]]:
+    """Extract token usage from CodeWhale/Codex stream JSONL lines.
+
+    CodeWhale emits ``{"type":"result","usage":{...}}`` at end-of-stream.
+    Codex may emit usage in closing messages or transcript footers.
+
+    Lines are scanned from the end; each metric takes the first value found
+    under any of its aliases in a ``usage`` / ``token_usage`` object.  Lines
+    that are blank, not JSON, or JSON but not an object are skipped.
+
+    Args:
+        lines: Raw transcript lines (JSONL mixed with arbitrary log text).
+
+    Returns:
+        Dict with ``input_tokens``, ``cached_tokens``, ``output_tokens`` and
+        ``reasoning_tokens``; a metric that was never reported stays ``None``.
+    """
+    aliases: dict[str, tuple[str, ...]] = {
+        "input_tokens": ("input_tokens", "prompt_tokens"),
+        "cached_tokens": (
+            "cached_input_tokens",
+            "cache_read_input_tokens",
+            "cached_tokens",
+        ),
+        "output_tokens": ("output_tokens", "completion_tokens"),
+        "reasoning_tokens": (
+            "reasoning_tokens",
+            "thinking_tokens",
+            "reasoning_completion_tokens",
+        ),
+    }
+    result: dict[str, Optional[int]] = dict.fromkeys(aliases)
+
+    for line in reversed(lines):  # usage typically at the end
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError:
+            continue  # plain log text, not JSON
+        if not isinstance(obj, dict):
+            # Bare JSON scalars/arrays ("42", "null", "[]") have no .get();
+            # without this guard they raised AttributeError mid-parse.
+            continue
+
+        usage = obj.get("usage") or obj.get("token_usage") or {}
+        if isinstance(usage, dict):
+            for metric, keys in aliases.items():
+                if result[metric] is None:
+                    result[metric] = _try_int(_first_present(usage, *keys))
+        if all(v is not None for v in result.values()):
+            break
+
+    return result
+
+
+def extract_answer_len(text: str) -> Optional[int]:
+    """Estimate the character count of the agent's visible final answer.
+
+    The first marker from a fixed list that occurs in *text* (taken at its last
+    occurrence) starts the answer, which is then truncated at any code fence,
+    ``$ ``, ``# `` or ``/workspace`` found after the marker.  Without a marker,
+    the last paragraph over 20 characters that does not start with a fence or
+    ``$`` is measured, then the whole stripped text.  Empty input gives None.
+    """
+    if not text:
+        return None
+    markers = ("## Final Answer", "## Answer", "final answer",
+               "Here is the", "The solution")
+    terminators = ("```", "$ ", "# ", "/workspace")
+    for marker in markers:
+        start = text.rfind(marker)
+        if start < 0:
+            continue
+        answer = text[start:]
+        for stop in terminators:
+            cut = answer.find(stop, len(marker))
+            if cut > 0:
+                answer = answer[:cut]
+        return len(answer.strip())
+
+    # No marker found: fall back to the last prose-looking paragraph.
+    for para in reversed([p.strip() for p in text.split("\n\n") if p.strip()]):
+        if len(para) > 20 and not para.startswith(("```", "$")):
+            return len(para)
+    return len(text.strip()) if text.strip() else None
+
+
+def parse_harbor_run(task_dir: Path, agent_name: str) -> dict[str, Any]:
+    """Parse Harbor results for a single task run into one normalized row.
+
+    *task_dir* is searched recursively (first usable match wins) for:
+        results.json        - score/reward, runtime/duration, exception/error
+        *.txt               - agent transcript: token usage and answer length
+        run_metadata.json   - runtime_seconds, used only as a runtime fallback
+    """
+    row: dict[str, Any] = {
+        "task": task_dir.name,
+        "agent": agent_name,
+        "reward": None,
+        "runtime_s": None,
+        "exception": None,
+        "input_tokens": None,
+        "cached_tokens": None,
+        "output_tokens": None,
+        "reasoning_tokens": None,
+        "answer_len": None,
+        "transcript_path": None,
+    }
+
+    # 1. Harbor results.json - pass/fail and runtime
+    for candidate in sorted(task_dir.rglob("results.json")):
+        try:
+            data = json.loads(candidate.read_text())
+            if isinstance(data, dict):
+                row["reward"] = _try_float(_first_present(data, "score", "reward"))
+                row["runtime_s"] = _try_float(
+                    _first_present(data, "runtime", "duration")
+                )
+                exc = data.get("exception") or data.get("error")
+                row["exception"] = str(exc) if exc else None
+                break  # the first results.json holding a JSON object is authoritative
+        except (json.JSONDecodeError, OSError):
+            continue  # unreadable or malformed file: try the next candidate
+
+    # 2. Agent transcript - token usage and answer
+    for txt_file in sorted(task_dir.rglob("*.txt")):
+        if txt_file.name.startswith("."):
+            continue
+        try:
+            text = txt_file.read_text(errors="ignore")
+        except OSError:
+            continue
+        if not text.strip():
+            continue
+
+        row["transcript_path"] = _stable_path(txt_file)
+
+        tokens = parse_token_jsonl(text.split("\n"))
+        for key, value in tokens.items():
+            if row[key] is None:
+                row[key] = value
+
+        if row["answer_len"] is None:
+            row["answer_len"] = extract_answer_len(text)
+        break  # only the first non-empty .txt file (in sorted order) is used
+
+    # 3. Harbor run metadata - runtime fallback
+    for meta_file in sorted(task_dir.rglob("run_metadata.json")):
+        try:
+            data = json.loads(meta_file.read_text())
+            if isinstance(data, dict) and row["runtime_s"] is None:
+                row["runtime_s"] = _try_float(data.get("runtime_seconds"))
+        except (json.JSONDecodeError, OSError):
+            continue
+
+    return row
+
+
+# ---------------------------------------------------------------------------
+# Summary generation
+# ---------------------------------------------------------------------------
+
+
+def generate_markdown_table(rows: list[dict[str, Any]]) -> str:
+    """Render normalized rows as a right-aligned Markdown table.
+
+    Missing values print as ``null``, floats with two decimals, ints with
+    thousands separators; an empty *rows* yields a ``*(no data)*`` line.
+    """
+    if not rows:
+        return "*(no data)*\n"
+
+    columns = (
+        "task", "agent", "reward", "input_tokens", "cached_tokens",
+        "output_tokens", "reasoning_tokens", "runtime_s", "answer_len",
+    )
+
+    def render(value: Any) -> str:
+        if value is None:
+            return "null"
+        if isinstance(value, float):
+            return f"{value:.2f}"
+        # bool is an int subclass and takes the int branch as well
+        return f"{value:,}" if isinstance(value, int) else str(value)
+
+    # Header row, separator row, then one line per input row.
+    lines = ["| " + " | ".join(c.replace("_", " ") for c in columns) + " |"]
+    lines.append("|" + "|".join([" ---: "] * len(columns)) + "|")
+    for row in rows:
+        lines.append("| " + " | ".join(render(row.get(c)) for c in columns) + " |")
+    return "\n".join(lines) + "\n"
+
+
+def generate_json_summary(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return a new list of *rows* ordered by (task, agent, run_idx)."""
+    def order(r: dict[str, Any]) -> tuple[Any, Any, Any]:
+        # Absent keys sort as "" (task, agent) or 0 (run_idx).
+        return (r.get("task", ""), r.get("agent", ""), r.get("run_idx", 0))
+    return sorted(rows, key=order)
+
+
+# ---------------------------------------------------------------------------
+# Regenerate from existing logs
+# ---------------------------------------------------------------------------
+
+
+def regenerate(results_dir: Path) -> list[dict[str, Any]]:
+    """Rebuild normalized rows from the raw artifacts under *results_dir*.
+
+    The layout walked is ``<agent>/<task>/[run_<idx>/]``; a task directory
+    whose subdirectories are not all named ``run_*`` counts as a single run 0.
+    """
+    rows: list[dict[str, Any]] = []
+    agent_dirs = [
+        d for d in sorted(results_dir.iterdir())
+        if d.is_dir() and not d.name.startswith(".")
+    ]
+    for agent_dir in agent_dirs:
+        for task_dir in sorted(agent_dir.iterdir()):
+            if not task_dir.is_dir():
+                continue
+            run_dirs = sorted(d for d in task_dir.iterdir() if d.is_dir())
+            layered = bool(run_dirs) and all(d.name.startswith("run_") for d in run_dirs)
+            for source in (run_dirs if layered else [task_dir]):
+                row = parse_harbor_run(source, agent_dir.name)
+                row["task"] = task_dir.name
+                try:
+                    # Index is the text after the last underscore; a bare task dir or a bad suffix maps to 0.
+                    row["run_idx"] = int(source.name.split("_")[-1]) if layered else 0
+                except (ValueError, IndexError):
+                    row["run_idx"] = 0
+                rows.append(row)
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# Metadata capture
+# ---------------------------------------------------------------------------
+
+
+def capture_metadata(model: str) -> dict[str, Any]:
+    """Capture environment metadata for reproducibility.
+
+    Tool versions and the git commit are best-effort: a command that cannot
+    be started (e.g. ``codex`` is not installed on the host) or that exits
+    non-zero is left out of the result instead of aborting the run.
+    """
+    meta: dict[str, Any] = {
+        "timestamp_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "platform": os.uname().sysname + "/" + os.uname().machine,
+        "model": model,
+        "dataset": HARBOR_DATASET,
+    }
+    probes = (
+        ("codewhale_version", ["codewhale", "--version"]),
+        ("codex_version", ["codex", "--version"]),
+        ("harbor_version", ["harbor", "--version"]),
+        ("git_commit", ["git", "rev-parse", "HEAD"]),
+    )
+    for key, cmd in probes:
+        try:
+            r = subprocess.run(cmd, capture_output=True, text=True, cwd=REPO_ROOT)
+        except OSError:  # FileNotFoundError / PermissionError: tool not runnable here
+            continue
+        if r.returncode == 0:
+            out = r.stdout.strip()
+            meta[key] = out[:12] if key == "git_commit" else out  # short commit hash
+    return meta
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Parse CLI arguments, then regenerate summaries or run a fresh comparison.
+
+    With ``--regenerate DIR`` the existing artifacts in DIR are re-parsed and
+    ``summary.json`` / ``summary.md`` are rewritten there.  Otherwise Harbor and
+    Docker are checked, each task is run ``--runs`` times through both agents
+    under a new ``benchmark_results/cli-compare-YYYYMMDD[-N]`` directory, and
+    ``metadata.json`` plus both summaries are written into it.  Exits via
+    ``sys.exit`` when the regenerate directory or a prerequisite is missing.
+    """
+    parser = argparse.ArgumentParser(
+        description="CodeWhale vs Codex CLI token comparison harness",
+    )
+    parser.add_argument(
+        "--task", nargs="+", default=DEFAULT_TASKS,
+        help=f"Terminal-Bench task names (default: {' '.join(DEFAULT_TASKS)})",
+    )
+    parser.add_argument(
+        "--model", default=DEFAULT_MODEL,
+        help=f"Model in provider/name format (default: {DEFAULT_MODEL})",
+    )
+    parser.add_argument(
+        "--runs", type=int, default=DEFAULT_RUNS,
+        help=f"Number of runs per agent per task (default: {DEFAULT_RUNS})",
+    )
+    parser.add_argument(
+        "--timeout", type=int, default=DEFAULT_TIMEOUT_PER_RUN,
+        help=f"Timeout per run in seconds (default: {DEFAULT_TIMEOUT_PER_RUN})",
+    )
+    parser.add_argument(
+        "--regenerate", type=Path, default=None,
+        help="Regenerate summary from existing raw results directory",
+    )
+    parser.add_argument(
+        "--codewhale-agent", default=CODEWHALE_AGENT,
+        help="Harbor agent import path for CodeWhale",
+    )
+    parser.add_argument(
+        "--codex-agent", default=CODEX_AGENT,
+        help="Harbor agent import path for Codex",
+    )
+    args = parser.parse_args()
+
+    # --------------- Regenerate mode ---------------
+    if args.regenerate:
+        results_dir = args.regenerate
+        if not results_dir.exists():
+            sys.exit(f"Error: results directory not found: {results_dir}")
+        rows = regenerate(results_dir)
+        summary_rows = generate_json_summary(rows)
+        (results_dir / "summary.json").write_text(json.dumps(summary_rows, indent=2))
+        md = generate_markdown_table(summary_rows)
+        (results_dir / "summary.md").write_text(md)
+        print(md)
+        return
+
+    # --------------- Fresh run mode ---------------
+    check_harbor()
+
+    # The local date names the run directory; repeat runs on one day get -2, -3, ...
+    date_str = datetime.now().strftime("%Y%m%d")
+    run_dir = REPO_ROOT / "benchmark_results" / f"cli-compare-{date_str}"
+    if run_dir.exists():
+        # Append run number if directory already exists
+        suffix = 2
+        while (run_dir := REPO_ROOT / "benchmark_results" /
+               f"cli-compare-{date_str}-{suffix}").exists():
+            suffix += 1
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    # Metadata
+    meta = capture_metadata(args.model)
+    meta["tasks"] = args.task
+    meta["runs_per_task"] = args.runs
+    (run_dir / "metadata.json").write_text(json.dumps(meta, indent=2))
+
+    # (printed label, agent name stored in rows, Harbor import path, raw-output directory)
+    agents = (
+        ("CodeWhale", "codewhale", args.codewhale_agent, run_dir / "codewhale"),
+        ("Codex", "codex", args.codex_agent, run_dir / "codex"),
+    )
+    for _, _, _, agent_dir in agents:
+        agent_dir.mkdir(parents=True, exist_ok=True)
+
+    all_rows: list[dict[str, Any]] = []
+
+    for task in args.task:
+        for run_idx in range(args.runs):
+            header = f"Task: {task}  Run: {run_idx+1}/{args.runs}"
+            print(f"\n{'='*60}")
+            print(header)
+            print("=" * 60)
+
+            # Both agents go through the identical run/parse/merge sequence so
+            # their rows stay directly comparable.
+            for label, agent_name, agent_path, agent_dir in agents:
+                print(f"\n--- {label} ---")
+                agent_run_dir = agent_dir / task / f"run_{run_idx}"
+                result = run_harbor_single_task(
+                    task=task, model=args.model,
+                    agent_path=agent_path,
+                    results_dir=agent_run_dir, timeout=args.timeout,
+                )
+                row = parse_harbor_run(agent_run_dir, agent_name)
+                row["task"] = task
+                row["run_idx"] = run_idx
+                # A runtime parsed from Harbor's files wins; else use our wall-clock timing.
+                if row["runtime_s"] is None:
+                    row["runtime_s"] = result["runtime_s"]
+                if result["exception"]:
+                    row["exception"] = row["exception"] or result["exception"]
+                all_rows.append(row)
+                self_report(row)
+
+    # Write summaries.  Both files use the same sorted rows so summary.md
+    # matches summary.json and the table that --regenerate later produces.
+    summary_rows = generate_json_summary(all_rows)
+    summary_json = run_dir / "summary.json"
+    summary_json.write_text(json.dumps(summary_rows, indent=2))
+    print(f"\nSummary JSON: {summary_json}")
+
+    md = generate_markdown_table(summary_rows)
+    (run_dir / "summary.md").write_text(md)
+    print(f"Summary MD:   {run_dir / 'summary.md'}")
+    print(f"Metadata:     {run_dir / 'metadata.json'}")
+    print("\n" + md)
+
+
+def self_report(row: dict[str, Any]) -> None:
+    """Print a one-line run summary; reward/input/output always appear, other metrics only when set."""
+    def shown(value: Any) -> Any:
+        return "null" if value is None else value
+    parts = [f"reward={shown(row['reward'])}", f"input={shown(row['input_tokens'])}",
+             f"output={shown(row['output_tokens'])}"]
+    for label, key in (("cached", "cached_tokens"), ("reasoning", "reasoning_tokens"), ("answer_len", "answer_len")):
+        if row[key] is not None:
+            parts.append(f"{label}={row[key]}")
+    if row["runtime_s"] is not None:
+        parts.append(f"runtime={row['runtime_s']:.1f}s")
+    print("  " + ", ".join(parts))
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/scripts/benchmarks/harbor/codex_agent.py b/scripts/benchmarks/harbor/codex_agent.py
new file mode 100755
index 00000000..abae2104
--- /dev/null
+++ b/scripts/benchmarks/harbor/codex_agent.py
@@ -0,0 +1,126 @@
+"""Harbor adapter for Codex CLI."""
+
+import json
+import os
+import shlex
+from pathlib import Path, PurePosixPath
+from typing import Any
+
+from harbor.agents.installed.base import (
+    BaseInstalledAgent,
+    CliFlag,
+    with_prompt_template,
+)
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+class CodexAgent(BaseInstalledAgent):
+    """Harbor adapter that installs the Codex CLI in the task container and runs it via ``codex exec``."""
+
+    _OUTPUT_FILENAME = "codex.txt"  # transcript file name written under /logs/agent/
+
+    CLI_FLAGS = [
+        CliFlag(
+            "allowed-tools",
+            cli="--allowed-tools",  # NOTE(review): confirm the installed Codex CLI accepts this flag
+            type="str",
+            default="Bash,Read,Write,Edit,Glob,Grep",
+        ),
+    ]
+
+    @staticmethod
+    def name() -> str:
+        return "codex"
+
+    def version(self) -> str | None:
+        return getattr(self, "_version", None)  # None when no _version attribute has been set
+
+    def get_version_command(self) -> str | None:
+        return "codex --version 2>/dev/null || codex-cli --version 2>/dev/null"  # falls back to a codex-cli binary
+
+    def parse_version(self, stdout: str) -> str:  # first non-empty line, minus a leading "codex-cli "/"codex " prefix
+        text = stdout.strip()
+        for line in text.splitlines():
+            line = line.strip()
+            if line:
+                for prefix in ("codex-cli ", "codex "):
+                    if line.lower().startswith(prefix):  # prefix match is case-insensitive
+                        return line[len(prefix):]
+                return line  # no known prefix: return the line unchanged
+        return text  # empty output: returns the (empty) stripped text
+
+    async def install(self, environment: BaseEnvironment) -> None:
+        """Install system packages, Node.js if missing, then the Codex CLI in the container."""
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if ldd --version 2>&1 | grep -qi musl || [ -f /etc/alpine-release ]; then"  # musl / Alpine image
+                "  apk add --no-cache curl bash nodejs npm git ripgrep;"
+                " elif command -v apt-get &>/dev/null; then"  # NOTE(review): `&>` is a bash-ism; confirm the exec shell is bash
+                "  apt-get update && apt-get install -y curl git ripgrep;"
+                " elif command -v yum &>/dev/null; then"
+                "  yum install -y curl git ripgrep;"  # NOTE(review): no Node.js install on this branch; the step below is apt-only
+                " fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+        )
+
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if ! command -v node &>/dev/null; then"  # only when the previous step did not provide node
+                "  curl -fsSL https://deb.nodesource.com/setup_20.x | bash - &&"
+                "  apt-get install -y nodejs;"
+                " fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+        )
+
+        await self.exec_as_agent(
+            environment,
+            command="npm install -g codex",  # NOTE(review): confirm the npm package name; OpenAI's Codex CLI is published as @openai/codex
+        )
+
+    @with_prompt_template
+    async def run(
+        self,
+        instruction: str,
+        environment: BaseEnvironment,
+        context: AgentContext,
+    ) -> None:
+        """Run ``codex exec`` on *instruction* and tee its combined output to /logs/agent/codex.txt."""
+        escaped_instruction = shlex.quote(instruction)  # instruction is passed as one shell-quoted argument
+
+        cli_flags = self.build_cli_flags()
+        extra_flags = (cli_flags + " ") if cli_flags else ""
+
+        model_flag = ""
+        if self.model_name:
+            model_flag = f"--model {shlex.quote(self.model_name)} "
+
+        # Forward API keys that are set (non-empty) in the host environment
+        env: dict[str, str] = {}
+        for key in ("CODEX_API_KEY", "DEEPSEEK_API_KEY", "OPENAI_API_KEY",
+                     "ANTHROPIC_API_KEY", "OPENROUTER_API_KEY"):
+            val = os.environ.get(key, "")
+            if val:
+                env[key] = val
+
+        output_path = f"/logs/agent/{self._OUTPUT_FILENAME}"
+
+        await self.exec_as_agent(
+            environment,
+            command=(
+                f"codex exec --yes "  # NOTE(review): verify --yes / --workspace against the installed CLI's `exec` options
+                f"{model_flag}{extra_flags}"
+                f"--workspace /workspace "
+                f"{escaped_instruction} "
+                f"2>&1 | tee {shlex.quote(output_path)}"  # stderr is merged into the transcript
+                f" || true"  # the command's exit status is deliberately discarded
+            ),
+            env=env if env else None,
+        )
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        pass  # intentionally a no-op: nothing is written to the Harbor context after the run