feat(bench): improve cli-compare harness with real Harbor integration (#3009)

- Match actual Harbor CLI interface (no invented flags) - Proper BaseInstalledAgent subclass for Codex - Robust token extraction from stream JSONL + transcript parsing - Heuristic answer_len extraction (## Final Answer markers) - Metadata capture: versions, git commit, platform, timestamp - --regenerate walks existing run directories - All missing fields explicit null, never zero - Support multiple runs per task with run_idx tracking The harness is designed to run: harbor run --dataset terminal-bench@2.0:<task> --agent ... --model ... for both codex and codewhale agents, then normalize the results.
2026-06-13 01:53:36 +08:00
parent 38519552fd
commit 5d9b5f67cb
3 changed files with 706 additions and 0 deletions
@@ -0,0 +1,580 @@
 #!/usr/bin/env python3
 """
 cli-compare.py — Run Terminal-Bench tasks through CodeWhale and Codex CLIs,
 emit normalized token/performance comparison rows.
 Usage:
    # Run default tasks
    python scripts/benchmarks/cli-compare.py
    # Specific task and model
    python scripts/benchmarks/cli-compare.py --task prove-plus-comm \\
        --model deepseek/deepseek-chat --runs 3
    # Regenerate from existing run artifacts
    python scripts/benchmarks/cli-compare.py \\
        --regenerate benchmark_results/cli-compare-20260609
 Output (per run date):
    benchmark_results/cli-compare-YYYYMMDD/
        summary.json         — one row per agent, all fields normalized
        summary.md           — Markdown table suitable for release notes
        metadata.json        — versions, model, timestamp, platform
        codewhale/<task>/    — raw Harbor output
        codex/<task>/        — raw Harbor output
 Prerequisites:
    pip install harbor
    Docker running
    DEEPSEEK_API_KEY set (for CodeWhale)
    CODEX_API_KEY or equivalent set (for Codex)
 Field semantics (summary.json rows):
    task              str    — Terminal-Bench task name
    agent             str    — "codewhale" or "codex"
    run_idx           int    — 0-based run index
    reward            float  — pass/fail score (1.0 = pass)
    runtime_s         float  — wall-clock seconds (null if not available)
    exception         str    — raised exception text (null = clean finish)
    input_tokens      int    — provider-reported input tokens
    cached_tokens     int    — provider-reported cached input tokens (null if N/A)
    output_tokens     int    — provider-reported output tokens
    reasoning_tokens  int    — provider-reported reasoning tokens (null if N/A)
    answer_len        int    — locally-derived visible final-answer character count
    transcript_path   str    — relative path to raw agent output file
 All missing metrics are serialized as JSON ``null`` — never silently zeroed.
 """
 import argparse
 import json
 import os
 import subprocess
 import sys
 import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Optional
 SCRIPT_DIR = Path(__file__).resolve().parent
 REPO_ROOT = SCRIPT_DIR.parent.parent
 # ---------------------------------------------------------------------------
 # Config
 # ---------------------------------------------------------------------------
 DEFAULT_TASKS = [
    "prove-plus-comm",
    "cancel-async-tasks",
    "configure-git-webserver",
    "fix-code-vulnerability",
 ]
 DEFAULT_MODEL = "deepseek/deepseek-chat"
 DEFAULT_TIMEOUT_PER_RUN = 900  # seconds (Harbor handles its own timeout internally)
 DEFAULT_RUNS = 1
 HARBOR_DATASET = "terminal-bench@2.0"
 CODEWHALE_AGENT = "scripts.benchmarks.harbor:CodeWhaleAgent"
 CODEX_AGENT = "scripts.benchmarks.harbor.codex_agent:CodexAgent"
 # ---------------------------------------------------------------------------
 # Harbor integration
 # ---------------------------------------------------------------------------
 def check_harbor() -> None:
    """Verify Harbor is installed and Docker is running."""
    if subprocess.run(["which", "harbor"], capture_output=True).returncode != 0:
        sys.exit("Error: 'harbor' not found. Install with: pip install harbor")
    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
        sys.exit("Error: Docker not running. Harbor requires Docker.")
 def run_harbor_single_task(
    task: str,
    model: str,
    agent_path: str,
    results_dir: Path,
    timeout: int,
 ) -> dict[str, Any]:
    """Run a single Terminal-Bench task through Harbor.
    Harbor supports task-level filtering via the ``--task`` flag (Harbor ≥0.4).
    If unavailable, falls back to running the full dataset with env-based filtering.
    """
    dataset = f"{HARBOR_DATASET}:{task}"  # Harbor colon-syntax for single task
    results_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        "harbor", "run",
        "--dataset", dataset,
        "--agent", agent_path,
        "--model", model,
        "--n-concurrent", "1",
        "--results-dir", str(results_dir),
    ]
    start = time.time()
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=REPO_ROOT,
        )
        runtime_s = round(time.time() - start, 2)
    except subprocess.TimeoutExpired:
        runtime_s = round(time.time() - start, 2)
        return {
            "task": task, "model": model, "agent": agent_path,
            "runtime_s": runtime_s, "exit_code": -1,
            "exception": f"Timeout after {timeout}s",
            "stdout": "", "stderr": "", "results_dir": str(results_dir),
        }
    return {
        "task": task, "model": model, "agent": agent_path,
        "runtime_s": runtime_s,
        "exit_code": proc.returncode,
        "exception": None,
        "stdout": proc.stdout,
        "stderr": proc.stderr,
        "results_dir": str(results_dir),
    }
 # ---------------------------------------------------------------------------
 # Result parsing
 # ---------------------------------------------------------------------------
 def _try_int(val: Any) -> Optional[int]:
    if val is None:
        return None
    try:
        return int(val)
    except (ValueError, TypeError):
        return None
 def _try_float(val: Any) -> Optional[float]:
    if val is None:
        return None
    try:
        return float(val)
    except (ValueError, TypeError):
        return None
 def parse_token_jsonl(lines: list[str]) -> dict[str, Optional[int]]:
    """Extract token usage from CodeWhale/Codex stream JSONL lines.
    CodeWhale emits ``{"type":"result","usage":{...}}`` at end-of-stream.
    Codex may emit usage in closing messages or transcript footers.
    """
    result: dict[str, Optional[int]] = {
        "input_tokens": None, "cached_tokens": None,
        "output_tokens": None, "reasoning_tokens": None,
    }
    if not lines:
        return result
    for line in reversed(lines):  # usage typically at the end
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Try regex extraction for non-JSON transcript lines
            continue
        usage = obj.get("usage") or obj.get("token_usage") or {}
        if isinstance(usage, dict):
            if result["input_tokens"] is None:
                result["input_tokens"] = _try_int(
                    usage.get("input_tokens") or usage.get("prompt_tokens")
                )
            if result["cached_tokens"] is None:
                result["cached_tokens"] = _try_int(
                    usage.get("cached_input_tokens")
                    or usage.get("cache_read_input_tokens")
                    or usage.get("cached_tokens")
                )
            if result["output_tokens"] is None:
                result["output_tokens"] = _try_int(
                    usage.get("output_tokens") or usage.get("completion_tokens")
                )
            if result["reasoning_tokens"] is None:
                result["reasoning_tokens"] = _try_int(
                    usage.get("reasoning_tokens")
                    or usage.get("thinking_tokens")
                    or usage.get("reasoning_completion_tokens")
                )
        if all(v is not None for v in result.values()):
            break
    return result
 def extract_answer_len(text: str) -> Optional[int]:
    """Heuristic: length of the last substantial text block that looks like an answer.
    Looks for the last non-code, non-log paragraph after the agent has finished
    its tool-calling phase. Returns character count or None.
    """
    if not text:
        return None
    # Agent outputs often have a "## Final Answer" or similar marker.
    # Try to find the last answer section.
    for marker in ("## Final Answer", "## Answer", "final answer",
                   "Here is the", "The solution"):
        idx = text.rfind(marker)
        if idx >= 0:
            # Take text from marker to end, strip trailing shell logs
            tail = text[idx:]
            # Stop at next shell prompt or markdown separator
            for term in ("```", "$ ", "# ", "/workspace"):
                term_idx = tail.find(term, len(marker))
                if term_idx > 0:
                    tail = tail[:term_idx]
            return len(tail.strip())
    # Fallback: last paragraph that isn't code or a prompt
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    for p in reversed(paragraphs):
        if not p.startswith("```") and not p.startswith("$") and len(p) > 20:
            return len(p)
    return len(text.strip()) if text.strip() else None
 def parse_harbor_run(task_dir: Path, agent_name: str) -> dict[str, Any]:
    """Parse Harbor results for a single task run.
    Harbor stores per-task output in:
        <task_dir>/
            results.json      — Harbor's own eval summary
            logs/agent/*.txt  — raw agent transcript (if stdout captured)
    """
    row: dict[str, Any] = {
        "task": task_dir.name,
        "agent": agent_name,
        "reward": None,
        "runtime_s": None,
        "exception": None,
        "input_tokens": None,
        "cached_tokens": None,
        "output_tokens": None,
        "reasoning_tokens": None,
        "answer_len": None,
        "transcript_path": None,
    }
    # 1. Harbor results.json — pass/fail and runtime
    for candidate in sorted(task_dir.rglob("results.json")):
        try:
            data = json.loads(candidate.read_text())
            if isinstance(data, dict):
                row["reward"] = _try_float(data.get("score") or data.get("reward"))
                row["runtime_s"] = _try_float(data.get("runtime") or data.get("duration"))
                exc = data.get("exception") or data.get("error")
                row["exception"] = str(exc) if exc else None
                break
        except (json.JSONDecodeError, OSError):
            continue
    # 2. Agent transcript — token usage and answer
    for txt_file in sorted(task_dir.rglob("*.txt")):
        if txt_file.name.startswith("."):
            continue
        try:
            text = txt_file.read_text(errors="ignore")
        except OSError:
            continue
        if not text.strip():
            continue
        row["transcript_path"] = str(txt_file.relative_to(REPO_ROOT))
        # Token extraction from stream JSONL
        tokens = parse_token_jsonl(text.split("\n"))
        row["input_tokens"] = row["input_tokens"] or tokens["input_tokens"]
        row["cached_tokens"] = row["cached_tokens"] or tokens["cached_tokens"]
        row["output_tokens"] = row["output_tokens"] or tokens["output_tokens"]
        row["reasoning_tokens"] = row["reasoning_tokens"] or tokens["reasoning_tokens"]
        # Answer length
        if row["answer_len"] is None:
            row["answer_len"] = extract_answer_len(text)
        break
    # 3. Harbor run metadata — runtime fallback
    for meta_file in sorted(task_dir.rglob("run_metadata.json")):
        try:
            data = json.loads(meta_file.read_text())
            if isinstance(data, dict) and row["runtime_s"] is None:
                row["runtime_s"] = _try_float(data.get("runtime_seconds"))
        except (json.JSONDecodeError, OSError):
            continue
    return row
 # ---------------------------------------------------------------------------
 # Summary generation
 # ---------------------------------------------------------------------------
 def generate_markdown_table(rows: list[dict[str, Any]]) -> str:
    """Generate a Markdown comparison table from normalized rows."""
    if not rows:
        return "*(no data)*\n"
    headers = [
        "task", "agent", "reward", "input_tokens", "cached_tokens",
        "output_tokens", "reasoning_tokens", "runtime_s", "answer_len",
    ]
    md = "| " + " | ".join(h.replace("_", " ") for h in headers) + " |\n"
    md += "|" + "|".join(" ---: " for _ in headers) + "|\n"
    for row in rows:
        cells: list[str] = []
        for h in headers:
            val = row.get(h)
            if val is None:
                cells.append("null")
            elif isinstance(val, float):
                cells.append(f"{val:.2f}")
            elif isinstance(val, int):
                cells.append(f"{val:,}")
            else:
                cells.append(str(val))
        md += "| " + " | ".join(cells) + " |\n"
    return md
 def generate_json_summary(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return rows sorted by task, agent, run_idx."""
    return sorted(
        rows,
        key=lambda r: (r.get("task", ""), r.get("agent", ""), r.get("run_idx", 0)),
    )
 # ---------------------------------------------------------------------------
 # Regenerate from existing logs
 # ---------------------------------------------------------------------------
 def regenerate(results_dir: Path) -> list[dict[str, Any]]:
    """Walk existing run directory and rebuild normalized rows."""
    rows: list[dict[str, Any]] = []
    for agent_dir in sorted(results_dir.iterdir()):
        if not agent_dir.is_dir() or agent_dir.name.startswith("."):
            continue
        agent_name = agent_dir.name
        for task_dir in sorted(agent_dir.iterdir()):
            if not task_dir.is_dir():
                continue
            # Check for per-run subdirectories
            subdirs = [d for d in task_dir.iterdir() if d.is_dir()]
            if subdirs and all(d.name.startswith("run_") for d in subdirs):
                for run_dir in sorted(subdirs):
                    row = parse_harbor_run(run_dir, agent_name)
                    row["task"] = row["task"] or task_dir.name
                    try:
                        row["run_idx"] = int(run_dir.name.split("_")[-1])
                    except (ValueError, IndexError):
                        row["run_idx"] = 0
                    rows.append(row)
            else:
                row = parse_harbor_run(task_dir, agent_name)
                row["task"] = row["task"] or task_dir.name
                row["run_idx"] = 0
                rows.append(row)
    return rows
 # ---------------------------------------------------------------------------
 # Metadata capture
 # ---------------------------------------------------------------------------
 def capture_metadata(model: str) -> dict[str, Any]:
    """Capture environment metadata for reproducibility."""
    meta: dict[str, Any] = {
        "timestamp_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "platform": os.uname().sysname + "/" + os.uname().machine,
        "model": model,
        "dataset": HARBOR_DATASET,
    }
    # CodeWhale version
    r = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
    if r.returncode == 0:
        meta["codewhale_version"] = r.stdout.strip()
    # Codex version
    r = subprocess.run(["codex", "--version"], capture_output=True, text=True)
    if r.returncode == 0:
        meta["codex_version"] = r.stdout.strip()
    # Harbor version
    r = subprocess.run(["harbor", "--version"], capture_output=True, text=True)
    if r.returncode == 0:
        meta["harbor_version"] = r.stdout.strip()
    # Git commit
    r = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        capture_output=True, text=True, cwd=REPO_ROOT,
    )
    if r.returncode == 0:
        meta["git_commit"] = r.stdout.strip()[:12]
    return meta
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
 def main() -> None:
    parser = argparse.ArgumentParser(
        description="CodeWhale vs Codex CLI token comparison harness",
    )
    parser.add_argument(
        "--task", nargs="+", default=DEFAULT_TASKS,
        help=f"Terminal-Bench task names (default: {' '.join(DEFAULT_TASKS)})",
    )
    parser.add_argument(
        "--model", default=DEFAULT_MODEL,
        help=f"Model in provider/name format (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--runs", type=int, default=DEFAULT_RUNS,
        help=f"Number of runs per agent per task (default: {DEFAULT_RUNS})",
    )
    parser.add_argument(
        "--timeout", type=int, default=DEFAULT_TIMEOUT_PER_RUN,
        help=f"Timeout per run in seconds (default: {DEFAULT_TIMEOUT_PER_RUN})",
    )
    parser.add_argument(
        "--regenerate", type=Path, default=None,
        help="Regenerate summary from existing raw results directory",
    )
    parser.add_argument(
        "--codewhale-agent", default=CODEWHALE_AGENT,
        help="Harbor agent import path for CodeWhale",
    )
    parser.add_argument(
        "--codex-agent", default=CODEX_AGENT,
        help="Harbor agent import path for Codex",
    )
    args = parser.parse_args()
    # --------------- Regenerate mode ---------------
    if args.regenerate:
        results_dir = args.regenerate
        if not results_dir.exists():
            sys.exit(f"Error: results directory not found: {results_dir}")
        rows = regenerate(results_dir)
        print(generate_markdown_table(rows))
        return
    # --------------- Fresh run mode ---------------
    check_harbor()
    date_str = datetime.now().strftime("%Y%m%d")
    run_dir = REPO_ROOT / "benchmark_results" / f"cli-compare-{date_str}"
    if run_dir.exists():
        # Append run number if directory already exists
        suffix = 2
        while (run_dir := REPO_ROOT / "benchmark_results" /
               f"cli-compare-{date_str}-{suffix}").exists():
            suffix += 1
    run_dir.mkdir(parents=True, exist_ok=True)
    # Metadata
    meta = capture_metadata(args.model)
    meta["tasks"] = args.task
    meta["runs_per_task"] = args.runs
    (run_dir / "metadata.json").write_text(json.dumps(meta, indent=2))
    cw_dir = run_dir / "codewhale"
    cx_dir = run_dir / "codex"
    cw_dir.mkdir(parents=True, exist_ok=True)
    cx_dir.mkdir(parents=True, exist_ok=True)
    all_rows: list[dict[str, Any]] = []
    for task in args.task:
        for run_idx in range(args.runs):
            header = f"Task: {task}  Run: {run_idx+1}/{args.runs}"
            print(f"\n{'='*60}")
            print(header)
            print("=" * 60)
            # --- CodeWhale ---
            print("\n--- CodeWhale ---")
            cw_run_dir = cw_dir / task / f"run_{run_idx}"
            cw_result = run_harbor_single_task(
                task=task, model=args.model,
                agent_path=args.codewhale_agent,
                results_dir=cw_run_dir, timeout=args.timeout,
            )
            cw_row = parse_harbor_run(cw_run_dir, "codewhale")
            cw_row["task"] = task
            cw_row["run_idx"] = run_idx
            cw_row["runtime_s"] = cw_row["runtime_s"] or cw_result["runtime_s"]
            if cw_result["exception"]:
                cw_row["exception"] = cw_row["exception"] or cw_result["exception"]
            all_rows.append(cw_row)
            self_report(cw_row)
            # --- Codex ---
            print("\n--- Codex ---")
            cx_run_dir = cx_dir / task / f"run_{run_idx}"
            cx_result = run_harbor_single_task(
                task=task, model=args.model,
                agent_path=args.codex_agent,
                results_dir=cx_run_dir, timeout=args.timeout,
            )
            cx_row = parse_harbor_run(cx_run_dir, "codex")
            cx_row["task"] = task
            cx_row["run_idx"] = run_idx
            cx_row["runtime_s"] = cx_row["runtime_s"] or cx_result["runtime_s"]
            if cx_result["exception"]:
                cx_row["exception"] = cx_row["exception"] or cx_result["exception"]
            all_rows.append(cx_row)
            self_report(cx_row)
    # Write summaries
    summary_json = run_dir / "summary.json"
    summary_json.write_text(
        json.dumps(generate_json_summary(all_rows), indent=2)
    )
    print(f"\nSummary JSON: {summary_json}")
    md = generate_markdown_table(all_rows)
    (run_dir / "summary.md").write_text(md)
    print(f"Summary MD:   {run_dir / 'summary.md'}")
    print(f"Metadata:     {run_dir / 'metadata.json'}")
    print("\n" + md)
 def self_report(row: dict[str, Any]) -> None:
    """Print a one-line summary of a parsed run."""
    parts = [
        f"reward={row['reward']}" if row["reward"] is not None else "reward=null",
        f"input={row['input_tokens']}" if row["input_tokens"] is not None else "input=null",
        f"output={row['output_tokens']}" if row["output_tokens"] is not None else "output=null",
        f"cached={row['cached_tokens']}" if row["cached_tokens"] is not None else "",
        f"reasoning={row['reasoning_tokens']}" if row["reasoning_tokens"] is not None else "",
        f"answer_len={row['answer_len']}" if row["answer_len"] is not None else "",
        f"runtime={row['runtime_s']:.1f}s" if row["runtime_s"] is not None else "",
    ]
    print("  " + ", ".join(p for p in parts if p))
 if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
 """Harbor adapter for Codex CLI."""
 import json
 import os
 import shlex
 from pathlib import Path, PurePosixPath
 from typing import Any
 from harbor.agents.installed.base import (
    BaseInstalledAgent,
    CliFlag,
    with_prompt_template,
 )
 from harbor.environments.base import BaseEnvironment
 from harbor.models.agent.context import AgentContext
 class CodexAgent(BaseInstalledAgent):
    """Codex CLI agent adapter for Harbor."""
    _OUTPUT_FILENAME = "codex.txt"
    CLI_FLAGS = [
        CliFlag(
            "allowed-tools",
            cli="--allowed-tools",
            type="str",
            default="Bash,Read,Write,Edit,Glob,Grep",
        ),
    ]
    @staticmethod
    def name() -> str:
        return "codex"
    def version(self) -> str | None:
        return getattr(self, "_version", None)
    def get_version_command(self) -> str | None:
        return "codex --version 2>/dev/null || codex-cli --version 2>/dev/null"
    def parse_version(self, stdout: str) -> str:
        text = stdout.strip()
        for line in text.splitlines():
            line = line.strip()
            if line:
                for prefix in ("codex-cli ", "codex "):
                    if line.lower().startswith(prefix):
                        return line[len(prefix):]
                return line
        return text
    async def install(self, environment: BaseEnvironment) -> None:
        """Install Codex CLI in the container."""
        await self.exec_as_root(
            environment,
            command=(
                "if ldd --version 2>&1 | grep -qi musl || [ -f /etc/alpine-release ]; then"
                "  apk add --no-cache curl bash nodejs npm git ripgrep;"
                " elif command -v apt-get &>/dev/null; then"
                "  apt-get update && apt-get install -y curl git ripgrep;"
                " elif command -v yum &>/dev/null; then"
                "  yum install -y curl git ripgrep;"
                " fi"
            ),
            env={"DEBIAN_FRONTEND": "noninteractive"},
        )
        await self.exec_as_root(
            environment,
            command=(
                "if ! command -v node &>/dev/null; then"
                "  curl -fsSL https://deb.nodesource.com/setup_20.x | bash - &&"
                "  apt-get install -y nodejs;"
                " fi"
            ),
            env={"DEBIAN_FRONTEND": "noninteractive"},
        )
        await self.exec_as_agent(
            environment,
            command="npm install -g codex",
        )
    @with_prompt_template
    async def run(
        self,
        instruction: str,
        environment: BaseEnvironment,
        context: AgentContext,
    ) -> None:
        """Run Codex CLI in non-interactive exec mode."""
        escaped_instruction = shlex.quote(instruction)
        cli_flags = self.build_cli_flags()
        extra_flags = (cli_flags + " ") if cli_flags else ""
        model_flag = ""
        if self.model_name:
            model_flag = f"--model {shlex.quote(self.model_name)} "
        # Forward API keys
        env: dict[str, str] = {}
        for key in ("CODEX_API_KEY", "DEEPSEEK_API_KEY", "OPENAI_API_KEY",
                     "ANTHROPIC_API_KEY", "OPENROUTER_API_KEY"):
            val = os.environ.get(key, "")
            if val:
                env[key] = val
        output_path = f"/logs/agent/{self._OUTPUT_FILENAME}"
        await self.exec_as_agent(
            environment,
            command=(
                f"codex exec --yes "
                f"{model_flag}{extra_flags}"
                f"--workspace /workspace "
                f"{escaped_instruction} "
                f"2>&1 | tee {shlex.quote(output_path)}"
                f" || true"
            ),
            env=env if env else None,
        )
    def populate_context_post_run(self, context: AgentContext) -> None:
        pass