From 0cd8bcde1b268d7bb9bb829fe01a50496034732d Mon Sep 17 00:00:00 2001 From: idling11 <8055620+idling11@users.noreply.github.com> Date: Fri, 12 Jun 2026 01:15:00 -0700 Subject: [PATCH] feat(bench): add CLI comparison harness Harvest #3009 for the v0.8.59 release lane. Adds a paired Terminal-Bench harness for CodeWhale and Codex, a Codex Harbor adapter, generated-result ignore protection, and benchmark docs. Maintainer amendments keep explicit zero-valued metrics, regenerate parent task names, write refreshed summaries in regenerate mode, and allow transcript paths outside the repo. Fixes #2952. --- .gitignore | 2 + benchmark_results/.gitkeep | 0 docs/BENCHMARKS.md | 17 + scripts/benchmarks/README.md | 7 + scripts/benchmarks/cli-compare.py | 602 +++++++++++++++++++++++ scripts/benchmarks/harbor/codex_agent.py | 126 +++++ 6 files changed, 754 insertions(+) create mode 100644 benchmark_results/.gitkeep create mode 100755 scripts/benchmarks/cli-compare.py create mode 100755 scripts/benchmarks/harbor/codex_agent.py diff --git a/.gitignore b/.gitignore index ab1cf906..50ba4f49 100644 --- a/.gitignore +++ b/.gitignore @@ -123,4 +123,6 @@ scripts/run_deep_swe.py # Benchmark artifacts and caches re-included by !scripts/** results/ +benchmark_results/* +!benchmark_results/.gitkeep scripts/**/__pycache__/ diff --git a/benchmark_results/.gitkeep b/benchmark_results/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 390e2a02..41c1ccb9 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -103,6 +103,23 @@ harbor run \ --model deepseek/deepseek-chat ``` +### Compare CodeWhale and Codex + +Use the paired comparison harness when you need one normalized row per CLI for +the same task, model, timeout, and environment: + +```bash +python scripts/benchmarks/cli-compare.py \ + --task prove-plus-comm \ + --model deepseek/deepseek-chat \ + --runs 3 +``` + +The harness writes raw Harbor logs plus `summary.json`, `summary.md`, and +`metadata.json` under `benchmark_results/cli-compare-*`. Missing metrics are +reported as JSON `null`, and generated run directories are intentionally ignored +by git; keep only curated summaries in docs or release notes. + ## PinchBench PinchBench measures agent performance on real-world tasks — scheduling, email diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 600e4741..df0bd92e 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -17,6 +17,11 @@ export DEEPSEEK_API_KEY="sk-..." ./scripts/benchmarks/run-terminal-bench.sh \ --model deepseek/deepseek-chat +# CodeWhale vs Codex comparison rows +python scripts/benchmarks/cli-compare.py \ + --task prove-plus-comm \ + --model deepseek/deepseek-chat + # PinchBench (auto-install + run) ./scripts/benchmarks/run-pinchbench.sh \ --install \ @@ -28,8 +33,10 @@ export DEEPSEEK_API_KEY="sk-..." - `run-swebench.sh` — SWE-bench batch driver and evaluator - `run-terminal-bench.sh` — Terminal-Bench runner via Harbor - `run-pinchbench.sh` — PinchBench runner with auto-install +- `cli-compare.py` — CodeWhale/Codex Terminal-Bench comparison harness - `harbor/__init__.py` — Harbor adapter for CodeWhale (Python) - `harbor/codewhale_agent.py` — Adapter entry point +- `harbor/codex_agent.py` — Codex adapter for paired CLI comparisons ## Documentation diff --git a/scripts/benchmarks/cli-compare.py b/scripts/benchmarks/cli-compare.py new file mode 100755 index 00000000..27ddace5 --- /dev/null +++ b/scripts/benchmarks/cli-compare.py @@ -0,0 +1,602 @@ +#!/usr/bin/env python3 +""" +cli-compare.py - Run Terminal-Bench tasks through CodeWhale and Codex CLIs, +emit normalized token/performance comparison rows. + +Usage: + # Run default tasks + python scripts/benchmarks/cli-compare.py + + # Specific task and model + python scripts/benchmarks/cli-compare.py --task prove-plus-comm \\ + --model deepseek/deepseek-chat --runs 3 + + # Regenerate from existing run artifacts + python scripts/benchmarks/cli-compare.py \\ + --regenerate benchmark_results/cli-compare-20260609 + +Output (per run date): + benchmark_results/cli-compare-YYYYMMDD/ + summary.json - one row per agent, all fields normalized + summary.md - Markdown table suitable for release notes + metadata.json - versions, model, timestamp, platform + codewhale// - raw Harbor output + codex// - raw Harbor output + +Prerequisites: + pip install harbor + Docker running + DEEPSEEK_API_KEY set (for CodeWhale) + CODEX_API_KEY or equivalent set (for Codex) + +Field semantics (summary.json rows): + task str - Terminal-Bench task name + agent str - "codewhale" or "codex" + run_idx int - 0-based run index + reward float - pass/fail score (1.0 = pass) + runtime_s float - wall-clock seconds (null if not available) + exception str - raised exception text (null = clean finish) + input_tokens int - provider-reported input tokens + cached_tokens int - provider-reported cached input tokens (null if N/A) + output_tokens int - provider-reported output tokens + reasoning_tokens int - provider-reported reasoning tokens (null if N/A) + answer_len int - locally-derived visible final-answer character count + transcript_path str - relative path to raw agent output file + +All missing metrics are serialized as JSON ``null`` - never silently zeroed. +""" + +import argparse +import json +import os +import subprocess +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent.parent + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- + +DEFAULT_TASKS = [ + "prove-plus-comm", + "cancel-async-tasks", + "configure-git-webserver", + "fix-code-vulnerability", +] +DEFAULT_MODEL = "deepseek/deepseek-chat" +DEFAULT_TIMEOUT_PER_RUN = 900 # seconds (Harbor handles its own timeout internally) +DEFAULT_RUNS = 1 +HARBOR_DATASET = "terminal-bench@2.0" +CODEWHALE_AGENT = "scripts.benchmarks.harbor:CodeWhaleAgent" +CODEX_AGENT = "scripts.benchmarks.harbor.codex_agent:CodexAgent" + +# --------------------------------------------------------------------------- +# Harbor integration +# --------------------------------------------------------------------------- + + +def check_harbor() -> None: + """Verify Harbor is installed and Docker is running.""" + if subprocess.run(["which", "harbor"], capture_output=True).returncode != 0: + sys.exit("Error: 'harbor' not found. Install with: pip install harbor") + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + sys.exit("Error: Docker not running. Harbor requires Docker.") + + +def run_harbor_single_task( + task: str, + model: str, + agent_path: str, + results_dir: Path, + timeout: int, +) -> dict[str, Any]: + """Run a single Terminal-Bench task through Harbor. + + Harbor supports single-task runs with dataset colon syntax. + """ + dataset = f"{HARBOR_DATASET}:{task}" # Harbor colon-syntax for single task + results_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "harbor", "run", + "--dataset", dataset, + "--agent", agent_path, + "--model", model, + "--n-concurrent", "1", + "--results-dir", str(results_dir), + ] + + start = time.time() + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + cwd=REPO_ROOT, + ) + runtime_s = round(time.time() - start, 2) + except subprocess.TimeoutExpired: + runtime_s = round(time.time() - start, 2) + return { + "task": task, "model": model, "agent": agent_path, + "runtime_s": runtime_s, "exit_code": -1, + "exception": f"Timeout after {timeout}s", + "stdout": "", "stderr": "", "results_dir": str(results_dir), + } + + return { + "task": task, "model": model, "agent": agent_path, + "runtime_s": runtime_s, + "exit_code": proc.returncode, + "exception": None, + "stdout": proc.stdout, + "stderr": proc.stderr, + "results_dir": str(results_dir), + } + + +# --------------------------------------------------------------------------- +# Result parsing +# --------------------------------------------------------------------------- + + +def _try_int(val: Any) -> Optional[int]: + if val is None: + return None + try: + return int(val) + except (ValueError, TypeError): + return None + + +def _try_float(val: Any) -> Optional[float]: + if val is None: + return None + try: + return float(val) + except (ValueError, TypeError): + return None + + +def _first_present(mapping: dict[str, Any], *keys: str) -> Any: + for key in keys: + if key in mapping and mapping[key] is not None: + return mapping[key] + return None + + +def _stable_path(path: Path) -> str: + try: + return str(path.relative_to(REPO_ROOT)) + except ValueError: + return str(path) + + +def parse_token_jsonl(lines: list[str]) -> dict[str, Optional[int]]: + """Extract token usage from CodeWhale/Codex stream JSONL lines. + + CodeWhale emits ``{"type":"result","usage":{...}}`` at end-of-stream. + Codex may emit usage in closing messages or transcript footers. + """ + result: dict[str, Optional[int]] = { + "input_tokens": None, "cached_tokens": None, + "output_tokens": None, "reasoning_tokens": None, + } + if not lines: + return result + + for line in reversed(lines): # usage typically at the end + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + # Try regex extraction for non-JSON transcript lines + continue + + usage = obj.get("usage") or obj.get("token_usage") or {} + if isinstance(usage, dict): + if result["input_tokens"] is None: + result["input_tokens"] = _try_int( + _first_present(usage, "input_tokens", "prompt_tokens") + ) + if result["cached_tokens"] is None: + result["cached_tokens"] = _try_int( + _first_present( + usage, + "cached_input_tokens", + "cache_read_input_tokens", + "cached_tokens", + ) + ) + if result["output_tokens"] is None: + result["output_tokens"] = _try_int( + _first_present(usage, "output_tokens", "completion_tokens") + ) + if result["reasoning_tokens"] is None: + result["reasoning_tokens"] = _try_int( + _first_present( + usage, + "reasoning_tokens", + "thinking_tokens", + "reasoning_completion_tokens", + ) + ) + if all(v is not None for v in result.values()): + break + + return result + + +def extract_answer_len(text: str) -> Optional[int]: + """Heuristic: length of the last substantial text block that looks like an answer. + + Looks for the last non-code, non-log paragraph after the agent has finished + its tool-calling phase. Returns character count or None. + """ + if not text: + return None + # Agent outputs often have a "## Final Answer" or similar marker. + # Try to find the last answer section. + for marker in ("## Final Answer", "## Answer", "final answer", + "Here is the", "The solution"): + idx = text.rfind(marker) + if idx >= 0: + # Take text from marker to end, strip trailing shell logs + tail = text[idx:] + # Stop at next shell prompt or markdown separator + for term in ("```", "$ ", "# ", "/workspace"): + term_idx = tail.find(term, len(marker)) + if term_idx > 0: + tail = tail[:term_idx] + return len(tail.strip()) + + # Fallback: last paragraph that isn't code or a prompt + paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + for p in reversed(paragraphs): + if not p.startswith("```") and not p.startswith("$") and len(p) > 20: + return len(p) + + return len(text.strip()) if text.strip() else None + + +def parse_harbor_run(task_dir: Path, agent_name: str) -> dict[str, Any]: + """Parse Harbor results for a single task run. + + Harbor stores per-task output in: + / + results.json - Harbor's own eval summary + logs/agent/*.txt - raw agent transcript (if stdout captured) + """ + row: dict[str, Any] = { + "task": task_dir.name, + "agent": agent_name, + "reward": None, + "runtime_s": None, + "exception": None, + "input_tokens": None, + "cached_tokens": None, + "output_tokens": None, + "reasoning_tokens": None, + "answer_len": None, + "transcript_path": None, + } + + # 1. Harbor results.json - pass/fail and runtime + for candidate in sorted(task_dir.rglob("results.json")): + try: + data = json.loads(candidate.read_text()) + if isinstance(data, dict): + row["reward"] = _try_float(_first_present(data, "score", "reward")) + row["runtime_s"] = _try_float( + _first_present(data, "runtime", "duration") + ) + exc = data.get("exception") or data.get("error") + row["exception"] = str(exc) if exc else None + break + except (json.JSONDecodeError, OSError): + continue + + # 2. Agent transcript - token usage and answer + for txt_file in sorted(task_dir.rglob("*.txt")): + if txt_file.name.startswith("."): + continue + try: + text = txt_file.read_text(errors="ignore") + except OSError: + continue + if not text.strip(): + continue + + row["transcript_path"] = _stable_path(txt_file) + + tokens = parse_token_jsonl(text.split("\n")) + for key, value in tokens.items(): + if row[key] is None: + row[key] = value + + if row["answer_len"] is None: + row["answer_len"] = extract_answer_len(text) + break + + # 3. Harbor run metadata - runtime fallback + for meta_file in sorted(task_dir.rglob("run_metadata.json")): + try: + data = json.loads(meta_file.read_text()) + if isinstance(data, dict) and row["runtime_s"] is None: + row["runtime_s"] = _try_float(data.get("runtime_seconds")) + except (json.JSONDecodeError, OSError): + continue + + return row + + +# --------------------------------------------------------------------------- +# Summary generation +# --------------------------------------------------------------------------- + + +def generate_markdown_table(rows: list[dict[str, Any]]) -> str: + """Generate a Markdown comparison table from normalized rows.""" + if not rows: + return "*(no data)*\n" + + headers = [ + "task", "agent", "reward", "input_tokens", "cached_tokens", + "output_tokens", "reasoning_tokens", "runtime_s", "answer_len", + ] + + md = "| " + " | ".join(h.replace("_", " ") for h in headers) + " |\n" + md += "|" + "|".join(" ---: " for _ in headers) + "|\n" + + for row in rows: + cells: list[str] = [] + for h in headers: + val = row.get(h) + if val is None: + cells.append("null") + elif isinstance(val, float): + cells.append(f"{val:.2f}") + elif isinstance(val, int): + cells.append(f"{val:,}") + else: + cells.append(str(val)) + md += "| " + " | ".join(cells) + " |\n" + + return md + + +def generate_json_summary(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Return rows sorted by task, agent, run_idx.""" + return sorted( + rows, + key=lambda r: (r.get("task", ""), r.get("agent", ""), r.get("run_idx", 0)), + ) + + +# --------------------------------------------------------------------------- +# Regenerate from existing logs +# --------------------------------------------------------------------------- + + +def regenerate(results_dir: Path) -> list[dict[str, Any]]: + """Walk existing run directory and rebuild normalized rows.""" + rows: list[dict[str, Any]] = [] + for agent_dir in sorted(results_dir.iterdir()): + if not agent_dir.is_dir() or agent_dir.name.startswith("."): + continue + agent_name = agent_dir.name + for task_dir in sorted(agent_dir.iterdir()): + if not task_dir.is_dir(): + continue + # Check for per-run subdirectories + subdirs = [d for d in task_dir.iterdir() if d.is_dir()] + if subdirs and all(d.name.startswith("run_") for d in subdirs): + for run_dir in sorted(subdirs): + row = parse_harbor_run(run_dir, agent_name) + row["task"] = task_dir.name + try: + row["run_idx"] = int(run_dir.name.split("_")[-1]) + except (ValueError, IndexError): + row["run_idx"] = 0 + rows.append(row) + else: + row = parse_harbor_run(task_dir, agent_name) + row["task"] = task_dir.name + row["run_idx"] = 0 + rows.append(row) + return rows + + +# --------------------------------------------------------------------------- +# Metadata capture +# --------------------------------------------------------------------------- + + +def capture_metadata(model: str) -> dict[str, Any]: + """Capture environment metadata for reproducibility.""" + meta: dict[str, Any] = { + "timestamp_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "platform": os.uname().sysname + "/" + os.uname().machine, + "model": model, + "dataset": HARBOR_DATASET, + } + # CodeWhale version + r = subprocess.run(["codewhale", "--version"], capture_output=True, text=True) + if r.returncode == 0: + meta["codewhale_version"] = r.stdout.strip() + # Codex version + r = subprocess.run(["codex", "--version"], capture_output=True, text=True) + if r.returncode == 0: + meta["codex_version"] = r.stdout.strip() + # Harbor version + r = subprocess.run(["harbor", "--version"], capture_output=True, text=True) + if r.returncode == 0: + meta["harbor_version"] = r.stdout.strip() + # Git commit + r = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, text=True, cwd=REPO_ROOT, + ) + if r.returncode == 0: + meta["git_commit"] = r.stdout.strip()[:12] + return meta + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + description="CodeWhale vs Codex CLI token comparison harness", + ) + parser.add_argument( + "--task", nargs="+", default=DEFAULT_TASKS, + help=f"Terminal-Bench task names (default: {' '.join(DEFAULT_TASKS)})", + ) + parser.add_argument( + "--model", default=DEFAULT_MODEL, + help=f"Model in provider/name format (default: {DEFAULT_MODEL})", + ) + parser.add_argument( + "--runs", type=int, default=DEFAULT_RUNS, + help=f"Number of runs per agent per task (default: {DEFAULT_RUNS})", + ) + parser.add_argument( + "--timeout", type=int, default=DEFAULT_TIMEOUT_PER_RUN, + help=f"Timeout per run in seconds (default: {DEFAULT_TIMEOUT_PER_RUN})", + ) + parser.add_argument( + "--regenerate", type=Path, default=None, + help="Regenerate summary from existing raw results directory", + ) + parser.add_argument( + "--codewhale-agent", default=CODEWHALE_AGENT, + help="Harbor agent import path for CodeWhale", + ) + parser.add_argument( + "--codex-agent", default=CODEX_AGENT, + help="Harbor agent import path for Codex", + ) + args = parser.parse_args() + + # --------------- Regenerate mode --------------- + if args.regenerate: + results_dir = args.regenerate + if not results_dir.exists(): + sys.exit(f"Error: results directory not found: {results_dir}") + rows = regenerate(results_dir) + summary_rows = generate_json_summary(rows) + (results_dir / "summary.json").write_text(json.dumps(summary_rows, indent=2)) + md = generate_markdown_table(summary_rows) + (results_dir / "summary.md").write_text(md) + print(md) + return + + # --------------- Fresh run mode --------------- + check_harbor() + + date_str = datetime.now().strftime("%Y%m%d") + run_dir = REPO_ROOT / "benchmark_results" / f"cli-compare-{date_str}" + if run_dir.exists(): + # Append run number if directory already exists + suffix = 2 + while (run_dir := REPO_ROOT / "benchmark_results" / + f"cli-compare-{date_str}-{suffix}").exists(): + suffix += 1 + run_dir.mkdir(parents=True, exist_ok=True) + + # Metadata + meta = capture_metadata(args.model) + meta["tasks"] = args.task + meta["runs_per_task"] = args.runs + (run_dir / "metadata.json").write_text(json.dumps(meta, indent=2)) + + cw_dir = run_dir / "codewhale" + cx_dir = run_dir / "codex" + cw_dir.mkdir(parents=True, exist_ok=True) + cx_dir.mkdir(parents=True, exist_ok=True) + + all_rows: list[dict[str, Any]] = [] + + for task in args.task: + for run_idx in range(args.runs): + header = f"Task: {task} Run: {run_idx+1}/{args.runs}" + print(f"\n{'='*60}") + print(header) + print("=" * 60) + + print("\n--- CodeWhale ---") + cw_run_dir = cw_dir / task / f"run_{run_idx}" + cw_result = run_harbor_single_task( + task=task, model=args.model, + agent_path=args.codewhale_agent, + results_dir=cw_run_dir, timeout=args.timeout, + ) + cw_row = parse_harbor_run(cw_run_dir, "codewhale") + cw_row["task"] = task + cw_row["run_idx"] = run_idx + if cw_row["runtime_s"] is None: + cw_row["runtime_s"] = cw_result["runtime_s"] + if cw_result["exception"]: + cw_row["exception"] = cw_row["exception"] or cw_result["exception"] + all_rows.append(cw_row) + self_report(cw_row) + + print("\n--- Codex ---") + cx_run_dir = cx_dir / task / f"run_{run_idx}" + cx_result = run_harbor_single_task( + task=task, model=args.model, + agent_path=args.codex_agent, + results_dir=cx_run_dir, timeout=args.timeout, + ) + cx_row = parse_harbor_run(cx_run_dir, "codex") + cx_row["task"] = task + cx_row["run_idx"] = run_idx + if cx_row["runtime_s"] is None: + cx_row["runtime_s"] = cx_result["runtime_s"] + if cx_result["exception"]: + cx_row["exception"] = cx_row["exception"] or cx_result["exception"] + all_rows.append(cx_row) + self_report(cx_row) + + # Write summaries + summary_json = run_dir / "summary.json" + summary_json.write_text( + json.dumps(generate_json_summary(all_rows), indent=2) + ) + print(f"\nSummary JSON: {summary_json}") + + md = generate_markdown_table(all_rows) + (run_dir / "summary.md").write_text(md) + print(f"Summary MD: {run_dir / 'summary.md'}") + print(f"Metadata: {run_dir / 'metadata.json'}") + print("\n" + md) + + +def self_report(row: dict[str, Any]) -> None: + """Print a one-line summary of a parsed run.""" + parts = [ + f"reward={row['reward']}" if row["reward"] is not None else "reward=null", + f"input={row['input_tokens']}" if row["input_tokens"] is not None else "input=null", + f"output={row['output_tokens']}" if row["output_tokens"] is not None else "output=null", + f"cached={row['cached_tokens']}" if row["cached_tokens"] is not None else "", + f"reasoning={row['reasoning_tokens']}" if row["reasoning_tokens"] is not None else "", + f"answer_len={row['answer_len']}" if row["answer_len"] is not None else "", + f"runtime={row['runtime_s']:.1f}s" if row["runtime_s"] is not None else "", + ] + print(" " + ", ".join(p for p in parts if p)) + + +if __name__ == "__main__": + main() diff --git a/scripts/benchmarks/harbor/codex_agent.py b/scripts/benchmarks/harbor/codex_agent.py new file mode 100755 index 00000000..abae2104 --- /dev/null +++ b/scripts/benchmarks/harbor/codex_agent.py @@ -0,0 +1,126 @@ +"""Harbor adapter for Codex CLI.""" + +import json +import os +import shlex +from pathlib import Path, PurePosixPath +from typing import Any + +from harbor.agents.installed.base import ( + BaseInstalledAgent, + CliFlag, + with_prompt_template, +) +from harbor.environments.base import BaseEnvironment +from harbor.models.agent.context import AgentContext + + +class CodexAgent(BaseInstalledAgent): + """Codex CLI agent adapter for Harbor.""" + + _OUTPUT_FILENAME = "codex.txt" + + CLI_FLAGS = [ + CliFlag( + "allowed-tools", + cli="--allowed-tools", + type="str", + default="Bash,Read,Write,Edit,Glob,Grep", + ), + ] + + @staticmethod + def name() -> str: + return "codex" + + def version(self) -> str | None: + return getattr(self, "_version", None) + + def get_version_command(self) -> str | None: + return "codex --version 2>/dev/null || codex-cli --version 2>/dev/null" + + def parse_version(self, stdout: str) -> str: + text = stdout.strip() + for line in text.splitlines(): + line = line.strip() + if line: + for prefix in ("codex-cli ", "codex "): + if line.lower().startswith(prefix): + return line[len(prefix):] + return line + return text + + async def install(self, environment: BaseEnvironment) -> None: + """Install Codex CLI in the container.""" + await self.exec_as_root( + environment, + command=( + "if ldd --version 2>&1 | grep -qi musl || [ -f /etc/alpine-release ]; then" + " apk add --no-cache curl bash nodejs npm git ripgrep;" + " elif command -v apt-get &>/dev/null; then" + " apt-get update && apt-get install -y curl git ripgrep;" + " elif command -v yum &>/dev/null; then" + " yum install -y curl git ripgrep;" + " fi" + ), + env={"DEBIAN_FRONTEND": "noninteractive"}, + ) + + await self.exec_as_root( + environment, + command=( + "if ! command -v node &>/dev/null; then" + " curl -fsSL https://deb.nodesource.com/setup_20.x | bash - &&" + " apt-get install -y nodejs;" + " fi" + ), + env={"DEBIAN_FRONTEND": "noninteractive"}, + ) + + await self.exec_as_agent( + environment, + command="npm install -g codex", + ) + + @with_prompt_template + async def run( + self, + instruction: str, + environment: BaseEnvironment, + context: AgentContext, + ) -> None: + """Run Codex CLI in non-interactive exec mode.""" + escaped_instruction = shlex.quote(instruction) + + cli_flags = self.build_cli_flags() + extra_flags = (cli_flags + " ") if cli_flags else "" + + model_flag = "" + if self.model_name: + model_flag = f"--model {shlex.quote(self.model_name)} " + + # Forward API keys + env: dict[str, str] = {} + for key in ("CODEX_API_KEY", "DEEPSEEK_API_KEY", "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", "OPENROUTER_API_KEY"): + val = os.environ.get(key, "") + if val: + env[key] = val + + output_path = f"/logs/agent/{self._OUTPUT_FILENAME}" + + await self.exec_as_agent( + environment, + command=( + f"codex exec --yes " + f"{model_flag}{extra_flags}" + f"--workspace /workspace " + f"{escaped_instruction} " + f"2>&1 | tee {shlex.quote(output_path)}" + f" || true" + ), + env=env if env else None, + ) + + def populate_context_post_run(self, context: AgentContext) -> None: + pass