From b329a532f52f0bedf7a4903309f46d6d5fd550e0 Mon Sep 17 00:00:00 2001 From: Hunter B Date: Thu, 4 Jun 2026 19:21:23 -0700 Subject: [PATCH] feat(benchmarks): add SWE-bench, Terminal-Bench, and PinchBench integration Benchmark harness for evaluating CodeWhale against three external benchmarks: - SWE-bench: batch driver wrapping existing codewhale swebench commands - Terminal-Bench: Harbor adapter (BaseInstalledAgent) for container eval - PinchBench: runner with auto-install for real-world agent tasks Includes docs/BENCHMARKS.md umbrella doc with setup, usage, and reproducibility checklist. Scripts record version/commit/timestamp metadata for each run. Branch: codex/v0.8.53-benchmarks (based on v0.8.53) --- docs/BENCHMARKS.md | 153 ++++++++++++++++ scripts/benchmarks/README.md | 37 ++++ scripts/benchmarks/harbor/__init__.py | 175 +++++++++++++++++++ scripts/benchmarks/harbor/codewhale_agent.py | 4 + scripts/benchmarks/run-pinchbench.sh | 149 ++++++++++++++++ scripts/benchmarks/run-swebench.sh | 161 +++++++++++++++++ scripts/benchmarks/run-terminal-bench.sh | 113 ++++++++++++ 7 files changed, 792 insertions(+) create mode 100644 docs/BENCHMARKS.md create mode 100644 scripts/benchmarks/README.md create mode 100644 scripts/benchmarks/harbor/__init__.py create mode 100644 scripts/benchmarks/harbor/codewhale_agent.py create mode 100755 scripts/benchmarks/run-pinchbench.sh create mode 100755 scripts/benchmarks/run-swebench.sh create mode 100755 scripts/benchmarks/run-terminal-bench.sh diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md new file mode 100644 index 00000000..0d7f0e5f --- /dev/null +++ b/docs/BENCHMARKS.md @@ -0,0 +1,153 @@ +# Benchmarks + +CodeWhale integrates with three external benchmarks to measure real-world +coding-agent performance. 
Each benchmark tests a different surface: + +| Benchmark | What it tests | Harness | Output format | +|---|---|---|---| +| **SWE-bench** | Patch generation from GitHub issues | CodeWhale built-in (`codewhale swebench`) | `all_preds.jsonl` | +| **Terminal-Bench** | End-to-end terminal tasks (compile, deploy, configure) | Harbor framework adapter | Harbor result JSON | +| **PinchBench** | Real-world agent tasks (calendar, email, coding, research) | Standalone runner via OpenClaw-compatible adapter | PinchBench result JSON | + +All three require Docker. SWE-bench and Terminal-Bench also need the official +evaluation harness installed separately. + +## Prerequisites + +```bash +# Docker (all benchmarks) +docker --version + +# Python 3.10+ with uv (Terminal-Bench, PinchBench, SWE-bench eval) +python3 --version +uv --version + +# CodeWhale v0.8.53+ +codewhale --version + +# API key +export DEEPSEEK_API_KEY="sk-..." +``` + +## SWE-bench + +CodeWhale has built-in SWE-bench support via `codewhale swebench run` and +`codewhale swebench export`. See [docs/SWEBENCH.md](SWEBENCH.md) for the +single-instance workflow. + +### Batch run + +```bash +# Batch mode: currently prints the manual batch workflow (automated runs are not implemented yet) +./scripts/benchmarks/run-swebench.sh \ + --dataset princeton-nlp/SWE-bench_Lite \ + --split test \ + --predictions-path ./results/swebench_preds.jsonl + +# Run a single instance (its repo must already exist at /tmp/swebench-workspaces/django__django-12345) +./scripts/benchmarks/run-swebench.sh \ + --instance-id django__django-12345 \ + --issue-file ./issue.md \ + --predictions-path ./results/swebench_preds.jsonl +``` + +### Evaluate + +```bash +python -m swebench.harness.run_evaluation \ + --dataset_name princeton-nlp/SWE-bench_Lite \ + --predictions_path ./results/swebench_preds.jsonl \ + --max_workers 1 \ + --run_id codewhale-v0.8.53 +``` + +## Terminal-Bench (via Harbor) + +Terminal-Bench tests agents on real terminal tasks — compiling, deploying, +configuring servers, training models. 
The [Harbor framework](https://github.com/harbor-framework/harbor) +is the official harness. + +CodeWhale plugs in via a Harbor adapter (`scripts/benchmarks/harbor/codewhale_agent.py`). + +### Setup + +```bash +pip install harbor +``` + +### Run + +```bash +# Via the convenience script +./scripts/benchmarks/run-terminal-bench.sh \ + --dataset terminal-bench@2.0 \ + --model deepseek/deepseek-chat \ + --n-concurrent 4 + +# Or directly with harbor (only works once `codewhale` is registered in Harbor's AgentName enum) +harbor run \ + --dataset terminal-bench@2.0 \ + --agent codewhale \ + --model deepseek/deepseek-chat \ + --n-concurrent 4 +``` + +### Custom agent path + +If the agent name is not registered in Harbor, pass the adapter's import path instead (run from the repository root so `scripts` is importable): + +```bash +harbor run \ + --dataset terminal-bench@2.0 \ + --agent-import-path scripts.benchmarks.harbor.codewhale_agent:CodeWhaleAgent \ + --model deepseek/deepseek-chat +``` + +## PinchBench + +PinchBench measures agent performance on real-world tasks — scheduling, email +triage, code generation, research, file management. It uses OpenClaw as the +agent runtime. + +### Setup + +```bash +git clone https://github.com/pinchbench/skill.git /tmp/pinchbench +cd /tmp/pinchbench +uv venv && source .venv/bin/activate +uv pip install -e . +``` + +### Run + +```bash +# Via the convenience script +./scripts/benchmarks/run-pinchbench.sh \ + --model deepseek/deepseek-chat \ + --suite all + +# Or directly +cd /tmp/pinchbench && ./scripts/run.sh \ + --model deepseek/deepseek-chat \ + --suite all +``` + +## Reproducibility checklist + +When publishing benchmark results, record: + +- [ ] CodeWhale version: `codewhale --version` +- [ ] Git commit: `git rev-parse HEAD` +- [ ] Model and provider (e.g. 
`deepseek/deepseek-chat`) +- [ ] Benchmark dataset and version +- [ ] Docker platform (`linux/amd64` vs `linux/arm64`) +- [ ] Worker concurrency +- [ ] Timestamp (UTC) +- [ ] Full result file (`all_preds.jsonl`, Harbor result dir, or PinchBench results JSON) + +## References + +- SWE-bench: https://github.com/SWE-bench/SWE-bench +- Terminal-Bench: https://github.com/laude-institute/terminal-bench / https://www.tbench.ai +- Harbor: https://github.com/harbor-framework/harbor / https://harborframework.com +- PinchBench: https://github.com/pinchbench/skill / https://pinchbench.com diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md new file mode 100644 index 00000000..600e4741 --- /dev/null +++ b/scripts/benchmarks/README.md @@ -0,0 +1,37 @@ +# Benchmark Scripts + +Convenience runners for evaluating CodeWhale against external benchmarks. + +## Quick Start + +```bash +# Set your API key +export DEEPSEEK_API_KEY="sk-..." + +# SWE-bench (single instance) +./scripts/benchmarks/run-swebench.sh \ + --instance-id django__django-12345 \ + --issue-file ./issue.md + +# Terminal-Bench (via Harbor) +./scripts/benchmarks/run-terminal-bench.sh \ + --model deepseek/deepseek-chat + +# PinchBench (auto-install + run) +./scripts/benchmarks/run-pinchbench.sh \ + --install \ + --model deepseek/deepseek-chat +``` + +## Files + +- `run-swebench.sh` — SWE-bench batch driver and evaluator +- `run-terminal-bench.sh` — Terminal-Bench runner via Harbor +- `run-pinchbench.sh` — PinchBench runner with auto-install +- `harbor/__init__.py` — Harbor adapter for CodeWhale (Python) +- `harbor/codewhale_agent.py` — Adapter entry point + +## Documentation + +See [docs/BENCHMARKS.md](../../docs/BENCHMARKS.md) for full setup instructions, +reproducibility checklists, and references. 
diff --git a/scripts/benchmarks/harbor/__init__.py b/scripts/benchmarks/harbor/__init__.py
new file mode 100644
index 00000000..3bde0431
--- /dev/null
+++ b/scripts/benchmarks/harbor/__init__.py
@@ -0,0 +1,205 @@
+"""
+Harbor adapter for CodeWhale.
+
+Lets Harbor evaluate CodeWhale as an agent on Terminal-Bench and other
+Harbor-compatible datasets.
+
+Usage (after pip install harbor):
+
+    harbor run \\
+        --dataset terminal-bench@2.0 \\
+        --agent-import-path scripts.benchmarks.harbor.codewhale_agent:CodeWhaleAgent \\
+        --model deepseek/deepseek-chat
+
+Run it from the repository root (or put the root on PYTHONPATH) so the
+``scripts`` package is importable. Alternatively, register the agent name in
+Harbor's AgentName enum and pass it via ``--agent`` for shorter invocations.
+"""
+
+import json
+import os
+import shlex
+from pathlib import Path, PurePosixPath
+from typing import Any
+
+from harbor.agents.installed.base import (
+    BaseInstalledAgent,
+    CliFlag,
+    with_prompt_template,
+)
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+class CodeWhaleAgent(BaseInstalledAgent):
+    """
+    CodeWhale agent adapter for Harbor.
+
+    Installs the ``codewhale`` CLI via npm into the task container and runs
+    tasks in non-interactive exec mode with full tool access.
+    """
+
+    _OUTPUT_FILENAME = "codewhale.txt"
+
+    # Host environment variables forwarded into the task container when set.
+    _FORWARDED_ENV_KEYS = (
+        "DEEPSEEK_API_KEY",
+        "OPENROUTER_API_KEY",  # fallback provider
+        "OPENAI_API_KEY",  # generic OpenAI-compatible endpoints
+    )
+
+    CLI_FLAGS = [
+        CliFlag(
+            "max_subagents",
+            cli="--max-subagents",
+            type="int",
+            default=4,
+        ),
+        CliFlag(
+            "thinking",
+            cli="--thinking",
+            type="str",
+            default="high",
+        ),
+    ]
+
+    @staticmethod
+    def name() -> str:
+        """Return the agent's name, ``codewhale``."""
+        return "codewhale"
+
+    def version(self) -> str | None:
+        """Return the ``_version`` attribute, or None if it was never set."""
+        return getattr(self, "_version", None)
+
+    def get_version_command(self) -> str | None:
+        """Return the shell command that prints the installed CLI version."""
+        return "codewhale --version 2>/dev/null || codewhale-tui --version 2>/dev/null"
+
+    def parse_version(self, stdout: str) -> str:
+        """Extract the version from the first non-empty line of ``stdout``.
+
+        A leading ``codewhale-tui ``, ``codewhale-cli `` or ``codewhale `` prefix
+        (matched case-insensitively) is dropped; otherwise the line is returned
+        as-is. Empty output yields an empty string.
+        """
+        text = stdout.strip()
+        for line in text.splitlines():
+            line = line.strip()
+            if line:
+                # Strip any prefix like "codewhale " or "codewhale-cli "
+                for prefix in ("codewhale-tui ", "codewhale-cli ", "codewhale "):
+                    if line.lower().startswith(prefix):
+                        return line[len(prefix):]
+                return line
+        return text
+
+    async def install(self, environment: BaseEnvironment) -> None:
+        """Install CodeWhale and its prerequisites in the task container.
+
+        Runs three steps: system packages via apk/apt-get/yum (whichever is
+        detected), Node.js 20 from NodeSource when ``node`` is still missing,
+        and finally ``npm install -g codewhale``.
+        """
+        # Install system dependencies. Plain ``>/dev/null 2>&1`` is used instead
+        # of ``&>`` so the checks also work when the container shell is not bash.
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if ldd --version 2>&1 | grep -qi musl || [ -f /etc/alpine-release ]; then"
+                " apk add --no-cache curl bash nodejs npm git ripgrep;"
+                " elif command -v apt-get >/dev/null 2>&1; then"
+                " apt-get update && apt-get install -y curl git ripgrep;"
+                " elif command -v yum >/dev/null 2>&1; then"
+                " yum install -y curl git ripgrep;"
+                " fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+        )
+
+        # Install Node.js if not present (some images lack it). Pick the
+        # NodeSource setup script that matches the package manager; the deb
+        # script only works on apt-based images.
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if ! command -v node >/dev/null 2>&1; then"
+                " if command -v apt-get >/dev/null 2>&1; then"
+                " curl -fsSL https://deb.nodesource.com/setup_20.x | bash - &&"
+                " apt-get install -y nodejs;"
+                " elif command -v yum >/dev/null 2>&1; then"
+                " curl -fsSL https://rpm.nodesource.com/setup_20.x | bash - &&"
+                " yum install -y nodejs;"
+                " else"
+                " echo 'codewhale: cannot install Node.js: no apt-get or yum found' >&2;"
+                " exit 1;"
+                " fi;"
+                " fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+        )
+
+        # Install CodeWhale CLI via npm
+        await self.exec_as_agent(
+            environment,
+            command="npm install -g codewhale",
+        )
+
+    @with_prompt_template
+    async def run(
+        self,
+        instruction: str,
+        environment: BaseEnvironment,
+        context: AgentContext,
+    ) -> None:
+        """Run CodeWhale in non-interactive exec mode on the task.
+
+        The instruction is shell-quoted and passed as the final argument; any
+        provider API keys present in the host environment are forwarded, and
+        combined stdout/stderr is tee'd to ``/logs/agent/codewhale.txt``.
+        """
+        escaped_instruction = shlex.quote(instruction)
+
+        # Build CLI flags from agent config
+        cli_flags = self.build_cli_flags()
+        extra_flags = (cli_flags + " ") if cli_flags else ""
+
+        # Forward only the API keys that are actually set (non-empty).
+        env: dict[str, str] = {
+            key: value
+            for key in self._FORWARDED_ENV_KEYS
+            if (value := os.environ.get(key, ""))
+        }
+
+        # Build model flag if model_name is provided
+        model_flag = ""
+        if self.model_name:
+            # Harbor passes model as "provider/model"; CodeWhale uses --model
+            model_flag = f"--model {shlex.quote(self.model_name)} "
+
+        output_path = f"/logs/agent/{self._OUTPUT_FILENAME}"
+
+        # Run CodeWhale in non-interactive YOLO exec mode
+        # --yolo enables full tool access (auto-approved)
+        # --auto runs non-interactively and exits when done
+        # --stream-json gives us structured output for trajectory parsing
+        # NOTE(review): the pipeline's exit status is tee's unless Harbor enables
+        # pipefail for agent commands; confirm a failing codewhale run is detected.
+        await self.exec_as_agent(
+            environment,
+            command=(
+                f"codewhale exec --yolo --auto --stream-json "
+                f"{model_flag}{extra_flags}"
+                f"--workspace /workspace "
+                f"{escaped_instruction} "
+                f"2>&1 | tee {shlex.quote(output_path)}"
+            ),
+            env=env if env else None,
+        )
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        """Parse CodeWhale's output for any post-run metadata."""
+        # CodeWhale writes its results to the working tree as git diffs.
+        # Harbor's eval harness inspects the workspace directly, so no
+        # special trajectory parsing is needed for basic eval.
+        pass
diff --git a/scripts/benchmarks/harbor/codewhale_agent.py b/scripts/benchmarks/harbor/codewhale_agent.py
new file mode 100644
index 00000000..4a623d96
--- /dev/null
+++ b/scripts/benchmarks/harbor/codewhale_agent.py
@@ -0,0 +1,4 @@
+"""Harbor adapter entry point for CodeWhale."""
+from scripts.benchmarks.harbor import CodeWhaleAgent  # noqa: F401
+
+__all__ = ["CodeWhaleAgent"]
diff --git a/scripts/benchmarks/run-pinchbench.sh b/scripts/benchmarks/run-pinchbench.sh
new file mode 100755
index 00000000..e86740ad
--- /dev/null
+++ b/scripts/benchmarks/run-pinchbench.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# run-pinchbench.sh — Run CodeWhale through PinchBench.
+#
+# PinchBench evaluates agent performance on real-world tasks. It normally
+# targets OpenClaw, but this script adapts the workflow for CodeWhale by
+# leveraging the OpenRouter-compatible model routing.
+#
+# Usage:
+#   ./scripts/benchmarks/run-pinchbench.sh --help
+#   ./scripts/benchmarks/run-pinchbench.sh --model deepseek/deepseek-chat
+#
+# Prerequisites:
+#   - PinchBench cloned (or install via this script)
+#   - Python 3.10+ with uv
+#   - OPENROUTER_API_KEY or DEEPSEEK_API_KEY set
+#   - A running OpenClaw instance (PinchBench's default runtime)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.."
&& pwd)"
+
+# Defaults
+MODEL="deepseek/deepseek-chat"
+SUITE="all"
+PINCHBENCH_DIR="${PINCHBENCH_DIR:-/tmp/pinchbench}"
+RESULTS_DIR="./results/pinchbench"
+INSTALL_PINCHBENCH=false
+RUNS=1
+JUDGE_MODEL=""
+NO_UPLOAD=true
+EXTRA_ARGS=()
+
+usage() {
+ cat <&2; usage >&2; exit 1 ;;
+ esac
+done
+
+# Resolve the results directory against the caller's working directory now.
+# The script cd's into $PINCHBENCH_DIR below, which would otherwise re-root a
+# relative path (including the ./results/pinchbench default) inside the clone.
+mkdir -p "$RESULTS_DIR"
+RESULTS_DIR="$(cd "$RESULTS_DIR" && pwd)"
+
+# Install PinchBench if requested (or if the checkout is missing)
+if [[ "$INSTALL_PINCHBENCH" == true || ! -d "$PINCHBENCH_DIR" ]]; then
+  echo "Installing PinchBench to $PINCHBENCH_DIR ..."
+  if [[ -d "$PINCHBENCH_DIR" ]]; then
+    git -C "$PINCHBENCH_DIR" pull
+  else
+    git clone https://github.com/pinchbench/skill.git "$PINCHBENCH_DIR"
+  fi
+  # Subshell: keep the caller's cwd so a relative $PINCHBENCH_DIR still
+  # resolves for the cd further down.
+  (
+    cd "$PINCHBENCH_DIR"
+    # Create the venv only when missing so real uv failures are not hidden.
+    [[ -d .venv ]] || uv venv .venv
+    # shellcheck disable=SC1091
+    source .venv/bin/activate
+    uv pip install -e .
+  )
+fi
+
+# Verify PinchBench is available
+if [[ ! -d "$PINCHBENCH_DIR" ]]; then
+  echo "Error: PinchBench not found at $PINCHBENCH_DIR" >&2
+  echo "Run with --install to clone it automatically." >&2
+  exit 1
+fi
+
+cd "$PINCHBENCH_DIR"
+
+# Activate venv if it exists
+if [[ -f ".venv/bin/activate" ]]; then
+  # shellcheck disable=SC1091
+  source .venv/bin/activate
+fi
+
+# Record metadata
+METADATA_FILE="$RESULTS_DIR/run_metadata.json"
+cat > "$METADATA_FILE" </dev/null || echo unknown)",
+ "git_commit": "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)",
+ "pinchbench_commit": "$(git rev-parse HEAD 2>/dev/null || echo unknown)",
+ "model": "$MODEL",
+ "suite": "$SUITE",
+ "runs": $RUNS,
+ "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
+ "platform": "$(uname -s)/$(uname -m)"
+}
+META
+echo "Run metadata: $METADATA_FILE"
+
+# Build PinchBench command
+PB_ARGS=("--model" "$MODEL" "--suite" "$SUITE" "--runs" "$RUNS" "--output-dir" "$RESULTS_DIR")
+
+if [[ -n "$JUDGE_MODEL" ]]; then
+  PB_ARGS+=("--judge" "$JUDGE_MODEL")
+fi
+
+if [[ "$NO_UPLOAD" == true ]]; then
+  PB_ARGS+=("--no-upload")
+fi
+
+# The ${arr[@]+...} form keeps an empty EXTRA_ARGS from tripping `set -u` on
+# bash older than 4.4.
+PB_ARGS+=(${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})
+
+echo "Running PinchBench..."
+echo " Model: $MODEL"
+echo " Suite: $SUITE"
+echo " Runs: $RUNS"
+echo " Output: $RESULTS_DIR"
+echo ""
+
+./scripts/run.sh "${PB_ARGS[@]}"
+
+echo ""
+echo "Results written to $RESULTS_DIR"
diff --git a/scripts/benchmarks/run-swebench.sh b/scripts/benchmarks/run-swebench.sh
new file mode 100755
index 00000000..0ffbbe6a
--- /dev/null
+++ b/scripts/benchmarks/run-swebench.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+# run-swebench.sh — Batch driver for CodeWhale SWE-bench runs.
+#
+# Usage:
+#   ./scripts/benchmarks/run-swebench.sh --help
+#   ./scripts/benchmarks/run-swebench.sh --dataset princeton-nlp/SWE-bench_Lite --split test
+#   ./scripts/benchmarks/run-swebench.sh --instance-id django__django-12345 --issue-file issue.md
+#
+# Prerequisites:
+#   - codewhale installed and on PATH
+#   - DEEPSEEK_API_KEY set (or appropriate provider key)
+#   - swebench pip package installed (for evaluation step)
+#   - Docker running (for evaluation step)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.."
&& pwd)"
+
+# Defaults
+DATASET=""
+SPLIT="test"
+INSTANCE_ID=""
+ISSUE_FILE=""
+PREDICTIONS_PATH="./results/swebench_preds.jsonl"
+MODEL=""
+WORKSPACE_BASE="/tmp/swebench-workspaces"
+EVAL_ONLY=false
+MAX_WORKERS=1
+
+usage() {
+ cat <&2; usage >&2; exit 1 ;;
+ esac
+done
+
+mkdir -p "$(dirname "$PREDICTIONS_PATH")" "$WORKSPACE_BASE"
+
+# Make caller-relative paths absolute before run_single_instance cd's into the
+# instance workspace. Otherwise predictions are written inside the workspace
+# and a relative --issue-file is silently not found.
+PREDICTIONS_PATH="$(cd "$(dirname "$PREDICTIONS_PATH")" && pwd)/$(basename "$PREDICTIONS_PATH")"
+WORKSPACE_BASE="$(cd "$WORKSPACE_BASE" && pwd)"
+if [[ -n "$ISSUE_FILE" ]]; then
+  if [[ ! -f "$ISSUE_FILE" ]]; then
+    echo "Error: issue file not found: $ISSUE_FILE" >&2
+    exit 1
+  fi
+  ISSUE_FILE="$(cd "$(dirname "$ISSUE_FILE")" && pwd)/$(basename "$ISSUE_FILE")"
+fi
+
+# Record run metadata
+METADATA_FILE="$(dirname "$PREDICTIONS_PATH")/run_metadata.json"
+cat > "$METADATA_FILE" </dev/null || echo unknown)",
+ "git_commit": "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)",
+ "model": "${MODEL:-default}",
+ "dataset": "${DATASET:-single-instance}",
+ "split": "${SPLIT}",
+ "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
+ "platform": "$(uname -s)/$(uname -m)"
+}
+META
+echo "Run metadata written to $METADATA_FILE"
+
+#######################################
+# Run `codewhale swebench run` for one instance inside its workspace.
+# Globals:   WORKSPACE_BASE, ISSUE_FILE, PREDICTIONS_PATH, MODEL (all read)
+# Arguments: $1 - SWE-bench instance id; its repo must already exist at
+#                 $WORKSPACE_BASE/$1
+# Outputs:   progress on stdout, errors on stderr
+# Returns:   1 if the workspace is missing; otherwise codewhale's status
+# Note:      changes the script's working directory to the workspace.
+#######################################
+run_single_instance() {
+  local id="$1"
+  local workspace="$WORKSPACE_BASE/$id"
+
+  echo "=== Running instance: $id ==="
+
+  # The workspace must be prepared beforehand; this script does not clone.
+  if [[ ! -d "$workspace" ]]; then
+    {
+      echo " Workspace not found at $workspace"
+      echo " For batch mode, pre-clone instance repos into $WORKSPACE_BASE/"
+      echo " For single instance, use --issue-file with an existing workspace"
+    } >&2
+    return 1
+  fi
+
+  cd "$workspace"
+
+  # Write issue file if provided (existence was validated up front)
+  if [[ -n "$ISSUE_FILE" ]]; then
+    cp -- "$ISSUE_FILE" "$workspace/issue.md"
+  fi
+
+  # Build the codewhale command
+  local cw_args=("swebench" "run"
+    "--instance-id" "$id"
+    "--predictions-path" "$PREDICTIONS_PATH"
+  )
+
+  if [[ -n "$MODEL" ]]; then
+    cw_args+=("--model" "$MODEL")
+  fi
+
+  codewhale "${cw_args[@]}"
+  echo " Prediction written for $id"
+}
+
+if [[ "$EVAL_ONLY" == true ]]; then
+  if [[ ! -f "$PREDICTIONS_PATH" ]]; then
+    echo "Error: predictions file not found: $PREDICTIONS_PATH" >&2
+    exit 1
+  fi
+  echo "Evaluating existing predictions at $PREDICTIONS_PATH ..."
+  # Forward --split so the option parsed by this script actually takes effect.
+  python -m swebench.harness.run_evaluation \
+    --dataset_name "${DATASET:-princeton-nlp/SWE-bench_Lite}" \
+    --split "$SPLIT" \
+    --predictions_path "$PREDICTIONS_PATH" \
+    --max_workers "$MAX_WORKERS" \
+    --run_id "codewhale-$(date -u +%Y%m%d-%H%M%S)"
+  exit 0
+fi
+
+if [[ -n "$INSTANCE_ID" ]]; then
+  # Single-instance mode
+  run_single_instance "$INSTANCE_ID"
+elif [[ -n "$DATASET" ]]; then
+  # Batch mode: requires a pre-prepared workspace directory structure.
+  # Nothing is executed here; the manual workflow is printed instead.
+  echo "Batch mode for dataset: $DATASET (split: $SPLIT)"
+  echo ""
+  echo "To run batch SWE-bench:"
+  echo " 1. Install swebench: pip install swebench"
+  echo " 2. Prepare instance workspaces in $WORKSPACE_BASE/"
+  echo " 3. For each instance, run:"
+  echo " $0 --instance-id INSTANCE_ID --predictions-path $PREDICTIONS_PATH"
+  echo " 4. Then evaluate:"
+  echo " $0 --eval-only --dataset $DATASET --split $SPLIT --predictions-path $PREDICTIONS_PATH"
+  echo ""
+  echo "Automated batch orchestration is planned for v0.9.0."
+  echo "For now, use the SWE-bench docker harness to prepare workspaces."
+else
+  echo "Error: specify --dataset or --instance-id" >&2
+  usage >&2
+  exit 1
+fi
diff --git a/scripts/benchmarks/run-terminal-bench.sh b/scripts/benchmarks/run-terminal-bench.sh
new file mode 100755
index 00000000..b4cc231f
--- /dev/null
+++ b/scripts/benchmarks/run-terminal-bench.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# run-terminal-bench.sh — Run CodeWhale on Terminal-Bench via Harbor.
+#
+# Usage:
+#   ./scripts/benchmarks/run-terminal-bench.sh --help
+#   ./scripts/benchmarks/run-terminal-bench.sh --dataset terminal-bench@2.0 --model deepseek/deepseek-chat
+#
+# Prerequisites:
+#   - pip install harbor
+#   - Docker running
+#   - DEEPSEEK_API_KEY or OPENROUTER_API_KEY set
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.."
&& pwd)"
+
+# Defaults
+DATASET="terminal-bench@2.0"
+MODEL="deepseek/deepseek-chat"
+N_CONCURRENT=4
+# Python import path (module:Class), not a filesystem path. It resolves because
+# $REPO_ROOT is put on PYTHONPATH for the harbor invocation below.
+AGENT_PATH="scripts.benchmarks.harbor.codewhale_agent:CodeWhaleAgent"
+RESULTS_DIR="./results/terminal-bench"
+EXTRA_ARGS=()
+
+usage() {
+ cat <&2; usage >&2; exit 1 ;;
+ esac
+done
+
+# Check prerequisites
+if ! command -v harbor &>/dev/null; then
+  echo "Error: 'harbor' not found. Install with: pip install harbor" >&2
+  exit 1
+fi
+
+if ! command -v docker &>/dev/null; then
+  echo "Error: Docker not found. Harbor requires Docker." >&2
+  exit 1
+fi
+
+mkdir -p "$RESULTS_DIR"
+
+# Record metadata
+METADATA_FILE="$RESULTS_DIR/run_metadata.json"
+cat > "$METADATA_FILE" </dev/null || echo unknown)",
+ "git_commit": "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)",
+ "harbor_version": "$(harbor --version 2>/dev/null || echo unknown)",
+ "model": "$MODEL",
+ "dataset": "$DATASET",
+ "agent": "codewhale",
+ "n_concurrent": $N_CONCURRENT,
+ "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
+ "platform": "$(uname -s)/$(uname -m)"
+}
+META
+echo "Run metadata: $METADATA_FILE"
+
+# "module:Class" values are custom agents and go through --agent-import-path;
+# a bare name is treated as an agent registered with Harbor and uses --agent.
+if [[ "$AGENT_PATH" == *:* ]]; then
+  AGENT_ARGS=(--agent-import-path "$AGENT_PATH")
+else
+  AGENT_ARGS=(--agent "$AGENT_PATH")
+fi
+
+# Run Harbor
+echo "Running Terminal-Bench via Harbor..."
+echo " Dataset: $DATASET"
+echo " Model: $MODEL"
+echo " Agent: $AGENT_PATH"
+echo " Workers: $N_CONCURRENT"
+echo ""
+
+# NOTE(review): confirm Harbor accepts --results-dir; upstream documents
+# -o/--jobs-dir for the output location.
+# The ${arr[@]+...} form keeps an empty EXTRA_ARGS from tripping `set -u` on
+# bash older than 4.4.
+PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" harbor run \
+  --dataset "$DATASET" \
+  "${AGENT_ARGS[@]}" \
+  --model "$MODEL" \
+  --n-concurrent "$N_CONCURRENT" \
+  --results-dir "$RESULTS_DIR" \
+  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
+
+echo ""
+echo "Results written to $RESULTS_DIR"