Files
Hunter B b329a532f5 feat(benchmarks): add SWE-bench, Terminal-Bench, and PinchBench integration
Benchmark harness for evaluating CodeWhale against three external
benchmarks:

- SWE-bench: batch driver wrapping existing codewhale swebench commands
- Terminal-Bench: Harbor adapter (BaseInstalledAgent) for container eval
- PinchBench: runner with auto-install for real-world agent tasks

Includes docs/BENCHMARKS.md umbrella doc with setup, usage, and
reproducibility checklist. Scripts record version/commit/timestamp
metadata for each run.

Branch: codex/v0.8.53-benchmarks (based on v0.8.53)
2026-06-04 19:22:06 -07:00

114 lines
3.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# run-terminal-bench.sh — evaluate CodeWhale on Terminal-Bench through Harbor.
#
# Usage:
#   ./scripts/benchmarks/run-terminal-bench.sh --help
#   ./scripts/benchmarks/run-terminal-bench.sh --dataset terminal-bench@2.0 --model deepseek/deepseek-chat
#
# Prerequisites:
#   - pip install harbor
#   - Docker running
#   - DEEPSEEK_API_KEY or OPENROUTER_API_KEY set

# Strict mode: stop on errors, on unset variables, and on failures inside
# pipelines.
set -euo pipefail

# Absolute directory holding this script, and the repository root two levels
# above it.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
REPO_ROOT="$(
  cd "${SCRIPT_DIR}/../.." && pwd
)"

# Default settings; each one can be replaced by a command-line option.
declare -a EXTRA_ARGS=()
N_CONCURRENT=4
MODEL="deepseek/deepseek-chat"
DATASET="terminal-bench@2.0"
RESULTS_DIR="./results/terminal-bench"
AGENT_PATH="${SCRIPT_DIR}/harbor/__init__.py:CodeWhaleAgent"
# usage — write the command-line help text to stdout.
# The program name shown in the text is the basename of "$0".
# Callers redirect the output themselves (e.g. `usage >&2`).
usage() {
  local prog
  # Resolve the name once; an empty name is tolerated rather than aborting.
  prog="$(basename "$0" || true)"
  cat <<EOF
Usage: ${prog} [OPTIONS]
Run CodeWhale on Terminal-Bench tasks via the Harbor framework.
Options:
--dataset DATASET Harbor dataset (default: terminal-bench@2.0)
--model MODEL Model in provider/name format (default: deepseek/deepseek-chat)
--agent PATH Harbor agent import path (default: local CodeWhale adapter)
--n-concurrent N Parallel task workers (default: 4)
--results-dir DIR Results output directory (default: ./results/terminal-bench)
-- [EXTRA_ARGS...] Additional arguments passed to 'harbor run'
-h, --help Show this help
Examples:
# Default run
${prog}
# Custom model and concurrency
${prog} --model deepseek/deepseek-reasoner --n-concurrent 8
# Pass extra flags to harbor
${prog} -- --env daytona
EOF
}
# parse_args ARGS...
# Parse the script's command-line options into the global settings
# DATASET, MODEL, AGENT_PATH, N_CONCURRENT, RESULTS_DIR and EXTRA_ARGS.
#
# Arguments: the script's positional parameters ("$@").
# Exits:     0 after printing the help text for -h/--help;
#            1 on an unknown option, on a value-taking option that is not
#            followed by a value, or on a --n-concurrent value that is not a
#            positive integer.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dataset | --model | --agent | --n-concurrent | --results-dir)
        # Check for the value explicitly: under 'set -u' a bare "$2" would
        # abort with an opaque "unbound variable" message instead.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a value" >&2
          exit 1
        fi
        case "$1" in
          --dataset) DATASET="$2" ;;
          --model) MODEL="$2" ;;
          --agent) AGENT_PATH="$2" ;;
          --results-dir) RESULTS_DIR="$2" ;;
          --n-concurrent)
            # The worker count is later written as a bare JSON number in the
            # run metadata, so anything but an integer must be rejected here.
            if ! [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
              echo "Error: --n-concurrent expects a positive integer, got '$2'" >&2
              exit 1
            fi
            N_CONCURRENT="$2"
            ;;
        esac
        shift 2
        ;;
      --)
        # Everything after '--' is kept verbatim for 'harbor run'.
        shift
        EXTRA_ARGS=("$@")
        break
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Prerequisite guard: both the Harbor CLI and the Docker CLI must be on PATH.
# Each missing tool is reported on stderr and ends the script with status 1.
command -v harbor >/dev/null 2>&1 || {
  echo "Error: 'harbor' not found. Install with: pip install harbor" >&2
  exit 1
}
command -v docker >/dev/null 2>&1 || {
  echo "Error: Docker not found. Harbor requires Docker." >&2
  exit 1
}
# json_escape STRING
# Print STRING escaped for use inside a double-quoted JSON string literal.
# Escapes the double quote, the backslash, newline, carriage return and tab;
# every other character is copied unchanged (other control characters are
# not handled).
# Arguments: $1 - text to escape
# Outputs:   the escaped text on stdout, without a trailing newline
json_escape() {
  local raw="$1" escaped="" ch i
  for ((i = 0; i < ${#raw}; i++)); do
    ch="${raw:i:1}"
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *) escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

mkdir -p "$RESULTS_DIR"

# Record metadata
# Every string value goes through json_escape so that quotes, backslashes or
# multi-line tool output (e.g. a verbose --version banner) cannot produce an
# invalid JSON document. Tools that are missing or fail are recorded as
# "unknown" instead of aborting the run.
METADATA_FILE="$RESULTS_DIR/run_metadata.json"
cat > "$METADATA_FILE" <<META
{
  "codewhale_version": "$(json_escape "$(codewhale --version 2>/dev/null || echo unknown)")",
  "git_commit": "$(json_escape "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)")",
  "harbor_version": "$(json_escape "$(harbor --version 2>/dev/null || echo unknown)")",
  "model": "$(json_escape "$MODEL")",
  "dataset": "$(json_escape "$DATASET")",
  "agent": "codewhale",
  "agent_path": "$(json_escape "$AGENT_PATH")",
  "n_concurrent": $N_CONCURRENT,
  "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "platform": "$(json_escape "$(uname -s)/$(uname -m)")"
}
META
echo "Run metadata: $METADATA_FILE"
# Run Harbor
# Print the effective settings, then invoke 'harbor run' with them followed
# by any pass-through arguments collected after '--'.
echo "Running Terminal-Bench via Harbor..."
echo " Dataset: $DATASET"
echo " Model: $MODEL"
echo " Agent: $AGENT_PATH"
echo " Workers: $N_CONCURRENT"
echo ""

# The command is assembled as an array so every value stays a single word.
# EXTRA_ARGS is expanded through the ${arr[@]+...} guard: with 'set -u',
# bash releases older than 4.4 treat "${EXTRA_ARGS[@]}" on an empty array as
# an unbound variable and abort, which would break the default invocation.
harbor_cmd=(
  harbor run
  --dataset "$DATASET"
  --agent "$AGENT_PATH"
  --model "$MODEL"
  --n-concurrent "$N_CONCURRENT"
  --results-dir "$RESULTS_DIR"
  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
)
"${harbor_cmd[@]}"

echo ""
echo "Results written to $RESULTS_DIR"