Files
Hunter B b329a532f5 feat(benchmarks): add SWE-bench, Terminal-Bench, and PinchBench integration
Benchmark harness for evaluating CodeWhale against three external
benchmarks:

- SWE-bench: batch driver wrapping existing codewhale swebench commands
- Terminal-Bench: Harbor adapter (BaseInstalledAgent) for container eval
- PinchBench: runner with auto-install for real-world agent tasks

Includes docs/BENCHMARKS.md umbrella doc with setup, usage, and
reproducibility checklist. Scripts record version/commit/timestamp
metadata for each run.

Branch: codex/v0.8.53-benchmarks (based on v0.8.53)
2026-06-04 19:22:06 -07:00

162 lines
5.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# run-swebench.sh — Batch driver for CodeWhale SWE-bench runs.
#
# Usage:
# ./scripts/benchmarks/run-swebench.sh --help
# ./scripts/benchmarks/run-swebench.sh --dataset princeton-nlp/SWE-bench_Lite --split test
# ./scripts/benchmarks/run-swebench.sh --instance-id django__django-12345 --issue-file issue.md
#
# Prerequisites:
# - codewhale installed and on PATH
# - DEEPSEEK_API_KEY set (or appropriate provider key)
# - swebench pip package installed (for evaluation step)
# - Docker running (for evaluation step)
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Directory containing this script, and the repository root two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Defaults — what to run.
DATASET=''
SPLIT='test'
INSTANCE_ID=''
ISSUE_FILE=''
MODEL=''

# Defaults — where results and workspaces live.
PREDICTIONS_PATH='./results/swebench_preds.jsonl'
WORKSPACE_BASE='/tmp/swebench-workspaces'

# Defaults — evaluation behaviour.
EVAL_ONLY='false'
MAX_WORKERS='1'
# usage — print the command-line help text to stdout.
# The program name shown in the text is the basename of $0.
# Takes no arguments; callers that report an error redirect the output
# to stderr themselves.
usage() {
  local prog
  # Resolve the program name once instead of once per occurrence.
  prog="$(basename "$0")" || true
  cat <<EOF
Usage: ${prog} [OPTIONS]
Run CodeWhale on SWE-bench instances and produce prediction JSONL.
Options:
--dataset DATASET HuggingFace dataset name (e.g. princeton-nlp/SWE-bench_Lite)
--split SPLIT Dataset split (default: test)
--instance-id ID Run a single instance by ID
--issue-file PATH Issue text file for single-instance mode
--predictions-path PATH Output JSONL file (default: ./results/swebench_preds.jsonl)
--model MODEL Model override for CodeWhale
--workspace-base DIR Base dir for instance workspaces (default: /tmp/swebench-workspaces)
--eval-only Skip runs; just evaluate existing predictions file
--max-workers N Parallel workers for evaluation (default: 1)
-h, --help Show this help
Examples:
# Run all instances from SWE-bench Lite
${prog} --dataset princeton-nlp/SWE-bench_Lite --split test
# Run a single instance
${prog} --instance-id django__django-12345 --issue-file ./issue.md
# Evaluate existing predictions
${prog} --eval-only --predictions-path ./results/swebench_preds.jsonl
EOF
}
# parse_args — consume command-line options and store their values in the
# global option variables.
# Globals:   DATASET, SPLIT, INSTANCE_ID, ISSUE_FILE, PREDICTIONS_PATH, MODEL,
#            WORKSPACE_BASE, EVAL_ONLY, MAX_WORKERS (written)
# Arguments: the script's command-line arguments ("$@")
# Exits:     0 after printing help for -h/--help; 1 on an unknown option, on a
#            value-taking option with no value, or on a --max-workers value
#            that is not a positive integer.
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Value-taking options need a following argument. Check explicitly so the
    # user gets a clear message instead of an "unbound variable" abort from
    # `set -u` when the option is the last word on the command line.
    case "$1" in
      --dataset | --split | --instance-id | --issue-file | --predictions-path | \
        --model | --workspace-base | --max-workers)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --dataset) DATASET="$2"; shift 2 ;;
      --split) SPLIT="$2"; shift 2 ;;
      --instance-id) INSTANCE_ID="$2"; shift 2 ;;
      --issue-file) ISSUE_FILE="$2"; shift 2 ;;
      --predictions-path) PREDICTIONS_PATH="$2"; shift 2 ;;
      --model) MODEL="$2"; shift 2 ;;
      --workspace-base) WORKSPACE_BASE="$2"; shift 2 ;;
      --eval-only) EVAL_ONLY=true; shift ;;
      --max-workers)
        # Reject bad worker counts here rather than letting them reach the
        # evaluation command.
        if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
          echo "Error: --max-workers expects a positive integer, got '$2'" >&2
          exit 1
        fi
        MAX_WORKERS="$2"
        shift 2
        ;;
      -h | --help) usage; exit 0 ;;
      *) echo "Unknown option: $1" >&2; usage >&2; exit 1 ;;
    esac
  done
}

# Parse args
parse_args "$@"
# json_escape — print $1 escaped for use inside a JSON string literal.
# Escapes backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
json_escape() {
  local s="$1"
  s=${s//\\/\\\\}   # backslashes first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Make sure the predictions directory and the workspace base exist.
mkdir -p "$(dirname "$PREDICTIONS_PATH")" "$WORKSPACE_BASE"

# Record run metadata next to the predictions file.
# Every value goes through json_escape so that quotes, backslashes or
# multi-line tool output cannot produce invalid JSON.
# NOTE(review): this also runs in --eval-only mode, so evaluating rewrites
# the metadata of the run that produced the predictions — confirm intended.
METADATA_FILE="$(dirname "$PREDICTIONS_PATH")/run_metadata.json"
cat > "$METADATA_FILE" <<META
{
  "codewhale_version": "$(json_escape "$(codewhale --version 2>/dev/null || echo unknown)")",
  "git_commit": "$(json_escape "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)")",
  "model": "$(json_escape "${MODEL:-default}")",
  "dataset": "$(json_escape "${DATASET:-single-instance}")",
  "split": "$(json_escape "${SPLIT}")",
  "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "platform": "$(json_escape "$(uname -s)/$(uname -m)")"
}
META
echo "Run metadata written to $METADATA_FILE"
# run_single_instance — run `codewhale swebench run` for one instance inside
# its pre-existing workspace directory.
# Globals:   WORKSPACE_BASE, PREDICTIONS_PATH, ISSUE_FILE, MODEL (read)
# Arguments: $1 - instance ID; the workspace is "$WORKSPACE_BASE/$1"
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   1 if the workspace is missing, if a given issue file does not
#            exist, or if copying it fails; otherwise codewhale's exit status.
run_single_instance() {
  local id="$1"
  local workspace="$WORKSPACE_BASE/$id"
  local preds_path="$PREDICTIONS_PATH"
  local issue_file="$ISSUE_FILE"

  echo "=== Running instance: $id ==="

  # The workspace must already exist; this script does not clone repos.
  if [[ ! -d "$workspace" ]]; then
    {
      echo " Workspace not found at $workspace"
      echo " For batch mode, pre-clone instance repos into $WORKSPACE_BASE/"
      echo " For single instance, use --issue-file with an existing workspace"
    } >&2
    return 1
  fi

  # Anchor relative paths to the caller's directory BEFORE entering the
  # workspace. Otherwise they would resolve against the workspace: the
  # predictions would land there and the issue file would not be found.
  if [[ "$preds_path" != /* ]]; then
    preds_path="$PWD/${preds_path#./}"
  fi

  # Copy the issue text into the workspace if one was provided.
  if [[ -n "$issue_file" ]]; then
    if [[ "$issue_file" != /* ]]; then
      issue_file="$PWD/${issue_file#./}"
    fi
    if [[ ! -f "$issue_file" ]]; then
      echo " Issue file not found: $issue_file" >&2
      return 1
    fi
    # Skip the copy when the source already is the workspace's issue.md;
    # cp refuses to copy a file onto itself.
    if [[ ! "$issue_file" -ef "$workspace/issue.md" ]]; then
      cp -- "$issue_file" "$workspace/issue.md" || return 1
    fi
  fi

  # Build the codewhale command as an array so values with spaces survive.
  local cw_args=(
    "swebench" "run"
    "--instance-id" "$id"
    "--predictions-path" "$preds_path"
  )
  if [[ -n "$MODEL" ]]; then
    cw_args+=("--model" "$MODEL")
  fi

  # Run in a subshell so the directory change does not leak to the caller.
  (cd "$workspace" && codewhale "${cw_args[@]}") || return $?
  echo " Prediction written for $id"
}
# evaluate_predictions — run the SWE-bench evaluation harness
# (swebench.harness.run_evaluation) on the predictions file.
# Globals:  PREDICTIONS_PATH, DATASET, SPLIT, MAX_WORKERS (read)
# Outputs:  a progress line on stdout, errors on stderr
# Returns:  1 if the predictions file is missing or no Python interpreter is
#           on PATH; otherwise the harness's exit status.
evaluate_predictions() {
  # Fail early with a clear message rather than deep inside the harness.
  if [[ ! -f "$PREDICTIONS_PATH" ]]; then
    echo "Error: predictions file not found: $PREDICTIONS_PATH" >&2
    return 1
  fi

  # Prefer `python` (e.g. an active virtualenv); fall back to `python3` on
  # systems that do not ship an unversioned interpreter.
  local python_bin
  if command -v python >/dev/null 2>&1; then
    python_bin="python"
  elif command -v python3 >/dev/null 2>&1; then
    python_bin="python3"
  else
    echo "Error: neither python nor python3 found on PATH" >&2
    return 1
  fi

  echo "Evaluating existing predictions at $PREDICTIONS_PATH ..."
  # --split is forwarded so the script's --split option takes effect during
  # evaluation too, not just in the recorded metadata.
  "$python_bin" -m swebench.harness.run_evaluation \
    --dataset_name "${DATASET:-princeton-nlp/SWE-bench_Lite}" \
    --split "$SPLIT" \
    --predictions_path "$PREDICTIONS_PATH" \
    --max_workers "$MAX_WORKERS" \
    --run_id "codewhale-$(date -u +%Y%m%d-%H%M%S)"
}

# Eval-only mode: evaluate and stop; no instances are run.
if [[ "$EVAL_ONLY" == true ]]; then
  evaluate_predictions || exit $?
  exit 0
fi
# Mode selection. --instance-id takes precedence over --dataset; with
# neither there is nothing to do, so report the error and show the help.
if [[ -z "$INSTANCE_ID" && -z "$DATASET" ]]; then
  echo "Error: specify --dataset or --instance-id" >&2
  usage >&2
  exit 1
fi

if [[ -n "$INSTANCE_ID" ]]; then
  # Single-instance mode
  run_single_instance "$INSTANCE_ID"
else
  # Batch mode: nothing is executed; print the manual steps, which rely on a
  # pre-prepared workspace directory structure.
  cat <<EOF
Batch mode for dataset: $DATASET (split: $SPLIT)

To run batch SWE-bench:
 1. Install swebench: pip install swebench
 2. Prepare instance workspaces in $WORKSPACE_BASE/
 3. For each instance, run:
 $0 --instance-id <ID> --predictions-path $PREDICTIONS_PATH
 4. Then evaluate:
 $0 --eval-only --dataset $DATASET --predictions-path $PREDICTIONS_PATH

Automated batch orchestration is planned for v0.9.0.
For now, use the SWE-bench docker harness to prepare workspaces.
EOF
fi