b329a532f5
Benchmark harness for evaluating CodeWhale against three external benchmarks: - SWE-bench: batch driver wrapping existing codewhale swebench commands - Terminal-Bench: Harbor adapter (BaseInstalledAgent) for container eval - PinchBench: runner with auto-install for real-world agent tasks Includes docs/BENCHMARKS.md umbrella doc with setup, usage, and reproducibility checklist. Scripts record version/commit/timestamp metadata for each run. Branch: codex/v0.8.53-benchmarks (based on v0.8.53)
162 lines
5.3 KiB
Bash
Executable File
162 lines
5.3 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# run-swebench.sh — Batch driver for CodeWhale SWE-bench runs.
|
|
#
|
|
# Usage:
|
|
# ./scripts/benchmarks/run-swebench.sh --help
|
|
# ./scripts/benchmarks/run-swebench.sh --dataset princeton-nlp/SWE-bench_Lite --split test
|
|
# ./scripts/benchmarks/run-swebench.sh --instance-id django__django-12345 --issue-file issue.md
|
|
#
|
|
# Prerequisites:
|
|
# - codewhale installed and on PATH
|
|
# - DEEPSEEK_API_KEY set (or appropriate provider key)
|
|
# - swebench pip package installed (for evaluation step)
|
|
# - Docker running (for evaluation step)
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
|
|
|
# Defaults
|
|
DATASET=""
|
|
SPLIT="test"
|
|
INSTANCE_ID=""
|
|
ISSUE_FILE=""
|
|
PREDICTIONS_PATH="./results/swebench_preds.jsonl"
|
|
MODEL=""
|
|
WORKSPACE_BASE="/tmp/swebench-workspaces"
|
|
EVAL_ONLY=false
|
|
MAX_WORKERS=1
|
|
|
|
usage() {
|
|
cat <<EOF
|
|
Usage: $(basename "$0") [OPTIONS]
|
|
|
|
Run CodeWhale on SWE-bench instances and produce prediction JSONL.
|
|
|
|
Options:
|
|
--dataset DATASET HuggingFace dataset name (e.g. princeton-nlp/SWE-bench_Lite)
|
|
--split SPLIT Dataset split (default: test)
|
|
--instance-id ID Run a single instance by ID
|
|
--issue-file PATH Issue text file for single-instance mode
|
|
--predictions-path PATH Output JSONL file (default: ./results/swebench_preds.jsonl)
|
|
--model MODEL Model override for CodeWhale
|
|
--workspace-base DIR Base dir for instance workspaces (default: /tmp/swebench-workspaces)
|
|
--eval-only Skip runs; just evaluate existing predictions file
|
|
--max-workers N Parallel workers for evaluation (default: 1)
|
|
-h, --help Show this help
|
|
|
|
Examples:
|
|
# Run all instances from SWE-bench Lite
|
|
$(basename "$0") --dataset princeton-nlp/SWE-bench_Lite --split test
|
|
|
|
# Run a single instance
|
|
$(basename "$0") --instance-id django__django-12345 --issue-file ./issue.md
|
|
|
|
# Evaluate existing predictions
|
|
$(basename "$0") --eval-only --predictions-path ./results/swebench_preds.jsonl
|
|
EOF
|
|
}
|
|
|
|
# Parse args
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
--dataset) DATASET="$2"; shift 2 ;;
|
|
--split) SPLIT="$2"; shift 2 ;;
|
|
--instance-id) INSTANCE_ID="$2"; shift 2 ;;
|
|
--issue-file) ISSUE_FILE="$2"; shift 2 ;;
|
|
--predictions-path) PREDICTIONS_PATH="$2"; shift 2 ;;
|
|
--model) MODEL="$2"; shift 2 ;;
|
|
--workspace-base) WORKSPACE_BASE="$2"; shift 2 ;;
|
|
--eval-only) EVAL_ONLY=true; shift ;;
|
|
--max-workers) MAX_WORKERS="$2"; shift 2 ;;
|
|
-h|--help) usage; exit 0 ;;
|
|
*) echo "Unknown option: $1" >&2; usage >&2; exit 1 ;;
|
|
esac
|
|
done
|
|
|
|
mkdir -p "$(dirname "$PREDICTIONS_PATH")" "$WORKSPACE_BASE"
|
|
|
|
# Record run metadata
|
|
METADATA_FILE="$(dirname "$PREDICTIONS_PATH")/run_metadata.json"
|
|
cat > "$METADATA_FILE" <<META
|
|
{
|
|
"codewhale_version": "$(codewhale --version 2>/dev/null || echo unknown)",
|
|
"git_commit": "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)",
|
|
"model": "${MODEL:-default}",
|
|
"dataset": "${DATASET:-single-instance}",
|
|
"split": "${SPLIT}",
|
|
"timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
|
|
"platform": "$(uname -s)/$(uname -m)"
|
|
}
|
|
META
|
|
echo "Run metadata written to $METADATA_FILE"
|
|
|
|
run_single_instance() {
|
|
local id="$1"
|
|
local workspace="$WORKSPACE_BASE/$id"
|
|
|
|
echo "=== Running instance: $id ==="
|
|
|
|
# Clone or checkout the instance workspace
|
|
if [[ ! -d "$workspace" ]]; then
|
|
echo " Workspace not found at $workspace"
|
|
echo " For batch mode, pre-clone instance repos into $WORKSPACE_BASE/"
|
|
echo " For single instance, use --issue-file with an existing workspace"
|
|
return 1
|
|
fi
|
|
|
|
cd "$workspace"
|
|
|
|
# Write issue file if provided
|
|
if [[ -n "$ISSUE_FILE" && -f "$ISSUE_FILE" ]]; then
|
|
cp "$ISSUE_FILE" "$workspace/issue.md"
|
|
fi
|
|
|
|
# Build the codewhale command
|
|
local cw_args=("swebench" "run"
|
|
"--instance-id" "$id"
|
|
"--predictions-path" "$PREDICTIONS_PATH"
|
|
)
|
|
|
|
if [[ -n "$MODEL" ]]; then
|
|
cw_args+=("--model" "$MODEL")
|
|
fi
|
|
|
|
codewhale "${cw_args[@]}"
|
|
echo " Prediction written for $id"
|
|
}
|
|
|
|
if [[ "$EVAL_ONLY" == true ]]; then
|
|
echo "Evaluating existing predictions at $PREDICTIONS_PATH ..."
|
|
python -m swebench.harness.run_evaluation \
|
|
--dataset_name "${DATASET:-princeton-nlp/SWE-bench_Lite}" \
|
|
--predictions_path "$PREDICTIONS_PATH" \
|
|
--max_workers "$MAX_WORKERS" \
|
|
--run_id "codewhale-$(date -u +%Y%m%d-%H%M%S)"
|
|
exit 0
|
|
fi
|
|
|
|
if [[ -n "$INSTANCE_ID" ]]; then
|
|
# Single-instance mode
|
|
run_single_instance "$INSTANCE_ID"
|
|
elif [[ -n "$DATASET" ]]; then
|
|
# Batch mode: requires a pre-prepared workspace directory structure
|
|
echo "Batch mode for dataset: $DATASET (split: $SPLIT)"
|
|
echo ""
|
|
echo "To run batch SWE-bench:"
|
|
echo " 1. Install swebench: pip install swebench"
|
|
echo " 2. Prepare instance workspaces in $WORKSPACE_BASE/"
|
|
echo " 3. For each instance, run:"
|
|
echo " $0 --instance-id <ID> --predictions-path $PREDICTIONS_PATH"
|
|
echo " 4. Then evaluate:"
|
|
echo " $0 --eval-only --dataset $DATASET --predictions-path $PREDICTIONS_PATH"
|
|
echo ""
|
|
echo "Automated batch orchestration is planned for v0.9.0."
|
|
echo "For now, use the SWE-bench docker harness to prepare workspaces."
|
|
else
|
|
echo "Error: specify --dataset or --instance-id" >&2
|
|
usage >&2
|
|
exit 1
|
|
fi
|