feat(benchmarks): add SWE-bench, Terminal-Bench, and PinchBench integration

Benchmark harness for evaluating CodeWhale against three external
benchmarks:

- SWE-bench: batch driver wrapping existing codewhale swebench commands
- Terminal-Bench: Harbor adapter (BaseInstalledAgent) for container eval
- PinchBench: runner with auto-install for real-world agent tasks

Includes docs/BENCHMARKS.md umbrella doc with setup, usage, and
reproducibility checklist. Scripts record version/commit/timestamp
metadata for each run.

Branch: codex/v0.8.53-benchmarks (based on v0.8.53)
This commit is contained in:
Hunter B
2026-06-04 19:21:23 -07:00
parent 8dff2f7525
commit b329a532f5
7 changed files with 792 additions and 0 deletions
+149
View File
@@ -0,0 +1,149 @@
#!/usr/bin/env bash
# run-pinchbench.sh — Run CodeWhale through PinchBench.
#
# PinchBench evaluates agent performance on real-world tasks. It normally
# targets OpenClaw, but this script adapts the workflow for CodeWhale by
# leveraging the OpenRouter-compatible model routing.
#
# Usage:
# ./scripts/benchmarks/run-pinchbench.sh --help
# ./scripts/benchmarks/run-pinchbench.sh --model deepseek/deepseek-chat
#
# Prerequisites:
# - PinchBench cloned (or install via this script)
# - Python 3.10+ with uv
# - OPENROUTER_API_KEY or DEEPSEEK_API_KEY set
# - A running OpenClaw instance (PinchBench's default runtime)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Defaults
MODEL="deepseek/deepseek-chat"
SUITE="all"
PINCHBENCH_DIR="${PINCHBENCH_DIR:-/tmp/pinchbench}"
RESULTS_DIR="./results/pinchbench"
INSTALL_PINCHBENCH=false
RUNS=1
JUDGE_MODEL=""
NO_UPLOAD=true
EXTRA_ARGS=()
usage() {
cat <<EOF
Usage: $(basename "$0") [OPTIONS]
Run PinchBench benchmarks with CodeWhale-compatible model routing.
Options:
--model MODEL Model in provider/name format (default: deepseek/deepseek-chat)
--suite SUITE Task suite: all, automated-only, or comma-separated IDs (default: all)
--runs N Runs per task for averaging (default: 1)
--judge MODEL Judge model for LLM grading
--pinchbench-dir DIR PinchBench install directory (default: /tmp/pinchbench)
--results-dir DIR Local results directory (default: ./results/pinchbench)
--install Install/clone PinchBench before running
--upload Upload results to pinchbench.com leaderboard
-- [EXTRA_ARGS...] Additional arguments passed to PinchBench
-h, --help Show this help
Examples:
# Basic run with DeepSeek
$(basename "$0") --model deepseek/deepseek-chat
# Install and run
$(basename "$0") --install --model deepseek/deepseek-chat
# Specific tasks only
$(basename "$0") --suite task_calendar,task_stock --model deepseek/deepseek-chat
EOF
}
while [[ $# -gt 0 ]]; do
case "$1" in
--model) MODEL="$2"; shift 2 ;;
--suite) SUITE="$2"; shift 2 ;;
--runs) RUNS="$2"; shift 2 ;;
--judge) JUDGE_MODEL="$2"; shift 2 ;;
--pinchbench-dir) PINCHBENCH_DIR="$2"; shift 2 ;;
--results-dir) RESULTS_DIR="$2"; shift 2 ;;
--install) INSTALL_PINCHBENCH=true; shift ;;
--upload) NO_UPLOAD=false; shift ;;
--) shift; EXTRA_ARGS=("$@"); break ;;
-h|--help) usage; exit 0 ;;
*) echo "Unknown option: $1" >&2; usage >&2; exit 1 ;;
esac
done
# Install PinchBench if requested
if [[ "$INSTALL_PINCHBENCH" == true || ! -d "$PINCHBENCH_DIR" ]]; then
echo "Installing PinchBench to $PINCHBENCH_DIR ..."
if [[ -d "$PINCHBENCH_DIR" ]]; then
cd "$PINCHBENCH_DIR" && git pull
else
git clone https://github.com/pinchbench/skill.git "$PINCHBENCH_DIR"
fi
cd "$PINCHBENCH_DIR"
uv venv .venv 2>/dev/null || true
source .venv/bin/activate
uv pip install -e .
fi
# Verify PinchBench is available
if [[ ! -d "$PINCHBENCH_DIR" ]]; then
echo "Error: PinchBench not found at $PINCHBENCH_DIR" >&2
echo "Run with --install to clone it automatically." >&2
exit 1
fi
cd "$PINCHBENCH_DIR"
# Activate venv if it exists
if [[ -f ".venv/bin/activate" ]]; then
source .venv/bin/activate
fi
mkdir -p "$RESULTS_DIR"
# Record metadata
METADATA_FILE="$RESULTS_DIR/run_metadata.json"
cat > "$METADATA_FILE" <<META
{
"codewhale_version": "$(codewhale --version 2>/dev/null || echo unknown)",
"git_commit": "$(cd "$REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)",
"pinchbench_commit": "$(git rev-parse HEAD 2>/dev/null || echo unknown)",
"model": "$MODEL",
"suite": "$SUITE",
"runs": $RUNS,
"timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"platform": "$(uname -s)/$(uname -m)"
}
META
echo "Run metadata: $METADATA_FILE"
# Build PinchBench command
PB_ARGS=("--model" "$MODEL" "--suite" "$SUITE" "--runs" "$RUNS" "--output-dir" "$RESULTS_DIR")
if [[ -n "$JUDGE_MODEL" ]]; then
PB_ARGS+=("--judge" "$JUDGE_MODEL")
fi
if [[ "$NO_UPLOAD" == true ]]; then
PB_ARGS+=("--no-upload")
fi
PB_ARGS+=("${EXTRA_ARGS[@]}")
echo "Running PinchBench..."
echo " Model: $MODEL"
echo " Suite: $SUITE"
echo " Runs: $RUNS"
echo " Output: $RESULTS_DIR"
echo ""
./scripts/run.sh "${PB_ARGS[@]}"
echo ""
echo "Results written to $RESULTS_DIR"