feat(benchmarks): add CodeWhale-native PinchBench runner
Runs PinchBench tasks directly through codewhale exec --auto instead of going through OpenClaw. Loads task markdown, creates workspace, runs the prompt, and grades using PinchBench's embedded automated checks. No external agent framework dependency — just codewhale + pyyaml.
This commit is contained in:
@@ -0,0 +1,394 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
CodeWhale-native PinchBench runner.
|
||||
|
||||
Loads PinchBench tasks, runs them through codewhale exec, and grades results.
|
||||
No OpenClaw dependency.
|
||||
|
||||
Usage:
|
||||
python scripts/benchmarks/pinchbench_codewhale.py --help
|
||||
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
|
||||
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
|
||||
python scripts/benchmarks/pinchbench_codewhale.py --all
|
||||
"""
|
||||
# /// script
|
||||
# requires-python = ">=3.10"
|
||||
# dependencies = [
|
||||
# "pyyaml>=6.0.1",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
def load_task(task_path: Path) -> dict[str, Any]:
    """Load a PinchBench task markdown file.

    The file must start with a ``---``-delimited YAML frontmatter block.
    The remaining markdown body is split into sections on ``## `` headings;
    any text before the first heading is ignored.

    Args:
        task_path: Path to the task ``.md`` file.

    Returns:
        A dict holding the frontmatter fields (``task_id``, ``name``,
        ``category``, ``grading_type``, ``timeout_seconds``,
        ``workspace_files``), the body sections used by the runner
        (``prompt``, ``automated_checks``, ``llm_judge_rubric``,
        ``grading_criteria``, ``expected_behavior``) and the source ``path``.

    Raises:
        ValueError: If the file has no frontmatter block, or the frontmatter
            is not a YAML mapping.
    """
    content = task_path.read_text(encoding="utf-8")

    # Extract YAML frontmatter
    fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
    if not fm_match:
        raise ValueError(f"No YAML frontmatter in {task_path}")

    # PyYAML is only needed once a frontmatter block has actually been found.
    import yaml

    frontmatter = yaml.safe_load(fm_match.group(1))
    if frontmatter is None:
        # An empty (or comment-only) frontmatter parses to None; treat it as
        # "no fields set" so every key falls back to its default below.
        frontmatter = {}
    elif not isinstance(frontmatter, dict):
        raise ValueError(
            f"YAML frontmatter in {task_path} must be a mapping, "
            f"got {type(frontmatter).__name__}"
        )
    body = fm_match.group(2)

    # Extract sections
    sections: dict[str, str] = {}
    heading: Optional[str] = None
    buffer: list[str] = []
    for line in body.split("\n"):
        header = re.match(r"^##\s+(.+)$", line)
        if header is None:
            buffer.append(line)
            continue
        if heading is not None:
            sections[heading] = "\n".join(buffer).strip()
        # Strip so a heading with trailing whitespace still matches the
        # exact section names looked up below.
        heading = header.group(1).strip()
        buffer = []
    if heading is not None:
        sections[heading] = "\n".join(buffer).strip()

    return {
        "task_id": frontmatter.get("id", task_path.stem),
        "name": frontmatter.get("name", ""),
        "category": frontmatter.get("category", ""),
        "grading_type": frontmatter.get("grading_type", "automated"),
        "timeout_seconds": frontmatter.get("timeout_seconds", 120),
        "workspace_files": frontmatter.get("workspace_files", []),
        "prompt": sections.get("Prompt", "").strip(),
        "automated_checks": sections.get("Automated Checks", None),
        "llm_judge_rubric": sections.get("LLM Judge Rubric", None),
        "grading_criteria": sections.get("Grading Criteria", ""),
        "expected_behavior": sections.get("Expected Behavior", ""),
        "path": task_path,
    }
|
||||
|
||||
|
||||
def _workspace_entries(spec: dict, task: dict) -> list[tuple[Any, Any]]:
    """Expand one ``workspace_files`` entry into ``(relative_path, payload)`` pairs.

    A ``Path`` payload means "copy this file"; anything else is written as text.
    """
    # NOTE(review): upstream PinchBench tasks appear to describe files as
    # ``{path: ..., content: ...}`` — confirm against the task corpus. Without
    # this branch such an entry would create files literally named "path" and
    # "content".
    if isinstance(spec.get("path"), str) and "content" in spec:
        return [(spec["path"], spec["content"])]

    # NOTE(review): ``{source: ..., dest: ...}`` is presumed to reference a
    # file in an ``assets`` directory next to the tasks directory — verify
    # against the PinchBench checkout layout.
    if isinstance(spec.get("source"), str) and isinstance(spec.get("dest"), str):
        task_file = task.get("path")
        if task_file is not None:
            asset = Path(task_file).parent.parent / "assets" / spec["source"]
            if asset.is_file():
                return [(spec["dest"], asset)]
        print(f" Workspace asset not found, skipping: {spec['source']}", file=sys.stderr)
        return []

    # Plain mapping of relative path -> file content.
    return list(spec.items())


def prepare_workspace(task: dict, run_dir: Path) -> Path:
    """Create the task's workspace directory and seed it with task files.

    The workspace is ``run_dir / task["task_id"]``. It is initialised as a git
    repository (best effort), populated from ``task["workspace_files"]`` and
    committed so the agent starts from a clean tree.

    Args:
        task: Task dict; reads ``task_id``, ``workspace_files`` and, for
            asset copies, ``path``.
        run_dir: Directory that holds all workspaces of the current run.

    Returns:
        The workspace path.
    """
    workspace = run_dir / task["task_id"]
    workspace.mkdir(parents=True, exist_ok=True)
    root = workspace.resolve()

    def git(*args: str) -> None:
        # Git setup is best effort (exit codes are ignored); a missing git
        # binary is reported instead of aborting the whole run.
        try:
            subprocess.run(
                ["git", *args], cwd=workspace, capture_output=True, check=False
            )
        except OSError as exc:
            print(f" git {args[0]} skipped: {exc}", file=sys.stderr)

    # Initialize git repo so codewhale works
    git("init")
    git("config", "user.email", "bench@codewhale")
    git("config", "user.name", "Benchmark")

    # Create workspace files from task definition. ``or []`` covers a
    # frontmatter that sets ``workspace_files`` to null.
    for spec in task.get("workspace_files") or []:
        if not isinstance(spec, dict):
            continue
        for rel_path, payload in _workspace_entries(spec, task):
            target = (root / str(rel_path)).resolve()
            # Reject absolute paths and ``..`` segments: task files must not
            # be able to write outside their own workspace.
            if target == root or not target.is_relative_to(root):
                print(
                    f" Skipping workspace file outside workspace: {rel_path}",
                    file=sys.stderr,
                )
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            if isinstance(payload, Path):
                shutil.copyfile(payload, target)
            elif isinstance(payload, bytes):
                target.write_bytes(payload)
            else:
                # YAML scalars may be numbers, booleans or null.
                text = "" if payload is None else str(payload)
                target.write_text(text, encoding="utf-8")

    # Commit initial state
    git("add", "-A")
    git("commit", "-m", "initial", "--allow-empty")

    return workspace
|
||||
|
||||
|
||||
def run_codewhale(
    workspace: Path,
    prompt: str,
    timeout_seconds: int,
    model: Optional[str] = None,
) -> dict[str, Any]:
    """Run ``codewhale exec --auto`` on a task prompt and capture the outcome.

    Args:
        workspace: Directory passed as ``--workspace`` and used as the
            process working directory.
        prompt: Task prompt, passed as the final positional argument.
        timeout_seconds: Wall-clock limit for the subprocess.
        model: Optional value for ``--model``; omitted when falsy.

    Returns:
        A dict with ``exit_code``, ``stdout``, ``stderr``,
        ``elapsed_seconds`` and ``timed_out``. On timeout ``exit_code`` is
        ``-1``, ``stderr`` starts with ``"TIMEOUT"`` and any output captured
        before the kill is preserved. If the executable cannot be started,
        ``exit_code`` is ``127`` and ``stderr`` carries the reason.
    """
    cmd = ["codewhale", "exec", "--auto", "--workspace", str(workspace)]
    if model:
        cmd.extend(["--model", model])
    cmd.append(prompt)

    def as_text(stream: Any) -> str:
        # TimeoutExpired may carry bytes even when text=True was requested.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream

    # Monotonic clock: elapsed time must not jump with system clock changes.
    start = time.monotonic()
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=workspace,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep whatever the agent printed before it was killed; the workspace
        # is still graded, and the partial output feeds the transcript.
        partial_err = as_text(exc.stderr)
        return {
            "exit_code": -1,
            "stdout": as_text(exc.stdout),
            "stderr": f"TIMEOUT\n{partial_err}" if partial_err else "TIMEOUT",
            "elapsed_seconds": time.monotonic() - start,
            "timed_out": True,
        }
    except FileNotFoundError as exc:
        # Mirrors the version probe, which also tolerates a missing binary;
        # 127 follows the shell's "command not found" convention.
        return {
            "exit_code": 127,
            "stdout": "",
            "stderr": f"could not start codewhale: {exc}",
            "elapsed_seconds": time.monotonic() - start,
            "timed_out": False,
        }

    return {
        "exit_code": result.returncode,
        "stdout": result.stdout,
        "stderr": result.stderr,
        "elapsed_seconds": time.monotonic() - start,
        "timed_out": False,
    }
|
||||
|
||||
|
||||
def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str, Any]:
    """Run the automated grading check embedded in the task definition.

    The task's "Automated Checks" section must contain a fenced Python code
    block that defines ``grade(transcript, workspace_path)``.

    Args:
        task: Task dict; reads ``automated_checks``.
        workspace: Workspace directory, passed to ``grade`` as a string.
        transcript: Transcript list, passed to ``grade`` unchanged.

    Returns:
        A dict that always has a ``score`` key. When ``grade`` returns a
        dict, the result is a copy of it with ``score`` set to the mean of
        its numeric values. Otherwise it is ``{"score": float(result)}``.
        Every failure yields ``{"score": 0.0, "reason": ...}``.
    """
    checks_code = task.get("automated_checks")
    if not checks_code:
        return {"score": 0.0, "reason": "no automated checks defined"}

    # Extract the grade function from the markdown code block. The fence tag
    # may be ``python`` or ``py`` and may carry trailing whitespace.
    code_match = re.search(
        r"```(?:python|py)[ \t]*\r?\n(.*?)```",
        checks_code,
        re.DOTALL | re.IGNORECASE,
    )
    if not code_match:
        return {"score": 0.0, "reason": "no python code block in automated checks"}

    # SECURITY: this executes code taken from the task file with the runner's
    # full privileges. Only run task suites from a trusted source.
    namespace: dict[str, Any] = {}
    try:
        exec(code_match.group(1), namespace)
    except Exception as e:
        return {"score": 0.0, "reason": f"grading code failed to load: {e}"}

    grade_fn = namespace.get("grade")
    if not callable(grade_fn):
        return {"score": 0.0, "reason": "no grade() function in automated checks"}

    try:
        result = grade_fn(transcript, str(workspace))
    except Exception as e:
        return {"score": 0.0, "reason": f"grading failed: {e}"}

    if isinstance(result, dict):
        # PinchBench returns per-criterion scores; average them.
        numeric = [v for v in result.values() if isinstance(v, (int, float))]
        # Work on a copy so the grader's own dict is left untouched.
        graded = dict(result)
        graded["score"] = sum(numeric) / len(numeric) if numeric else 0.0
        return graded

    try:
        return {"score": float(result) if result else 0.0}
    except (TypeError, ValueError) as e:
        return {"score": 0.0, "reason": f"grading failed: {e}"}
|
||||
|
||||
|
||||
def _load_suite_tasks(tasks_dir: Path, suite: str) -> list[dict]:
    """Resolve ``suite`` to task files and load every one that parses."""
    if suite == "all":
        task_files = sorted(tasks_dir.glob("task_*.md"))
    else:
        task_files = []
        for tid in (part.strip() for part in suite.split(",")):
            if not tid:
                # Stray commas ("a,,b" or a trailing comma) are not task ids.
                continue
            tf = tasks_dir / f"{tid}.md"
            if tf.exists():
                task_files.append(tf)
            else:
                print(f" Task not found: {tf}", file=sys.stderr)

    loaded: list[dict] = []
    for tf in task_files:
        # One unparsable task is reported and skipped in both modes instead
        # of aborting the whole run.
        try:
            loaded.append(load_task(tf))
        except Exception as e:
            print(f" Skip {tf.name}: {e}", file=sys.stderr)
    return loaded


def _codewhale_version() -> str:
    """Return ``codewhale --version`` output, or "unknown" if unavailable."""
    try:
        vr = subprocess.run(
            ["codewhale", "--version"], capture_output=True, text=True, check=False
        )
    except FileNotFoundError:
        return "unknown"
    return vr.stdout.strip() if vr.returncode == 0 else "unknown"


def _category_breakdown(results: list[dict]) -> dict[str, dict[str, Any]]:
    """Group task results by category and compute each category's mean score."""
    grouped: dict[str, list[dict]] = {}
    for r in results:
        grouped.setdefault(r["category"], []).append(r)
    return {
        cat: {
            "score": sum(r["score"] for r in tasks) / len(tasks) if tasks else 0,
            "tasks": len(tasks),
        }
        for cat, tasks in grouped.items()
    }


def _print_summary(summary: dict[str, Any], run_dir: Path) -> None:
    """Print the overall score and the per-category table."""
    print(f"\n{'='*60}")
    print("PINCHBENCH SCORE SUMMARY (CodeWhale)")
    print(f"{'='*60}")
    print(
        f"\n Overall: {summary['average_score']:.1%} "
        f"({summary['total_score']:.1f}/{summary['task_count']})\n"
    )
    print(f" {'CATEGORY':<25} {'SCORE':>8} {'TASKS':>5}")
    print(f" {'-'*45}")
    for cat, info in sorted(summary["categories"].items()):
        pct = info["score"] * 100
        marker = "🔴" if pct < 25 else "🟡" if pct < 75 else "🟢"
        print(f" {marker} {cat:<23} {pct:>6.1f}% {info['tasks']:>5}")
    print(f" {'-'*45}")
    print(f"\nResults: {run_dir}")


def run_benchmark(
    tasks_dir: Path,
    suite: str,
    results_dir: Path,
    model: Optional[str] = None,
    timeout_multiplier: float = 1.0,
) -> dict[str, Any]:
    """Run the benchmark suite and write results to a new run directory.

    Args:
        tasks_dir: Directory containing the task ``.md`` files.
        suite: ``"all"`` to run every ``task_*.md`` file, or a comma-separated
            list of task file stems.
        results_dir: Parent directory for run output; a UTC-timestamped
            subdirectory is created per run.
        model: Optional model override forwarded to ``run_codewhale``.
        timeout_multiplier: Factor applied to each task's ``timeout_seconds``.

    Returns:
        The summary dict that is also written to ``summary.json``
        (``run_id``, ``total_score``, ``task_count``, ``average_score``,
        ``categories``, ``results``, ``metadata``).

    Exits the process with status 1 when no task could be loaded.
    """
    all_tasks = _load_suite_tasks(tasks_dir, suite)
    if not all_tasks:
        print("No tasks loaded.", file=sys.stderr)
        sys.exit(1)

    print(f"Loaded {len(all_tasks)} tasks")

    # Create run directory
    results_dir.mkdir(parents=True, exist_ok=True)
    run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    run_dir = results_dir / run_id
    run_dir.mkdir()

    # Record metadata
    metadata = {
        "codewhale_version": _codewhale_version(),
        "model": model or "default",
        "suite": suite,
        "task_count": len(all_tasks),
        "run_id": run_id,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    }
    (run_dir / "metadata.json").write_text(
        json.dumps(metadata, indent=2), encoding="utf-8"
    )

    # Run tasks
    results: list[dict] = []
    total_score = 0.0

    for i, task in enumerate(all_tasks, 1):
        task_id = task["task_id"]
        print(f"\n{'='*60}")
        print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
        print(f" Category: {task['category']}")
        print(f"{'='*60}")

        workspace = prepare_workspace(task, run_dir)
        # Never let a small multiplier truncate the limit to 0 seconds.
        timeout = max(1, int(task["timeout_seconds"] * timeout_multiplier))

        # Run codewhale
        print(f" Running codewhale exec (timeout: {timeout}s)...")
        result = run_codewhale(workspace, task["prompt"], timeout, model=model)
        print(f" Completed in {result['elapsed_seconds']:.1f}s (exit {result['exit_code']})")

        if result["timed_out"]:
            print(" ⏰ TIMED OUT")

        # Build a minimal transcript for grading
        transcript = [{"role": "user", "content": task["prompt"]}]
        if result["stdout"]:
            transcript.append({"role": "assistant", "content": result["stdout"]})

        # Grade
        grade_result = {"score": 0.0, "reason": "not graded"}
        if task["automated_checks"]:
            grade_result = grade_automated(task, workspace, transcript)
        elif task.get("llm_judge_rubric"):
            grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}

        score = grade_result.get("score", 0.0)
        total_score += score

        status = "✅" if score >= 1.0 else "🔶" if score > 0 else "❌"
        print(f" {status} Score: {score:.1%} — {grade_result.get('reason', '')}")

        task_result = {
            "task_id": task_id,
            "name": task["name"],
            "category": task["category"],
            "score": score,
            "grade": grade_result,
            "elapsed_seconds": result["elapsed_seconds"],
            "timed_out": result["timed_out"],
            "exit_code": result["exit_code"],
        }
        results.append(task_result)

        # Save individual result. ``default=str`` is deliberate: grader output
        # is arbitrary, and one odd value must not abort the remaining tasks.
        (run_dir / f"{task_id}.json").write_text(
            json.dumps(task_result, indent=2, default=str), encoding="utf-8"
        )

    # Summary
    summary = {
        "run_id": run_id,
        "total_score": total_score,
        "task_count": len(results),
        "average_score": total_score / len(results) if results else 0.0,
        "categories": _category_breakdown(results),
        "results": results,
        "metadata": metadata,
    }
    (run_dir / "summary.json").write_text(
        json.dumps(summary, indent=2, default=str), encoding="utf-8"
    )

    _print_summary(summary, run_dir)

    return summary
|
||||
|
||||
|
||||
def main() -> None:
    """Parse command-line arguments and run the selected benchmark suite.

    ``--all`` is accepted as a shorthand for ``--suite all``, matching the
    usage examples in the module docstring.
    """
    parser = argparse.ArgumentParser(
        description="Run PinchBench tasks through CodeWhale (no OpenClaw)"
    )
    parser.add_argument(
        "--tasks-dir",
        type=Path,
        default=Path("/tmp/pinchbench/tasks"),
        help="PinchBench tasks directory",
    )
    parser.add_argument(
        "--suite",
        default="task_calendar",
        help="Comma-separated task IDs, or 'all'",
    )
    parser.add_argument(
        "--all",
        dest="run_all",
        action="store_true",
        help="Run every task in the tasks directory (same as --suite all)",
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("./results/pinchbench-codewhale"),
        help="Results output directory",
    )
    parser.add_argument("--model", default=None, help="Model override for codewhale")
    parser.add_argument(
        "--timeout-multiplier",
        type=float,
        default=1.0,
        help="Scale task timeouts",
    )
    args = parser.parse_args()

    # A zero or negative multiplier would give every task an unusable timeout.
    if args.timeout_multiplier <= 0:
        parser.error("--timeout-multiplier must be greater than 0")

    run_benchmark(
        tasks_dir=args.tasks_dir,
        suite="all" if args.run_all else args.suite,
        results_dir=args.results_dir,
        model=args.model,
        timeout_multiplier=args.timeout_multiplier,
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user