feat(benchmarks): add CodeWhale-native PinchBench runner

Runs PinchBench tasks directly through codewhale exec --auto instead of going through OpenClaw. Loads task markdown, creates workspace, runs the prompt, and grades using PinchBench's embedded automated checks. No external agent framework dependency — just codewhale + pyyaml.
2026-06-04 20:26:05 -07:00
parent b7798ba0f6
commit c8fcef7f1e
1 changed files with 394 additions and 0 deletions
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+"""
+CodeWhale-native PinchBench runner.
+
+Loads PinchBench tasks, runs them through codewhale exec, and grades results.
+No OpenClaw dependency.
+
+Usage:
+    python scripts/benchmarks/pinchbench_codewhale.py --help
+    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
+    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
+    python scripts/benchmarks/pinchbench_codewhale.py --all
+"""
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pyyaml>=6.0.1",
+# ]
+# ///
+
+import argparse
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+
+def load_task(task_path: Path) -> dict[str, Any]:
+    """Load a PinchBench task markdown file."""
+    content = task_path.read_text(encoding="utf-8")
+
+    # Extract YAML frontmatter
+    fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
+    if not fm_match:
+        raise ValueError(f"No YAML frontmatter in {task_path}")
+
+    import yaml
+    frontmatter = yaml.safe_load(fm_match.group(1))
+    body = fm_match.group(2)
+
+    # Extract sections
+    sections: dict[str, str] = {}
+    current_section = None
+    current_content: list[str] = []
+    for line in body.split("\n"):
+        header = re.match(r"^##\s+(.+)$", line)
+        if header:
+            if current_section:
+                sections[current_section] = "\n".join(current_content).strip()
+            current_section = header.group(1)
+            current_content = []
+        else:
+            current_content.append(line)
+    if current_section:
+        sections[current_section] = "\n".join(current_content).strip()
+
+    return {
+        "task_id": frontmatter.get("id", task_path.stem),
+        "name": frontmatter.get("name", ""),
+        "category": frontmatter.get("category", ""),
+        "grading_type": frontmatter.get("grading_type", "automated"),
+        "timeout_seconds": frontmatter.get("timeout_seconds", 120),
+        "workspace_files": frontmatter.get("workspace_files", []),
+        "prompt": sections.get("Prompt", "").strip(),
+        "automated_checks": sections.get("Automated Checks", None),
+        "llm_judge_rubric": sections.get("LLM Judge Rubric", None),
+        "grading_criteria": sections.get("Grading Criteria", ""),
+        "expected_behavior": sections.get("Expected Behavior", ""),
+        "path": task_path,
+    }
+
+
+def prepare_workspace(task: dict, run_dir: Path) -> Path:
+    """Create a temp workspace with any task-required files."""
+    workspace = run_dir / task["task_id"]
+    workspace.mkdir(parents=True, exist_ok=True)
+
+    # Initialize git repo so codewhale works
+    subprocess.run(["git", "init"], cwd=workspace, capture_output=True, check=False)
+    subprocess.run(
+        ["git", "config", "user.email", "bench@codewhale"],
+        cwd=workspace, capture_output=True, check=False,
+    )
+    subprocess.run(
+        ["git", "config", "user.name", "Benchmark"],
+        cwd=workspace, capture_output=True, check=False,
+    )
+
+    # Create workspace files from task definition
+    for wf in task.get("workspace_files", []):
+        if isinstance(wf, dict):
+            for path, content in wf.items():
+                fpath = workspace / path
+                fpath.parent.mkdir(parents=True, exist_ok=True)
+                fpath.write_text(content, encoding="utf-8")
+
+    # Commit initial state
+    subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True, check=False)
+    subprocess.run(
+        ["git", "commit", "-m", "initial", "--allow-empty"],
+        cwd=workspace, capture_output=True, check=False,
+    )
+
+    return workspace
+
+
+def run_codewhale(
+    workspace: Path,
+    prompt: str,
+    timeout_seconds: int,
+    model: Optional[str] = None,
+) -> dict[str, Any]:
+    """Run codewhale exec on a task and return the result."""
+    cmd = [
+        "codewhale", "exec",
+        "--auto",
+        "--workspace", str(workspace),
+    ]
+    if model:
+        cmd.extend(["--model", model])
+    cmd.append(prompt)
+
+    start = time.time()
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout_seconds,
+            cwd=workspace,
+            check=False,
+        )
+        elapsed = time.time() - start
+        return {
+            "exit_code": result.returncode,
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "elapsed_seconds": elapsed,
+            "timed_out": False,
+        }
+    except subprocess.TimeoutExpired:
+        elapsed = time.time() - start
+        return {
+            "exit_code": -1,
+            "stdout": "",
+            "stderr": "TIMEOUT",
+            "elapsed_seconds": elapsed,
+            "timed_out": True,
+        }
+
+
+def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str, Any]:
+    """Run the automated grading check from the task definition."""
+    checks_code = task.get("automated_checks")
+    if not checks_code:
+        return {"score": 0.0, "reason": "no automated checks defined"}
+
+    # Extract the grade function from the markdown code block
+    code_match = re.search(r"```python\n(.*?)```", checks_code, re.DOTALL)
+    if not code_match:
+        return {"score": 0.0, "reason": "no python code block in automated checks"}
+
+    code = code_match.group(1)
+
+    # Execute the grading function
+    namespace: dict[str, Any] = {}
+    try:
+        exec(code, namespace)
+    except Exception as e:
+        return {"score": 0.0, "reason": f"grading code failed to load: {e}"}
+
+    grade_fn = namespace.get("grade")
+    if not grade_fn:
+        return {"score": 0.0, "reason": "no grade() function in automated checks"}
+
+    try:
+        result = grade_fn(transcript, str(workspace))
+        if isinstance(result, dict):
+            # PinchBench returns per-criterion scores; average them
+            numeric = [v for v in result.values() if isinstance(v, (int, float))]
+            avg = sum(numeric) / len(numeric) if numeric else 0.0
+            result["score"] = avg
+            return result
+        return {"score": float(result) if result else 0.0}
+    except Exception as e:
+        return {"score": 0.0, "reason": f"grading failed: {e}"}
+
+
+def run_benchmark(
+    tasks_dir: Path,
+    suite: str,
+    results_dir: Path,
+    model: Optional[str] = None,
+    timeout_multiplier: float = 1.0,
+) -> dict[str, Any]:
+    """Run the benchmark suite."""
+    # Load tasks
+    all_tasks: list[dict] = []
+    manifest_path = tasks_dir / "manifest.yaml"
+
+    if suite == "all":
+        task_files = sorted(tasks_dir.glob("task_*.md"))
+        for tf in task_files:
+            try:
+                all_tasks.append(load_task(tf))
+            except Exception as e:
+                print(f"  Skip {tf.name}: {e}", file=sys.stderr)
+    else:
+        task_ids = [t.strip() for t in suite.split(",")]
+        for tid in task_ids:
+            tf = tasks_dir / f"{tid}.md"
+            if not tf.exists():
+                print(f"  Task not found: {tf}", file=sys.stderr)
+                continue
+            all_tasks.append(load_task(tf))
+
+    if not all_tasks:
+        print("No tasks loaded.", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loaded {len(all_tasks)} tasks")
+
+    # Create run directory
+    results_dir.mkdir(parents=True, exist_ok=True)
+    run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    run_dir = results_dir / run_id
+    run_dir.mkdir()
+
+    # Record metadata
+    cw_version = "unknown"
+    try:
+        vr = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
+        if vr.returncode == 0:
+            cw_version = vr.stdout.strip()
+    except FileNotFoundError:
+        pass
+
+    metadata = {
+        "codewhale_version": cw_version,
+        "model": model or "default",
+        "suite": suite,
+        "task_count": len(all_tasks),
+        "run_id": run_id,
+        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
+    }
+    (run_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))
+
+    # Run tasks
+    results: list[dict] = []
+    total_score = 0.0
+
+    for i, task in enumerate(all_tasks, 1):
+        task_id = task["task_id"]
+        print(f"\n{'='*60}")
+        print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
+        print(f"  Category: {task['category']}")
+        print(f"{'='*60}")
+
+        workspace = prepare_workspace(task, run_dir)
+        timeout = int(task["timeout_seconds"] * timeout_multiplier)
+
+        # Run codewhale
+        print(f"  Running codewhale exec (timeout: {timeout}s)...")
+        result = run_codewhale(workspace, task["prompt"], timeout, model=model)
+        print(f"  Completed in {result['elapsed_seconds']:.1f}s (exit {result['exit_code']})")
+
+        if result["timed_out"]:
+            print(f"  ⏰ TIMED OUT")
+
+        # Build a minimal transcript for grading
+        transcript = [{"role": "user", "content": task["prompt"]}]
+        if result["stdout"]:
+            transcript.append({"role": "assistant", "content": result["stdout"]})
+
+        # Grade
+        grade_result = {"score": 0.0, "reason": "not graded"}
+        if task["automated_checks"]:
+            grade_result = grade_automated(task, workspace, transcript)
+        elif task.get("llm_judge_rubric"):
+            grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}
+
+        score = grade_result.get("score", 0.0)
+        total_score += score
+
+        status = "✅" if score >= 1.0 else "🔶" if score > 0 else "❌"
+        print(f"  {status} Score: {score:.1%} — {grade_result.get('reason', '')}")
+
+        task_result = {
+            "task_id": task_id,
+            "name": task["name"],
+            "category": task["category"],
+            "score": score,
+            "grade": grade_result,
+            "elapsed_seconds": result["elapsed_seconds"],
+            "timed_out": result["timed_out"],
+            "exit_code": result["exit_code"],
+        }
+        results.append(task_result)
+
+        # Save individual result
+        (run_dir / f"{task_id}.json").write_text(json.dumps(task_result, indent=2))
+
+    # Summary
+    avg_score = total_score / len(results) if results else 0.0
+
+    # Group by category
+    categories: dict[str, list[dict]] = {}
+    for r in results:
+        cat = r["category"]
+        categories.setdefault(cat, []).append(r)
+
+    summary = {
+        "run_id": run_id,
+        "total_score": total_score,
+        "task_count": len(results),
+        "average_score": avg_score,
+        "categories": {
+            cat: {
+                "score": sum(r["score"] for r in tasks) / len(tasks) if tasks else 0,
+                "tasks": len(tasks),
+            }
+            for cat, tasks in categories.items()
+        },
+        "results": results,
+        "metadata": metadata,
+    }
+
+    (run_dir / "summary.json").write_text(json.dumps(summary, indent=2))
+
+    # Print summary
+    print(f"\n{'='*60}")
+    print(f"PINCHBENCH SCORE SUMMARY (CodeWhale)")
+    print(f"{'='*60}")
+    print(f"\n  Overall: {avg_score:.1%} ({total_score:.1f}/{len(results)})\n")
+    print(f"  {'CATEGORY':<25} {'SCORE':>8}  {'TASKS':>5}")
+    print(f"  {'-'*45}")
+    for cat, info in sorted(summary["categories"].items()):
+        pct = info["score"] * 100
+        marker = "🔴" if pct < 25 else "🟡" if pct < 75 else "🟢"
+        print(f"  {marker} {cat:<23} {pct:>6.1f}%  {info['tasks']:>5}")
+    print(f"  {'-'*45}")
+    print(f"\nResults: {run_dir}")
+
+    return summary
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run PinchBench tasks through CodeWhale (no OpenClaw)"
+    )
+    parser.add_argument(
+        "--tasks-dir",
+        type=Path,
+        default=Path("/tmp/pinchbench/tasks"),
+        help="PinchBench tasks directory",
+    )
+    parser.add_argument(
+        "--suite",
+        default="task_calendar",
+        help="Comma-separated task IDs, or 'all'",
+    )
+    parser.add_argument(
+        "--results-dir",
+        type=Path,
+        default=Path("./results/pinchbench-codewhale"),
+        help="Results output directory",
+    )
+    parser.add_argument("--model", default=None, help="Model override for codewhale")
+    parser.add_argument(
+        "--timeout-multiplier",
+        type=float,
+        default=1.0,
+        help="Scale task timeouts",
+    )
+    args = parser.parse_args()
+
+    run_benchmark(
+        tasks_dir=args.tasks_dir,
+        suite=args.suite,
+        results_dir=args.results_dir,
+        model=args.model,
+        timeout_multiplier=args.timeout_multiplier,
+    )
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()