#!/usr/bin/env python3
"""
CodeWhale-native PinchBench runner.

Loads PinchBench tasks, runs them through codewhale exec, and grades results.
No OpenClaw dependency.

Runs PinchBench tasks directly through ``codewhale exec --auto`` instead of
going through OpenClaw. Loads task markdown, creates a workspace, runs the
prompt, and grades using PinchBench's embedded automated checks. No external
agent framework dependency: just codewhale + pyyaml.

Usage:
    python scripts/benchmarks/pinchbench_codewhale.py --help
    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
    python scripts/benchmarks/pinchbench_codewhale.py --all
"""
# Provenance: recovered from patch c8fcef7f1ea7ef7c640f2b42d5dbc690cf78809a
# ("feat(benchmarks): add CodeWhale-native PinchBench runner", Hunter B,
# Thu, 4 Jun 2026 20:26:05 -0700), which adds this file as
# scripts/benchmarks/pinchbench_codewhale.py.
#
# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "pyyaml>=6.0.1",
# ]
# ///

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

# Timeout applied when a task's frontmatter does not set ``timeout_seconds``.
DEFAULT_TIMEOUT_SECONDS = 120

# Exit code reported when the codewhale executable cannot be started
# (mirrors the shell's "command not found" convention).
EXIT_CODE_NOT_FOUND = 127

_FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n(.*)$", re.DOTALL)
_SECTION_HEADER_RE = re.compile(r"^##\s+(.+)$")
# Accept ```python / ```py, optionally followed by trailing spaces or tabs.
_PYTHON_FENCE_RE = re.compile(r"```(?:python|py)[ \t]*\n(.*?)```", re.DOTALL)


def split_sections(body: str) -> dict[str, str]:
    """Split a markdown body into its level-2 (``## Title``) sections.

    Args:
        body: Markdown text following the YAML frontmatter.

    Returns:
        Mapping of section title to the stripped text below it. Text before
        the first ``##`` header is discarded; when a title repeats, the last
        occurrence wins.
    """
    sections: dict[str, str] = {}
    current: Optional[str] = None
    buffer: list[str] = []
    for line in body.split("\n"):
        header = _SECTION_HEADER_RE.match(line)
        if header is None:
            buffer.append(line)
            continue
        if current is not None:
            sections[current] = "\n".join(buffer).strip()
        # Strip so "## Prompt  " still resolves to the "Prompt" section.
        current = header.group(1).strip()
        buffer = []
    if current is not None:
        sections[current] = "\n".join(buffer).strip()
    return sections


def load_task(task_path: Path) -> dict[str, Any]:
    """Load a PinchBench task markdown file.

    Args:
        task_path: Path to a markdown file with YAML frontmatter followed by
            ``##`` sections.

    Returns:
        Task dictionary with the frontmatter fields (``task_id``, ``name``,
        ``category``, ``grading_type``, ``timeout_seconds``,
        ``workspace_files``), the section texts (``prompt``,
        ``automated_checks``, ``llm_judge_rubric``, ``grading_criteria``,
        ``expected_behavior``) and the source ``path``.

    Raises:
        ValueError: If the file has no frontmatter block or the frontmatter
            is not a YAML mapping.
    """
    content = task_path.read_text(encoding="utf-8")

    fm_match = _FRONTMATTER_RE.match(content)
    if not fm_match:
        raise ValueError(f"No YAML frontmatter in {task_path}")

    # Imported lazily so ``--help`` works even when pyyaml is not installed.
    import yaml

    frontmatter = yaml.safe_load(fm_match.group(1))
    if frontmatter is None:
        # An empty frontmatter block parses to None; treat it as "no fields".
        frontmatter = {}
    if not isinstance(frontmatter, dict):
        raise ValueError(f"YAML frontmatter in {task_path} is not a mapping")

    sections = split_sections(fm_match.group(2))

    timeout_seconds = frontmatter.get("timeout_seconds")
    if timeout_seconds is None:
        timeout_seconds = DEFAULT_TIMEOUT_SECONDS

    return {
        "task_id": frontmatter.get("id", task_path.stem),
        # Explicit YAML nulls are normalised so later formatting and sorting
        # of names/categories cannot fail on None.
        "name": frontmatter.get("name") or "",
        "category": frontmatter.get("category") or "",
        "grading_type": frontmatter.get("grading_type", "automated"),
        "timeout_seconds": timeout_seconds,
        "workspace_files": frontmatter.get("workspace_files") or [],
        "prompt": sections.get("Prompt", "").strip(),
        "automated_checks": sections.get("Automated Checks", None),
        "llm_judge_rubric": sections.get("LLM Judge Rubric", None),
        "grading_criteria": sections.get("Grading Criteria", ""),
        "expected_behavior": sections.get("Expected Behavior", ""),
        "path": task_path,
    }


def prepare_workspace(task: dict, run_dir: Path) -> Path:
    """Create the workspace directory for a task and seed its files.

    The workspace is ``run_dir / task["task_id"]``. It is initialised as a
    git repository (best effort), populated from ``task["workspace_files"]``
    (a list of ``{relative_path: content}`` mappings) and committed.

    Args:
        task: Task dictionary as produced by ``load_task``.
        run_dir: Directory holding all workspaces of the current run.

    Returns:
        Path of the prepared workspace.
    """
    workspace = run_dir / task["task_id"]
    workspace.mkdir(parents=True, exist_ok=True)

    have_git = shutil.which("git") is not None
    if not have_git:
        print(
            " Warning: git not found; workspace will not be a git repository",
            file=sys.stderr,
        )

    def git(*args: str) -> None:
        # Best effort: failures are ignored, exactly like check=False.
        if have_git:
            subprocess.run(
                ["git", *args], cwd=workspace, capture_output=True, check=False
            )

    # Initialize git repo so codewhale works
    git("init")
    git("config", "user.email", "bench@codewhale")
    git("config", "user.name", "Benchmark")

    # Create workspace files from task definition
    root = workspace.resolve()
    for entry in task.get("workspace_files") or []:
        if not isinstance(entry, dict):
            continue
        for rel_path, content in entry.items():
            target = (workspace / str(rel_path)).resolve()
            if root not in target.parents:
                # Absolute paths or ".." segments would escape the workspace.
                print(
                    f" Skipping workspace file outside workspace: {rel_path}",
                    file=sys.stderr,
                )
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            text = "" if content is None else str(content)
            target.write_text(text, encoding="utf-8")

    # Commit initial state
    git("add", "-A")
    git("commit", "-m", "initial", "--allow-empty")

    return workspace


def run_codewhale(
    workspace: Path,
    prompt: str,
    timeout_seconds: int,
    model: Optional[str] = None,
    executable: str = "codewhale",
) -> dict[str, Any]:
    """Run ``codewhale exec --auto`` on a task prompt and return the result.

    Args:
        workspace: Directory used both as ``--workspace`` and as the working
            directory of the child process.
        prompt: Task prompt, passed as the final positional argument.
        timeout_seconds: Maximum run time before the process is killed.
        model: Optional value forwarded as ``--model``.
        executable: Name or path of the codewhale binary.

    Returns:
        Dictionary with ``exit_code``, ``stdout``, ``stderr``,
        ``elapsed_seconds`` and ``timed_out``. A timeout yields exit code -1
        with stderr ``"TIMEOUT"``; an executable that cannot be started
        yields exit code 127 with the OS error text in stderr.
    """
    cmd = [executable, "exec", "--auto", "--workspace", str(workspace)]
    if model:
        cmd.extend(["--model", model])
    cmd.append(prompt)

    # Monotonic clock: elapsed time must not jump with wall-clock changes.
    start = time.monotonic()

    def outcome(
        exit_code: int, stdout: str, stderr: str, timed_out: bool
    ) -> dict[str, Any]:
        return {
            "exit_code": exit_code,
            "stdout": stdout,
            "stderr": stderr,
            "elapsed_seconds": time.monotonic() - start,
            "timed_out": timed_out,
        }

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly so output with non-locale characters cannot
            # raise UnicodeDecodeError on hosts with a non-UTF-8 locale.
            encoding="utf-8",
            errors="replace",
            timeout=timeout_seconds,
            cwd=workspace,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return outcome(-1, "", "TIMEOUT", True)
    except FileNotFoundError as exc:
        # Treated as a failed task rather than aborting the whole benchmark.
        return outcome(
            EXIT_CODE_NOT_FOUND, "", f"codewhale could not be started: {exc}", False
        )
    return outcome(proc.returncode, proc.stdout, proc.stderr, False)


def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str, Any]:
    """Run the automated grading check from the task definition.

    The task's "Automated Checks" section must contain a fenced Python code
    block defining ``grade(transcript, workspace_path)``. A dict result is
    treated as per-criterion scores and its numeric values are averaged into
    ``score``; any other result is converted to a float.

    Args:
        task: Task dictionary; only ``automated_checks`` is read.
        workspace: Workspace path, passed to ``grade`` as a string.
        transcript: Conversation transcript passed to ``grade``.

    Returns:
        Dictionary that always contains ``score``; on failure it also
        contains a ``reason`` and the score is 0.0.
    """
    checks_code = task.get("automated_checks")
    if not checks_code:
        return {"score": 0.0, "reason": "no automated checks defined"}

    # Extract the grade function from the markdown code block
    code_match = _PYTHON_FENCE_RE.search(checks_code)
    if not code_match:
        return {"score": 0.0, "reason": "no python code block in automated checks"}

    # SECURITY NOTE: this executes code taken from the task file in the
    # current process. Only run task suites from a source you trust.
    namespace: dict[str, Any] = {}
    try:
        exec(code_match.group(1), namespace)
    except Exception as e:
        return {"score": 0.0, "reason": f"grading code failed to load: {e}"}

    grade_fn = namespace.get("grade")
    if not callable(grade_fn):
        return {"score": 0.0, "reason": "no grade() function in automated checks"}

    try:
        result = grade_fn(transcript, str(workspace))
        if isinstance(result, dict):
            # PinchBench returns per-criterion scores; average them
            numeric = [v for v in result.values() if isinstance(v, (int, float))]
            # Work on a copy so the grader's own dict is not mutated.
            graded = dict(result)
            graded["score"] = sum(numeric) / len(numeric) if numeric else 0.0
            return graded
        return {"score": float(result) if result else 0.0}
    except Exception as e:
        return {"score": 0.0, "reason": f"grading failed: {e}"}


def _write_json(path: Path, payload: Any) -> None:
    """Write ``payload`` to ``path`` as indented JSON."""
    # default=str is deliberate: grader output is arbitrary task-provided
    # data, and one unserialisable value must not lose a finished result.
    path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")


def _load_suite(tasks_dir: Path, suite: str) -> list[dict]:
    """Load every task selected by ``suite``, skipping unreadable ones."""
    if suite == "all":
        task_files = sorted(tasks_dir.glob("task_*.md"))
    else:
        task_files = []
        for tid in (part.strip() for part in suite.split(",")):
            if not tid:
                # Tolerate stray commas such as "task_a,".
                continue
            tf = tasks_dir / f"{tid}.md"
            if not tf.exists():
                print(f" Task not found: {tf}", file=sys.stderr)
                continue
            task_files.append(tf)

    tasks: list[dict] = []
    for tf in task_files:
        try:
            tasks.append(load_task(tf))
        except Exception as e:
            # Loader boundary: report the broken task and keep going.
            print(f" Skip {tf.name}: {e}", file=sys.stderr)
    return tasks


def _create_run_dir(results_dir: Path) -> tuple[str, Path]:
    """Create a fresh run directory and return ``(run_id, run_dir)``."""
    results_dir.mkdir(parents=True, exist_ok=True)
    base_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    run_id = base_id
    attempt = 1
    while True:
        run_dir = results_dir / run_id
        try:
            run_dir.mkdir()
        except FileExistsError:
            # Another run started within the same second; pick a new suffix.
            attempt += 1
            run_id = f"{base_id}-{attempt}"
        else:
            return run_id, run_dir


def _codewhale_version() -> str:
    """Return the output of ``codewhale --version``, or ``"unknown"``."""
    try:
        vr = subprocess.run(
            ["codewhale", "--version"],
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        return "unknown"
    return vr.stdout.strip() if vr.returncode == 0 else "unknown"


def _run_task(
    task: dict,
    index: int,
    total: int,
    run_dir: Path,
    model: Optional[str],
    timeout_multiplier: float,
) -> dict[str, Any]:
    """Run and grade one task, save its result file and return the result."""
    task_id = task["task_id"]
    print(f"\n{'='*60}")
    print(f"Task {index}/{total}: {task_id} — {task['name']}")
    print(f" Category: {task['category']}")
    print(f"{'='*60}")

    workspace = prepare_workspace(task, run_dir)
    # Never below one second: a zero timeout would fail every task at once.
    timeout = max(1, int(float(task["timeout_seconds"]) * timeout_multiplier))

    # Run codewhale
    print(f" Running codewhale exec (timeout: {timeout}s)...")
    result = run_codewhale(workspace, task["prompt"], timeout, model=model)
    print(f" Completed in {result['elapsed_seconds']:.1f}s (exit {result['exit_code']})")

    if result["timed_out"]:
        print(" ⏰ TIMED OUT")

    # Build a minimal transcript for grading
    transcript = [{"role": "user", "content": task["prompt"]}]
    if result["stdout"]:
        transcript.append({"role": "assistant", "content": result["stdout"]})

    # Grade
    grade_result = {"score": 0.0, "reason": "not graded"}
    if task["automated_checks"]:
        grade_result = grade_automated(task, workspace, transcript)
    elif task.get("llm_judge_rubric"):
        grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}

    score = grade_result.get("score", 0.0)

    status = "✅" if score >= 1.0 else "🔶" if score > 0 else "❌"
    print(f" {status} Score: {score:.1%} — {grade_result.get('reason', '')}")

    task_result = {
        "task_id": task_id,
        "name": task["name"],
        "category": task["category"],
        "score": score,
        "grade": grade_result,
        "elapsed_seconds": result["elapsed_seconds"],
        "timed_out": result["timed_out"],
        "exit_code": result["exit_code"],
    }

    # Save individual result
    _write_json(run_dir / f"{task_id}.json", task_result)
    return task_result


def _print_summary(summary: dict[str, Any], run_dir: Path) -> None:
    """Print the overall and per-category score table."""
    print(f"\n{'='*60}")
    print("PINCHBENCH SCORE SUMMARY (CodeWhale)")
    print(f"{'='*60}")
    print(
        f"\n Overall: {summary['average_score']:.1%} "
        f"({summary['total_score']:.1f}/{summary['task_count']})\n"
    )
    print(f" {'CATEGORY':<25} {'SCORE':>8} {'TASKS':>5}")
    print(f" {'-'*45}")
    for cat, info in sorted(summary["categories"].items()):
        pct = info["score"] * 100
        marker = "🔴" if pct < 25 else "🟡" if pct < 75 else "🟢"
        print(f" {marker} {cat:<23} {pct:>6.1f}% {info['tasks']:>5}")
    print(f" {'-'*45}")
    print(f"\nResults: {run_dir}")


def run_benchmark(
    tasks_dir: Path,
    suite: str,
    results_dir: Path,
    model: Optional[str] = None,
    timeout_multiplier: float = 1.0,
) -> dict[str, Any]:
    """Run the benchmark suite.

    Args:
        tasks_dir: Directory containing the PinchBench task markdown files.
        suite: ``"all"`` (every ``task_*.md`` file) or a comma-separated
            list of task file stems.
        results_dir: Directory under which a timestamped run directory with
            ``metadata.json``, per-task JSON files and ``summary.json`` is
            created.
        model: Optional model override forwarded to codewhale.
        timeout_multiplier: Factor applied to each task's timeout.

    Returns:
        The summary dictionary that is also written to ``summary.json``.
        Tasks without automated checks are scored 0.0 and still count
        towards the average.

    Raises:
        SystemExit: With status 1 when no task could be loaded.
    """
    all_tasks = _load_suite(tasks_dir, suite)
    if not all_tasks:
        print("No tasks loaded.", file=sys.stderr)
        sys.exit(1)

    print(f"Loaded {len(all_tasks)} tasks")

    # Create run directory
    run_id, run_dir = _create_run_dir(results_dir)

    # Record metadata
    metadata = {
        "codewhale_version": _codewhale_version(),
        "model": model or "default",
        "suite": suite,
        "task_count": len(all_tasks),
        "run_id": run_id,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    }
    _write_json(run_dir / "metadata.json", metadata)

    # Run tasks
    results: list[dict] = []
    for i, task in enumerate(all_tasks, 1):
        results.append(
            _run_task(task, i, len(all_tasks), run_dir, model, timeout_multiplier)
        )

    # Summary
    total_score = sum(r["score"] for r in results)
    avg_score = total_score / len(results) if results else 0.0

    # Group by category
    categories: dict[str, list[dict]] = {}
    for r in results:
        categories.setdefault(r["category"], []).append(r)

    summary = {
        "run_id": run_id,
        "total_score": total_score,
        "task_count": len(results),
        "average_score": avg_score,
        "categories": {
            cat: {
                "score": sum(r["score"] for r in tasks) / len(tasks) if tasks else 0,
                "tasks": len(tasks),
            }
            for cat, tasks in categories.items()
        },
        "results": results,
        "metadata": metadata,
    }

    _write_json(run_dir / "summary.json", summary)
    _print_summary(summary, run_dir)

    return summary


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the benchmark runner."""
    parser = argparse.ArgumentParser(
        description="Run PinchBench tasks through CodeWhale (no OpenClaw)"
    )
    parser.add_argument(
        "--tasks-dir",
        type=Path,
        default=Path("/tmp/pinchbench/tasks"),
        help="PinchBench tasks directory",
    )
    selection = parser.add_mutually_exclusive_group()
    selection.add_argument(
        "--suite",
        default="task_calendar",
        help="Comma-separated task IDs, or 'all'",
    )
    # The module usage documents ``--all``; it is shorthand for
    # ``--suite all`` and stores into the same destination.
    selection.add_argument(
        "--all",
        dest="suite",
        action="store_const",
        const="all",
        default=argparse.SUPPRESS,
        help="Run every task (same as --suite all)",
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("./results/pinchbench-codewhale"),
        help="Results output directory",
    )
    parser.add_argument("--model", default=None, help="Model override for codewhale")
    parser.add_argument(
        "--timeout-multiplier",
        type=float,
        default=1.0,
        help="Scale task timeouts",
    )
    return parser


def main(argv: Optional[list[str]] = None) -> None:
    """Parse command-line arguments and run the selected benchmark suite.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)

    run_benchmark(
        tasks_dir=args.tasks_dir,
        suite=args.suite,
        results_dir=args.results_dir,
        model=args.model,
        timeout_multiplier=args.timeout_multiplier,
    )


if __name__ == "__main__":
    main()