codewhale/scripts/benchmarks/pinchbench_codewhale.py

#!/usr/bin/env python3
"""
CodeWhale-native PinchBench runner.

Loads PinchBench tasks, runs them through codewhale exec, and grades results.
No OpenClaw dependency.

Usage:
    python scripts/benchmarks/pinchbench_codewhale.py --help
    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
    python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
    python scripts/benchmarks/pinchbench_codewhale.py --all
"""
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "pyyaml>=6.0.1",
# ]
# ///

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional


def load_task(task_path: Path) -> dict[str, Any]:
    """Load a PinchBench task markdown file."""
    content = task_path.read_text(encoding="utf-8")

    # Extract YAML frontmatter
    fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
    if not fm_match:
        raise ValueError(f"No YAML frontmatter in {task_path}")

    import yaml
    frontmatter = yaml.safe_load(fm_match.group(1))
    body = fm_match.group(2)

    # Extract sections
    sections: dict[str, str] = {}
    current_section = None
    current_content: list[str] = []
    for line in body.split("\n"):
        header = re.match(r"^##\s+(.+)$", line)
        if header:
            if current_section:
                sections[current_section] = "\n".join(current_content).strip()
            current_section = header.group(1)
            current_content = []
        else:
            current_content.append(line)
    if current_section:
        sections[current_section] = "\n".join(current_content).strip()

    return {
        "task_id": frontmatter.get("id", task_path.stem),
        "name": frontmatter.get("name", ""),
        "category": frontmatter.get("category", ""),
        "grading_type": frontmatter.get("grading_type", "automated"),
        "timeout_seconds": frontmatter.get("timeout_seconds", 120),
        "workspace_files": frontmatter.get("workspace_files", []),
        "prompt": sections.get("Prompt", "").strip(),
        "automated_checks": sections.get("Automated Checks", None),
        "llm_judge_rubric": sections.get("LLM Judge Rubric", None),
        "grading_criteria": sections.get("Grading Criteria", ""),
        "expected_behavior": sections.get("Expected Behavior", ""),
        "path": task_path,
    }


def prepare_workspace(task: dict, run_dir: Path) -> Path:
    """Create a temp workspace with any task-required files."""
    workspace = run_dir / task["task_id"]
    workspace.mkdir(parents=True, exist_ok=True)

    # Initialize git repo so codewhale works
    subprocess.run(["git", "init"], cwd=workspace, capture_output=True, check=False)
    subprocess.run(
        ["git", "config", "user.email", "bench@codewhale"],
        cwd=workspace, capture_output=True, check=False,
    )
    subprocess.run(
        ["git", "config", "user.name", "Benchmark"],
        cwd=workspace, capture_output=True, check=False,
    )

    # Create workspace files from task definition
    for wf in task.get("workspace_files", []):
        if isinstance(wf, dict):
            for path, content in wf.items():
                fpath = workspace / path
                fpath.parent.mkdir(parents=True, exist_ok=True)
                fpath.write_text(content, encoding="utf-8")

    # Commit initial state
    subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True, check=False)
    subprocess.run(
        ["git", "commit", "-m", "initial", "--allow-empty"],
        cwd=workspace, capture_output=True, check=False,
    )

    return workspace


def run_codewhale(
    workspace: Path,
    prompt: str,
    timeout_seconds: int,
    model: Optional[str] = None,
) -> dict[str, Any]:
    """Run codewhale exec on a task and return the result."""
    cmd = [
        "codewhale", "exec",
        "--auto",
        "--workspace", str(workspace),
    ]
    if model:
        cmd.extend(["--model", model])
    cmd.append(prompt)

    start = time.time()
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=workspace,
            check=False,
        )
        elapsed = time.time() - start
        return {
            "exit_code": result.returncode,
            "stdout": result.stdout,
            "stderr": result.stderr,
            "elapsed_seconds": elapsed,
            "timed_out": False,
        }
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start
        return {
            "exit_code": -1,
            "stdout": "",
            "stderr": "TIMEOUT",
            "elapsed_seconds": elapsed,
            "timed_out": True,
        }


def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str, Any]:
    """Run the automated grading check from the task definition."""
    checks_code = task.get("automated_checks")
    if not checks_code:
        return {"score": 0.0, "reason": "no automated checks defined"}

    # Extract the grade function from the markdown code block
    code_match = re.search(r"```python\n(.*?)```", checks_code, re.DOTALL)
    if not code_match:
        return {"score": 0.0, "reason": "no python code block in automated checks"}

    code = code_match.group(1)

    # Execute the grading function
    namespace: dict[str, Any] = {}
    try:
        exec(code, namespace)
    except Exception as e:
        return {"score": 0.0, "reason": f"grading code failed to load: {e}"}

    grade_fn = namespace.get("grade")
    if not grade_fn:
        return {"score": 0.0, "reason": "no grade() function in automated checks"}

    try:
        result = grade_fn(transcript, str(workspace))
        if isinstance(result, dict):
            # PinchBench returns per-criterion scores; average them
            numeric = [v for v in result.values() if isinstance(v, (int, float))]
            avg = sum(numeric) / len(numeric) if numeric else 0.0
            result["score"] = avg
            return result
        return {"score": float(result) if result else 0.0}
    except Exception as e:
        return {"score": 0.0, "reason": f"grading failed: {e}"}


def run_benchmark(
    tasks_dir: Path,
    suite: str,
    results_dir: Path,
    model: Optional[str] = None,
    timeout_multiplier: float = 1.0,
) -> dict[str, Any]:
    """Run the benchmark suite."""
    # Load tasks
    all_tasks: list[dict] = []
    manifest_path = tasks_dir / "manifest.yaml"

    if suite == "all":
        task_files = sorted(tasks_dir.glob("task_*.md"))
        for tf in task_files:
            try:
                all_tasks.append(load_task(tf))
            except Exception as e:
                print(f"  Skip {tf.name}: {e}", file=sys.stderr)
    else:
        task_ids = [t.strip() for t in suite.split(",")]
        for tid in task_ids:
            tf = tasks_dir / f"{tid}.md"
            if not tf.exists():
                print(f"  Task not found: {tf}", file=sys.stderr)
                continue
            all_tasks.append(load_task(tf))

    if not all_tasks:
        print("No tasks loaded.", file=sys.stderr)
        sys.exit(1)

    print(f"Loaded {len(all_tasks)} tasks")

    # Create run directory
    results_dir.mkdir(parents=True, exist_ok=True)
    run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    run_dir = results_dir / run_id
    run_dir.mkdir()

    # Record metadata
    cw_version = "unknown"
    try:
        vr = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
        if vr.returncode == 0:
            cw_version = vr.stdout.strip()
    except FileNotFoundError:
        pass

    metadata = {
        "codewhale_version": cw_version,
        "model": model or "default",
        "suite": suite,
        "task_count": len(all_tasks),
        "run_id": run_id,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    }
    (run_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))

    # Run tasks
    results: list[dict] = []
    total_score = 0.0

    for i, task in enumerate(all_tasks, 1):
        task_id = task["task_id"]
        print(f"\n{'='*60}")
        print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
        print(f"  Category: {task['category']}")
        print(f"{'='*60}")

        workspace = prepare_workspace(task, run_dir)
        timeout = int(task["timeout_seconds"] * timeout_multiplier)

        # Run codewhale
        print(f"  Running codewhale exec (timeout: {timeout}s)...")
        result = run_codewhale(workspace, task["prompt"], timeout, model=model)
        print(f"  Completed in {result['elapsed_seconds']:.1f}s (exit {result['exit_code']})")

        if result["timed_out"]:
            print(f"  ⏰ TIMED OUT")

        # Build a minimal transcript for grading
        transcript = [{"role": "user", "content": task["prompt"]}]
        if result["stdout"]:
            transcript.append({"role": "assistant", "content": result["stdout"]})

        # Grade
        grade_result = {"score": 0.0, "reason": "not graded"}
        if task["automated_checks"]:
            grade_result = grade_automated(task, workspace, transcript)
        elif task.get("llm_judge_rubric"):
            grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}

        score = grade_result.get("score", 0.0)
        total_score += score

        status = "✅" if score >= 1.0 else "🔶" if score > 0 else "❌"
        print(f"  {status} Score: {score:.1%} — {grade_result.get('reason', '')}")

        task_result = {
            "task_id": task_id,
            "name": task["name"],
            "category": task["category"],
            "score": score,
            "grade": grade_result,
            "elapsed_seconds": result["elapsed_seconds"],
            "timed_out": result["timed_out"],
            "exit_code": result["exit_code"],
        }
        results.append(task_result)

        # Save individual result
        (run_dir / f"{task_id}.json").write_text(json.dumps(task_result, indent=2))

    # Summary
    avg_score = total_score / len(results) if results else 0.0

    # Group by category
    categories: dict[str, list[dict]] = {}
    for r in results:
        cat = r["category"]
        categories.setdefault(cat, []).append(r)

    summary = {
        "run_id": run_id,
        "total_score": total_score,
        "task_count": len(results),
        "average_score": avg_score,
        "categories": {
            cat: {
                "score": sum(r["score"] for r in tasks) / len(tasks) if tasks else 0,
                "tasks": len(tasks),
            }
            for cat, tasks in categories.items()
        },
        "results": results,
        "metadata": metadata,
    }

    (run_dir / "summary.json").write_text(json.dumps(summary, indent=2))

    # Print summary
    print(f"\n{'='*60}")
    print(f"PINCHBENCH SCORE SUMMARY (CodeWhale)")
    print(f"{'='*60}")
    print(f"\n  Overall: {avg_score:.1%} ({total_score:.1f}/{len(results)})\n")
    print(f"  {'CATEGORY':<25} {'SCORE':>8}  {'TASKS':>5}")
    print(f"  {'-'*45}")
    for cat, info in sorted(summary["categories"].items()):
        pct = info["score"] * 100
        marker = "🔴" if pct < 25 else "🟡" if pct < 75 else "🟢"
        print(f"  {marker} {cat:<23} {pct:>6.1f}%  {info['tasks']:>5}")
    print(f"  {'-'*45}")
    print(f"\nResults: {run_dir}")

    return summary


def main():
    parser = argparse.ArgumentParser(
        description="Run PinchBench tasks through CodeWhale (no OpenClaw)"
    )
    parser.add_argument(
        "--tasks-dir",
        type=Path,
        default=Path("/tmp/pinchbench/tasks"),
        help="PinchBench tasks directory",
    )
    parser.add_argument(
        "--suite",
        default="task_calendar",
        help="Comma-separated task IDs, or 'all'",
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("./results/pinchbench-codewhale"),
        help="Results output directory",
    )
    parser.add_argument("--model", default=None, help="Model override for codewhale")
    parser.add_argument(
        "--timeout-multiplier",
        type=float,
        default=1.0,
        help="Scale task timeouts",
    )
    args = parser.parse_args()

    run_benchmark(
        tasks_dir=args.tasks_dir,
        suite=args.suite,
        results_dir=args.results_dir,
        model=args.model,
        timeout_multiplier=args.timeout_multiplier,
    )


if __name__ == "__main__":
    main()