From ce46e29e384180ad685fc54022a8556ea9aa9615 Mon Sep 17 00:00:00 2001
From: Hunter B <hmbown@gmail.com>
Date: Fri, 5 Jun 2026 15:57:06 -0700
Subject: [PATCH] fix(benchmarks): fix workspace file copying and add LLM judge
 grading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

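Fixes two bugs found in the initial run:
1. workspace_files entries use the [{source, dest}] format, not the
   legacy {path: content} mapping, and the source files live in
   PinchBench's assets/ directory, not tasks/. prepare_workspace now
   copies each source to its dest, checking tasks/ first and then
   assets/.
2. LLM judge tasks (writing, research) scored 0% because the judge
   wasn't implemented. The runner now uses `codewhale exec` as the
   judge: it sends the rubric plus workspace contents and parses a
   JSON score from the response. For tasks with a rubric, the judge
   runs when there are no automated checks or the automated score is 0.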

Also strips ANSI escape sequences from the judge's output and control
characters from the extracted JSON to prevent JSON parse failures.
---
 scripts/benchmarks/pinchbench_codewhale.py | 143 +++++++++++++++++----
 1 file changed, 116 insertions(+), 27 deletions(-)

diff --git a/scripts/benchmarks/pinchbench_codewhale.py b/scripts/benchmarks/pinchbench_codewhale.py
index 84f3f982..c70e615d 100644
--- a/scripts/benchmarks/pinchbench_codewhale.py
+++ b/scripts/benchmarks/pinchbench_codewhale.py
@@ -9,7 +9,7 @@ Usage:
     python scripts/benchmarks/pinchbench_codewhale.py --help
     python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
     python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
-    python scripts/benchmarks/pinchbench_codewhale.py --all
+    python scripts/benchmarks/pinchbench_codewhale.py --suite all
 """
 # /// script
 # requires-python = ">=3.10"
@@ -25,7 +25,6 @@ import re
 import shutil
 import subprocess
 import sys
-import tempfile
 import time
 from datetime import datetime, timezone
 from pathlib import Path
@@ -36,7 +35,6 @@ def load_task(task_path: Path) -> dict[str, Any]:
     """Load a PinchBench task markdown file."""
     content = task_path.read_text(encoding="utf-8")
 
-    # Extract YAML frontmatter
     fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
     if not fm_match:
         raise ValueError(f"No YAML frontmatter in {task_path}")
@@ -45,7 +43,6 @@ def load_task(task_path: Path) -> dict[str, Any]:
     frontmatter = yaml.safe_load(fm_match.group(1))
     body = fm_match.group(2)
 
-    # Extract sections
     sections: dict[str, str] = {}
     current_section = None
     current_content: list[str] = []
@@ -77,7 +74,7 @@ def load_task(task_path: Path) -> dict[str, Any]:
     }
 
 
-def prepare_workspace(task: dict, run_dir: Path) -> Path:
+def prepare_workspace(task: dict, run_dir: Path, tasks_dir: Path) -> Path:
     """Create a temp workspace with any task-required files."""
     workspace = run_dir / task["task_id"]
     workspace.mkdir(parents=True, exist_ok=True)
@@ -93,13 +90,26 @@ def prepare_workspace(task: dict, run_dir: Path) -> Path:
         cwd=workspace, capture_output=True, check=False,
     )
 
-    # Create workspace files from task definition
+    # Copy workspace files — source paths may be relative to tasks/ or assets/
+    assets_dir = tasks_dir.parent / "assets"
     for wf in task.get("workspace_files", []):
-        if isinstance(wf, dict):
+        if isinstance(wf, dict) and "source" in wf and "dest" in wf:
+            # Try tasks_dir first, then assets_dir
+            src = tasks_dir / wf["source"]
+            if not src.exists():
+                src = assets_dir / wf["source"]
+            dst = workspace / wf["dest"]
+            dst.parent.mkdir(parents=True, exist_ok=True)
+            if src.exists():
+                shutil.copy2(src, dst)
+            else:
+                print(f"  Warning: workspace file not found: {wf['source']}", file=sys.stderr)
+        elif isinstance(wf, dict):
+            # Legacy format: {path: content}
             for path, content in wf.items():
                 fpath = workspace / path
                 fpath.parent.mkdir(parents=True, exist_ok=True)
-                fpath.write_text(content, encoding="utf-8")
+                fpath.write_text(str(content), encoding="utf-8")
 
     # Commit initial state
     subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True, check=False)
@@ -162,14 +172,11 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
     if not checks_code:
         return {"score": 0.0, "reason": "no automated checks defined"}
 
-    # Extract the grade function from the markdown code block
     code_match = re.search(r"```python\n(.*?)```", checks_code, re.DOTALL)
     if not code_match:
         return {"score": 0.0, "reason": "no python code block in automated checks"}
 
     code = code_match.group(1)
-
-    # Execute the grading function
     namespace: dict[str, Any] = {}
     try:
         exec(code, namespace)
@@ -183,7 +190,6 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
     try:
         result = grade_fn(transcript, str(workspace))
         if isinstance(result, dict):
-            # PinchBench returns per-criterion scores; average them
             numeric = [v for v in result.values() if isinstance(v, (int, float))]
             avg = sum(numeric) / len(numeric) if numeric else 0.0
             result["score"] = avg
@@ -193,6 +199,83 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
         return {"score": 0.0, "reason": f"grading failed: {e}"}
 
 
+def grade_llm_judge(task: dict, workspace: Path, transcript: list, model: Optional[str] = None) -> dict[str, Any]:
+    """Use codewhale as an LLM judge to grade a task."""
+    rubric = task.get("llm_judge_rubric")
+    if not rubric:  # no rubric: return a zero score without invoking codewhale
+        return {"score": 0.0, "reason": "no LLM judge rubric"}
+    # NOTE: `transcript` is not read anywhere in this function; only workspace files are shown to the judge
+    criteria = task.get("grading_criteria", "")
+    expected = task.get("expected_behavior", "")
+
+    # Collect workspace file contents (first 3000 characters of each) as context for the judge
+    ws_files = []
+    for f in workspace.rglob("*"):
+        if f.is_file() and ".git" not in str(f):  # NOTE(review): substring test on the whole path, so names like ".gitignore" are skipped too
+            try:
+                content = f.read_text(encoding="utf-8", errors="replace")[:3000]
+                ws_files.append(f"--- {f.name} ---\n{content}")  # header carries the bare file name, not the relative path
+            except Exception:
+                ws_files.append(f"--- {f.name} --- (binary/unreadable)")
+
+    ws_content = "\n\n".join(ws_files[:10])  # Only the first 10 collected entries (rglob iteration order) reach the prompt
+
+    judge_prompt = f"""You are a grading judge. Evaluate whether the agent's output meets the task requirements.
+
+TASK: {task['name']}
+
+EXPECTED BEHAVIOR:
+{expected}
+
+GRADING CRITERIA:
+{criteria}
+
+LLM JUDGE RUBRIC:
+{rubric}
+
+AGENT'S WORKSPACE FILES:
+{ws_content}
+
+Score the task on a scale of 0.0 to 1.0. Respond with ONLY a JSON object:
+{{"score": <float>, "reason": "<brief explanation>"}}
+
+Be strict but fair. Partial credit is OK."""
+
+    cmd = ["codewhale", "exec", "--auto", "--workspace", str(workspace)]
+    if model:
+        cmd.extend(["--model", model])
+    cmd.append(judge_prompt)  # the full prompt is passed as the last argv entry
+
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=120,  # seconds; a timeout raises and is handled by the except below
+            cwd=workspace,
+            check=False,  # the exit status is never inspected; only stdout is parsed
+        )
+        # Pull the judge's JSON verdict out of stdout, cleaning terminal noise that would break json.loads
+        output = result.stdout
+        # Remove ANSI CSI sequences (ESC [ ... letter) and BEL-terminated OSC sequences (ESC ] ... BEL)
+        output = re.sub(r'\x1b\[[0-9;]*[a-zA-Z]', '', output)
+        output = re.sub(r'\x1b\][^\x07]*\x07', '', output)
+        json_match = re.search(r'\{[^{}]*"score"[^{}]*\}', output)  # first brace-free {...} span containing "score"
+        if json_match:
+            raw = json_match.group()
+            # Strip control characters except tab, newline and carriage return
+            raw = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', raw)
+            parsed = json.loads(raw)
+            return {
+                "score": float(parsed.get("score", 0.0)),  # returned as-is: not clamped to the 0.0-1.0 range the prompt asks for
+                "reason": parsed.get("reason", "llm judge"),
+                "judge": "llm",
+            }
+        return {"score": 0.0, "reason": "llm judge returned unparseable response", "judge": "llm"}
+    except Exception as e:  # launch failure, timeout, or a JSON/float conversion error all yield a zero score
+        return {"score": 0.0, "reason": f"llm judge failed: {e}", "judge": "llm"}
+
+
 def run_benchmark(
     tasks_dir: Path,
     suite: str,
@@ -201,9 +284,7 @@ def run_benchmark(
     timeout_multiplier: float = 1.0,
 ) -> dict[str, Any]:
     """Run the benchmark suite."""
-    # Load tasks
     all_tasks: list[dict] = []
-    manifest_path = tasks_dir / "manifest.yaml"
 
     if suite == "all":
         task_files = sorted(tasks_dir.glob("task_*.md"))
@@ -227,13 +308,11 @@ def run_benchmark(
 
     print(f"Loaded {len(all_tasks)} tasks")
 
-    # Create run directory
     results_dir.mkdir(parents=True, exist_ok=True)
     run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
     run_dir = results_dir / run_id
     run_dir.mkdir()
 
-    # Record metadata
     cw_version = "unknown"
     try:
         vr = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
@@ -252,7 +331,6 @@ def run_benchmark(
     }
     (run_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))
 
-    # Run tasks
     results: list[dict] = []
     total_score = 0.0
 
@@ -260,10 +338,10 @@ def run_benchmark(
         task_id = task["task_id"]
         print(f"\n{'='*60}")
         print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
-        print(f"  Category: {task['category']}")
+        print(f"  Category: {task['category']} | Grading: {task['grading_type']}")
         print(f"{'='*60}")
 
-        workspace = prepare_workspace(task, run_dir)
+        workspace = prepare_workspace(task, run_dir, tasks_dir)
         timeout = int(task["timeout_seconds"] * timeout_multiplier)
 
         # Run codewhale
@@ -274,17 +352,31 @@ def run_benchmark(
         if result["timed_out"]:
             print(f"  ⏰ TIMED OUT")
 
-        # Build a minimal transcript for grading
+        # Build transcript for grading
         transcript = [{"role": "user", "content": task["prompt"]}]
         if result["stdout"]:
             transcript.append({"role": "assistant", "content": result["stdout"]})
 
-        # Grade
+        # Grade based on type
+        grading_type = task.get("grading_type", "automated")
+        has_automated = task.get("automated_checks") and "```python" in (task.get("automated_checks") or "")
+        has_llm_rubric = bool(task.get("llm_judge_rubric"))
+
         grade_result = {"score": 0.0, "reason": "not graded"}
-        if task["automated_checks"]:
+
+        if has_automated:
             grade_result = grade_automated(task, workspace, transcript)
-        elif task.get("llm_judge_rubric"):
-            grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}
+
+        # If automated score is 0 and there's an LLM rubric, try LLM judge
+        if grade_result.get("score", 0.0) == 0.0 and has_llm_rubric:
+            print(f"  Running LLM judge...")
+            llm_result = grade_llm_judge(task, workspace, transcript, model=model)
+            # Use LLM judge score if it's better, or if no automated checks
+            if not has_automated or llm_result.get("score", 0.0) > 0.0:
+                grade_result = llm_result
+
+        if not has_automated and not has_llm_rubric:
+            grade_result = {"score": 0.0, "reason": "no grading method defined"}
 
         score = grade_result.get("score", 0.0)
         total_score += score
@@ -304,13 +396,11 @@ def run_benchmark(
         }
         results.append(task_result)
 
-        # Save individual result
         (run_dir / f"{task_id}.json").write_text(json.dumps(task_result, indent=2))
 
     # Summary
     avg_score = total_score / len(results) if results else 0.0
 
-    # Group by category
     categories: dict[str, list[dict]] = {}
     for r in results:
         cat = r["category"]
@@ -334,7 +424,6 @@ def run_benchmark(
 
     (run_dir / "summary.json").write_text(json.dumps(summary, indent=2))
 
-    # Print summary
     print(f"\n{'='*60}")
     print(f"PINCHBENCH SCORE SUMMARY (CodeWhale)")
     print(f"{'='*60}")