fix(benchmarks): fix workspace file copying and add LLM judge grading
Two bugs from the initial run:
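1. workspace_files entries use the [{source, dest}] format, not the legacy
{path: content} mapping, and the source files live in PinchBench's assets/
directory rather than tasks/. The copier now looks in tasks/ first, then
assets/, and warns when a source is missing; the legacy format is still accepted.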
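2. LLM judge tasks (writing, research) scored 0% because the judge
wasn't implemented. Now uses codewhale exec as the judge — sends
the rubric + workspace contents and parses a JSON score response.
The judge also runs as a fallback when automated checks score 0 and
the task defines an LLM rubric.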
Also strips ANSI escape codes and control characters from judge output
to prevent JSON parse failures.
This commit is contained in:
@@ -9,7 +9,7 @@ Usage:
|
|||||||
python scripts/benchmarks/pinchbench_codewhale.py --help
|
python scripts/benchmarks/pinchbench_codewhale.py --help
|
||||||
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
|
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar
|
||||||
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
|
python scripts/benchmarks/pinchbench_codewhale.py --suite task_calendar,task_stock
|
||||||
python scripts/benchmarks/pinchbench_codewhale.py --all
|
python scripts/benchmarks/pinchbench_codewhale.py --suite all
|
||||||
"""
|
"""
|
||||||
# /// script
|
# /// script
|
||||||
# requires-python = ">=3.10"
|
# requires-python = ">=3.10"
|
||||||
@@ -25,7 +25,6 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
|
||||||
import time
|
import time
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -36,7 +35,6 @@ def load_task(task_path: Path) -> dict[str, Any]:
|
|||||||
"""Load a PinchBench task markdown file."""
|
"""Load a PinchBench task markdown file."""
|
||||||
content = task_path.read_text(encoding="utf-8")
|
content = task_path.read_text(encoding="utf-8")
|
||||||
|
|
||||||
# Extract YAML frontmatter
|
|
||||||
fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
|
fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", content, re.DOTALL)
|
||||||
if not fm_match:
|
if not fm_match:
|
||||||
raise ValueError(f"No YAML frontmatter in {task_path}")
|
raise ValueError(f"No YAML frontmatter in {task_path}")
|
||||||
@@ -45,7 +43,6 @@ def load_task(task_path: Path) -> dict[str, Any]:
|
|||||||
frontmatter = yaml.safe_load(fm_match.group(1))
|
frontmatter = yaml.safe_load(fm_match.group(1))
|
||||||
body = fm_match.group(2)
|
body = fm_match.group(2)
|
||||||
|
|
||||||
# Extract sections
|
|
||||||
sections: dict[str, str] = {}
|
sections: dict[str, str] = {}
|
||||||
current_section = None
|
current_section = None
|
||||||
current_content: list[str] = []
|
current_content: list[str] = []
|
||||||
@@ -77,7 +74,7 @@ def load_task(task_path: Path) -> dict[str, Any]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def prepare_workspace(task: dict, run_dir: Path) -> Path:
|
def prepare_workspace(task: dict, run_dir: Path, tasks_dir: Path) -> Path:
|
||||||
"""Create a temp workspace with any task-required files."""
|
"""Create a temp workspace with any task-required files."""
|
||||||
workspace = run_dir / task["task_id"]
|
workspace = run_dir / task["task_id"]
|
||||||
workspace.mkdir(parents=True, exist_ok=True)
|
workspace.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -93,13 +90,26 @@ def prepare_workspace(task: dict, run_dir: Path) -> Path:
|
|||||||
cwd=workspace, capture_output=True, check=False,
|
cwd=workspace, capture_output=True, check=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create workspace files from task definition
|
# Copy workspace files — source paths may be relative to tasks/ or assets/
|
||||||
|
assets_dir = tasks_dir.parent / "assets"
|
||||||
for wf in task.get("workspace_files", []):
|
for wf in task.get("workspace_files", []):
|
||||||
if isinstance(wf, dict):
|
if isinstance(wf, dict) and "source" in wf and "dest" in wf:
|
||||||
|
# Try tasks_dir first, then assets_dir
|
||||||
|
src = tasks_dir / wf["source"]
|
||||||
|
if not src.exists():
|
||||||
|
src = assets_dir / wf["source"]
|
||||||
|
dst = workspace / wf["dest"]
|
||||||
|
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
if src.exists():
|
||||||
|
shutil.copy2(src, dst)
|
||||||
|
else:
|
||||||
|
print(f" Warning: workspace file not found: {wf['source']}", file=sys.stderr)
|
||||||
|
elif isinstance(wf, dict):
|
||||||
|
# Legacy format: {path: content}
|
||||||
for path, content in wf.items():
|
for path, content in wf.items():
|
||||||
fpath = workspace / path
|
fpath = workspace / path
|
||||||
fpath.parent.mkdir(parents=True, exist_ok=True)
|
fpath.parent.mkdir(parents=True, exist_ok=True)
|
||||||
fpath.write_text(content, encoding="utf-8")
|
fpath.write_text(str(content), encoding="utf-8")
|
||||||
|
|
||||||
# Commit initial state
|
# Commit initial state
|
||||||
subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True, check=False)
|
subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True, check=False)
|
||||||
@@ -162,14 +172,11 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
|
|||||||
if not checks_code:
|
if not checks_code:
|
||||||
return {"score": 0.0, "reason": "no automated checks defined"}
|
return {"score": 0.0, "reason": "no automated checks defined"}
|
||||||
|
|
||||||
# Extract the grade function from the markdown code block
|
|
||||||
code_match = re.search(r"```python\n(.*?)```", checks_code, re.DOTALL)
|
code_match = re.search(r"```python\n(.*?)```", checks_code, re.DOTALL)
|
||||||
if not code_match:
|
if not code_match:
|
||||||
return {"score": 0.0, "reason": "no python code block in automated checks"}
|
return {"score": 0.0, "reason": "no python code block in automated checks"}
|
||||||
|
|
||||||
code = code_match.group(1)
|
code = code_match.group(1)
|
||||||
|
|
||||||
# Execute the grading function
|
|
||||||
namespace: dict[str, Any] = {}
|
namespace: dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
exec(code, namespace)
|
exec(code, namespace)
|
||||||
@@ -183,7 +190,6 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
|
|||||||
try:
|
try:
|
||||||
result = grade_fn(transcript, str(workspace))
|
result = grade_fn(transcript, str(workspace))
|
||||||
if isinstance(result, dict):
|
if isinstance(result, dict):
|
||||||
# PinchBench returns per-criterion scores; average them
|
|
||||||
numeric = [v for v in result.values() if isinstance(v, (int, float))]
|
numeric = [v for v in result.values() if isinstance(v, (int, float))]
|
||||||
avg = sum(numeric) / len(numeric) if numeric else 0.0
|
avg = sum(numeric) / len(numeric) if numeric else 0.0
|
||||||
result["score"] = avg
|
result["score"] = avg
|
||||||
@@ -193,6 +199,83 @@ def grade_automated(task: dict, workspace: Path, transcript: list) -> dict[str,
|
|||||||
return {"score": 0.0, "reason": f"grading failed: {e}"}
|
return {"score": 0.0, "reason": f"grading failed: {e}"}
|
||||||
|
|
||||||
|
|
||||||
|
# Terminal escape sequences emitted by the CLI: CSI (colours, cursor moves)
# and OSC (e.g. window-title updates terminated by BEL).
_JUDGE_CSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]")
_JUDGE_OSC_RE = re.compile(r"\x1b\][^\x07]*\x07")
# Control characters other than tab, newline and carriage return.
_JUDGE_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def _collect_workspace_context(workspace: Path, max_files: int = 10, max_chars: int = 3000) -> str:
    """Render up to *max_files* workspace files as labelled text snippets."""
    entries: list[str] = []
    # Sorted so the same workspace always produces the same prompt.
    for path in sorted(workspace.rglob("*")):
        if len(entries) >= max_files:
            break
        rel = path.relative_to(workspace)
        # Compare path *components*: a substring test on the full path would
        # also drop ".github/", ".gitignore", or everything when the run
        # directory itself happens to contain ".git" in its name.
        if ".git" in rel.parts or not path.is_file():
            continue
        label = rel.as_posix()
        try:
            # Read only the prefix that is kept instead of the whole file.
            with path.open(encoding="utf-8", errors="replace") as fh:
                text = fh.read(max_chars)
        except OSError:
            entries.append(f"--- {label} --- (binary/unreadable)")
        else:
            entries.append(f"--- {label} ---\n{text}")
    return "\n\n".join(entries)


def _extract_judge_verdict(output: str) -> Optional[dict]:
    """Return the first JSON object in *output* that has a "score" key, if any."""
    cleaned = _JUDGE_CSI_RE.sub("", output)
    cleaned = _JUDGE_OSC_RE.sub("", cleaned)
    cleaned = _JUDGE_CONTROL_RE.sub("", cleaned)

    # strict=False tolerates literal newlines/tabs inside the "reason" string.
    decoder = json.JSONDecoder(strict=False)
    start = cleaned.find("{")
    while start != -1:
        try:
            # raw_decode copes with braces inside strings and nested objects,
            # and skips non-JSON text such as an echoed prompt template.
            candidate, _ = decoder.raw_decode(cleaned, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict) and "score" in candidate:
            return candidate
        start = cleaned.find("{", start + 1)
    return None


def grade_llm_judge(task: dict, workspace: Path, transcript: list, model: Optional[str] = None) -> dict[str, Any]:
    """Use codewhale as an LLM judge to grade a task.

    Builds a prompt from the task's expected behavior, grading criteria,
    rubric and a bounded snapshot of the workspace files, runs
    ``codewhale exec`` on it and parses a JSON verdict from stdout.

    Args:
        task: Loaded task definition. Reads ``llm_judge_rubric``,
            ``grading_criteria``, ``expected_behavior`` and ``name``.
        workspace: Directory holding the agent's output files; also used as
            the judge process's working directory.
        transcript: Accepted for signature parity with ``grade_automated``;
            currently not included in the judge prompt.
        model: Optional model name forwarded as ``--model``.

    Returns:
        A dict with ``score`` (clamped to 0.0-1.0) and ``reason``. Every
        result produced after the rubric check also carries ``"judge": "llm"``.
        Failures never raise; they are reported as a 0.0 score.
    """
    rubric = task.get("llm_judge_rubric")
    if not rubric:
        return {"score": 0.0, "reason": "no LLM judge rubric"}

    criteria = task.get("grading_criteria", "")
    expected = task.get("expected_behavior", "")
    task_name = task.get("name", "")
    ws_content = _collect_workspace_context(workspace)

    judge_prompt = f"""You are a grading judge. Evaluate whether the agent's output meets the task requirements.

TASK: {task_name}

EXPECTED BEHAVIOR:
{expected}

GRADING CRITERIA:
{criteria}

LLM JUDGE RUBRIC:
{rubric}

AGENT'S WORKSPACE FILES:
{ws_content}

Score the task on a scale of 0.0 to 1.0. Respond with ONLY a JSON object:
{{"score": <float>, "reason": "<brief explanation>"}}

Be strict but fair. Partial credit is OK."""
    # Process arguments cannot contain NUL; binary-ish workspace files read
    # with errors="replace" may still carry them.
    judge_prompt = judge_prompt.replace("\x00", "")

    # NOTE(review): the judge is started with --auto inside the graded
    # workspace; confirm it cannot modify the files it is judging.
    cmd = ["codewhale", "exec", "--auto", "--workspace", str(workspace)]
    if model:
        cmd.extend(["--model", model])
    cmd.append(judge_prompt)

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
            cwd=workspace,
            check=False,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # Missing binary, timeout, undecodable output, bad argument, ...
        return {"score": 0.0, "reason": f"llm judge failed: {e}", "judge": "llm"}

    verdict = _extract_judge_verdict(result.stdout or "")
    if verdict is None:
        return {"score": 0.0, "reason": "llm judge returned unparseable response", "judge": "llm"}

    try:
        score = float(verdict.get("score", 0.0))
    except (TypeError, ValueError) as e:
        return {"score": 0.0, "reason": f"llm judge failed: {e}", "judge": "llm"}
    if score != score:  # NaN never compares equal to itself
        score = 0.0
    # Keep the judge within the documented 0.0-1.0 scale so one bad reply
    # cannot skew the benchmark average.
    score = min(1.0, max(0.0, score))

    return {
        "score": score,
        "reason": verdict.get("reason", "llm judge"),
        "judge": "llm",
    }
|
||||||
|
|
||||||
|
|
||||||
def run_benchmark(
|
def run_benchmark(
|
||||||
tasks_dir: Path,
|
tasks_dir: Path,
|
||||||
suite: str,
|
suite: str,
|
||||||
@@ -201,9 +284,7 @@ def run_benchmark(
|
|||||||
timeout_multiplier: float = 1.0,
|
timeout_multiplier: float = 1.0,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Run the benchmark suite."""
|
"""Run the benchmark suite."""
|
||||||
# Load tasks
|
|
||||||
all_tasks: list[dict] = []
|
all_tasks: list[dict] = []
|
||||||
manifest_path = tasks_dir / "manifest.yaml"
|
|
||||||
|
|
||||||
if suite == "all":
|
if suite == "all":
|
||||||
task_files = sorted(tasks_dir.glob("task_*.md"))
|
task_files = sorted(tasks_dir.glob("task_*.md"))
|
||||||
@@ -227,13 +308,11 @@ def run_benchmark(
|
|||||||
|
|
||||||
print(f"Loaded {len(all_tasks)} tasks")
|
print(f"Loaded {len(all_tasks)} tasks")
|
||||||
|
|
||||||
# Create run directory
|
|
||||||
results_dir.mkdir(parents=True, exist_ok=True)
|
results_dir.mkdir(parents=True, exist_ok=True)
|
||||||
run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
run_id = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
||||||
run_dir = results_dir / run_id
|
run_dir = results_dir / run_id
|
||||||
run_dir.mkdir()
|
run_dir.mkdir()
|
||||||
|
|
||||||
# Record metadata
|
|
||||||
cw_version = "unknown"
|
cw_version = "unknown"
|
||||||
try:
|
try:
|
||||||
vr = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
|
vr = subprocess.run(["codewhale", "--version"], capture_output=True, text=True)
|
||||||
@@ -252,7 +331,6 @@ def run_benchmark(
|
|||||||
}
|
}
|
||||||
(run_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))
|
(run_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))
|
||||||
|
|
||||||
# Run tasks
|
|
||||||
results: list[dict] = []
|
results: list[dict] = []
|
||||||
total_score = 0.0
|
total_score = 0.0
|
||||||
|
|
||||||
@@ -260,10 +338,10 @@ def run_benchmark(
|
|||||||
task_id = task["task_id"]
|
task_id = task["task_id"]
|
||||||
print(f"\n{'='*60}")
|
print(f"\n{'='*60}")
|
||||||
print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
|
print(f"Task {i}/{len(all_tasks)}: {task_id} — {task['name']}")
|
||||||
print(f" Category: {task['category']}")
|
print(f" Category: {task['category']} | Grading: {task['grading_type']}")
|
||||||
print(f"{'='*60}")
|
print(f"{'='*60}")
|
||||||
|
|
||||||
workspace = prepare_workspace(task, run_dir)
|
workspace = prepare_workspace(task, run_dir, tasks_dir)
|
||||||
timeout = int(task["timeout_seconds"] * timeout_multiplier)
|
timeout = int(task["timeout_seconds"] * timeout_multiplier)
|
||||||
|
|
||||||
# Run codewhale
|
# Run codewhale
|
||||||
@@ -274,17 +352,31 @@ def run_benchmark(
|
|||||||
if result["timed_out"]:
|
if result["timed_out"]:
|
||||||
print(f" ⏰ TIMED OUT")
|
print(f" ⏰ TIMED OUT")
|
||||||
|
|
||||||
# Build a minimal transcript for grading
|
# Build transcript for grading
|
||||||
transcript = [{"role": "user", "content": task["prompt"]}]
|
transcript = [{"role": "user", "content": task["prompt"]}]
|
||||||
if result["stdout"]:
|
if result["stdout"]:
|
||||||
transcript.append({"role": "assistant", "content": result["stdout"]})
|
transcript.append({"role": "assistant", "content": result["stdout"]})
|
||||||
|
|
||||||
# Grade
|
# Grade based on type
|
||||||
|
grading_type = task.get("grading_type", "automated")
|
||||||
|
has_automated = task.get("automated_checks") and "```python" in (task.get("automated_checks") or "")
|
||||||
|
has_llm_rubric = bool(task.get("llm_judge_rubric"))
|
||||||
|
|
||||||
grade_result = {"score": 0.0, "reason": "not graded"}
|
grade_result = {"score": 0.0, "reason": "not graded"}
|
||||||
if task["automated_checks"]:
|
|
||||||
|
if has_automated:
|
||||||
grade_result = grade_automated(task, workspace, transcript)
|
grade_result = grade_automated(task, workspace, transcript)
|
||||||
elif task.get("llm_judge_rubric"):
|
|
||||||
grade_result = {"score": 0.0, "reason": "llm judge not implemented yet"}
|
# If automated score is 0 and there's an LLM rubric, try LLM judge
|
||||||
|
if grade_result.get("score", 0.0) == 0.0 and has_llm_rubric:
|
||||||
|
print(f" Running LLM judge...")
|
||||||
|
llm_result = grade_llm_judge(task, workspace, transcript, model=model)
|
||||||
|
# Use LLM judge score if it's better, or if no automated checks
|
||||||
|
if not has_automated or llm_result.get("score", 0.0) > 0.0:
|
||||||
|
grade_result = llm_result
|
||||||
|
|
||||||
|
if not has_automated and not has_llm_rubric:
|
||||||
|
grade_result = {"score": 0.0, "reason": "no grading method defined"}
|
||||||
|
|
||||||
score = grade_result.get("score", 0.0)
|
score = grade_result.get("score", 0.0)
|
||||||
total_score += score
|
total_score += score
|
||||||
@@ -304,13 +396,11 @@ def run_benchmark(
|
|||||||
}
|
}
|
||||||
results.append(task_result)
|
results.append(task_result)
|
||||||
|
|
||||||
# Save individual result
|
|
||||||
(run_dir / f"{task_id}.json").write_text(json.dumps(task_result, indent=2))
|
(run_dir / f"{task_id}.json").write_text(json.dumps(task_result, indent=2))
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
avg_score = total_score / len(results) if results else 0.0
|
avg_score = total_score / len(results) if results else 0.0
|
||||||
|
|
||||||
# Group by category
|
|
||||||
categories: dict[str, list[dict]] = {}
|
categories: dict[str, list[dict]] = {}
|
||||||
for r in results:
|
for r in results:
|
||||||
cat = r["category"]
|
cat = r["category"]
|
||||||
@@ -334,7 +424,6 @@ def run_benchmark(
|
|||||||
|
|
||||||
(run_dir / "summary.json").write_text(json.dumps(summary, indent=2))
|
(run_dir / "summary.json").write_text(json.dumps(summary, indent=2))
|
||||||
|
|
||||||
# Print summary
|
|
||||||
print(f"\n{'='*60}")
|
print(f"\n{'='*60}")
|
||||||
print(f"PINCHBENCH SCORE SUMMARY (CodeWhale)")
|
print(f"PINCHBENCH SCORE SUMMARY (CodeWhale)")
|
||||||
print(f"{'='*60}")
|
print(f"{'='*60}")
|
||||||
|
|||||||
Reference in New Issue
Block a user