Files
codewhale/scripts/benchmarks/harbor/__init__.py
T
Hunter B a5f27aae3a feat(benchmarks): default PinchBench to MiMo v2.5 Pro, add direct-mimo routing
PinchBench runner now defaults to openrouter/xiaomi/mimo-v2.5-pro instead
of deepseek/deepseek-chat. Adds --direct-mimo flag for routing through
Xiaomi's API directly (bypasses OpenRouter), with tp-/sk- key type
detection and endpoint mismatch warnings.

Harbor adapter gains --provider CLI flag for MiMo provider routing.

Known issues documented in docs/MIMO_BENCHMARK_ISSUES.md:
- PinchBench model validation requires OpenRouter prefix
- OPENROUTER_API_KEY needed even for some direct-provider paths
- Token Plan vs pay-as-you-go key/endpoint mismatch
- PinchBench runs through OpenClaw, not CodeWhale
2026-06-04 19:33:43 -07:00

182 lines
5.8 KiB
Python

"""
Harbor adapter for CodeWhale.

Lets Harbor evaluate CodeWhale as an agent on Terminal-Bench and other
Harbor-compatible datasets.

Usage (after pip install harbor):

    harbor run \\
        --dataset terminal-bench@2.0 \\
        --agent scripts.benchmarks.harbor.codewhale_agent:CodeWhaleAgent \\
        --model deepseek/deepseek-chat

Or register the agent name in Harbor's AgentName enum for shorter invocations.
"""
import json
import os
import shlex
from pathlib import Path, PurePosixPath
from typing import Any
from harbor.agents.installed.base import (
BaseInstalledAgent,
CliFlag,
with_prompt_template,
)
from harbor.environments.base import BaseEnvironment
from harbor.models.agent.context import AgentContext
class CodeWhaleAgent(BaseInstalledAgent):
    """
    CodeWhale agent adapter for Harbor.

    Installs the ``codewhale`` CLI via npm into the task container and runs
    tasks in non-interactive exec mode with full tool access.
    """

    # File name (under /logs/agent/) that receives the combined
    # stdout/stderr of the CodeWhale run.
    _OUTPUT_FILENAME = "codewhale.txt"

    # Host environment variables that are copied into the agent process when
    # they are set to a non-empty value. Order is preserved in the env dict.
    _FORWARDED_ENV_VARS = (
        "DEEPSEEK_API_KEY",  # DeepSeek
        "OPENROUTER_API_KEY",  # OpenRouter (fallback)
        "OPENAI_API_KEY",  # Generic OpenAI-compatible
    )

    # Agent options that are rendered as CodeWhale CLI flags.
    CLI_FLAGS = [
        CliFlag(
            "max_subagents",
            cli="--max-subagents",
            type="int",
            default=4,
        ),
        CliFlag(
            "thinking",
            cli="--thinking",
            type="str",
            default="high",
        ),
        CliFlag(
            "provider",
            cli="--provider",
            type="str",
            default=None,
        ),
    ]

    @staticmethod
    def name() -> str:
        """Return the agent identifier, ``"codewhale"``."""
        return "codewhale"

    def version(self) -> str | None:
        """Return the ``_version`` attribute if it has been set, else ``None``."""
        return getattr(self, "_version", None)

    def get_version_command(self) -> str | None:
        """Return the shell command used to query the installed CLI version.

        Tries ``codewhale`` first and falls back to ``codewhale-tui``; stderr
        of both attempts is discarded.
        """
        return "codewhale --version 2>/dev/null || codewhale-tui --version 2>/dev/null"

    def parse_version(self, stdout: str) -> str:
        """Extract the version string from the version command's output.

        The first non-blank line is used. A leading ``codewhale-tui``,
        ``codewhale-cli`` or ``codewhale`` program name (matched
        case-insensitively) is removed. If the output has no non-blank line,
        the stripped output itself (an empty string) is returned.
        """
        text = stdout.strip()
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            lowered = line.lower()
            # Longer prefixes come first so "codewhale " cannot shadow them.
            for prefix in ("codewhale-tui ", "codewhale-cli ", "codewhale "):
                if lowered.startswith(prefix):
                    # Strip again in case several spaces follow the name.
                    return line[len(prefix):].strip()
            return line
        return text

    async def install(self, environment: BaseEnvironment) -> None:
        """Install CodeWhale via npm in the container.

        Runs three steps: system packages (as root), Node.js if it is still
        missing (as root), then ``npm install -g codewhale`` (as the agent).

        The shell snippets use ``>/dev/null 2>&1`` rather than the bash-only
        ``&>/dev/null``: under a POSIX ``sh`` such as dash, ``cmd &>/dev/null``
        backgrounds ``cmd`` and the test always succeeds.
        """
        noninteractive = {"DEBIAN_FRONTEND": "noninteractive"}

        # Install system dependencies
        await self.exec_as_root(
            environment,
            command=(
                "if ldd --version 2>&1 | grep -qi musl || [ -f /etc/alpine-release ]; then"
                " apk add --no-cache curl bash nodejs npm git ripgrep;"
                " elif command -v apt-get >/dev/null 2>&1; then"
                " apt-get update && apt-get install -y curl git ripgrep;"
                " elif command -v yum >/dev/null 2>&1; then"
                " yum install -y curl git ripgrep;"
                " fi"
            ),
            env=noninteractive,
        )

        # Install Node.js if not present (some images lack it). The NodeSource
        # setup script differs between deb- and rpm-based images, so pick the
        # one matching the available package manager.
        await self.exec_as_root(
            environment,
            command=(
                "if ! command -v node >/dev/null 2>&1; then"
                " if command -v apt-get >/dev/null 2>&1; then"
                " curl -fsSL https://deb.nodesource.com/setup_20.x | bash - &&"
                " apt-get install -y nodejs;"
                " elif command -v yum >/dev/null 2>&1; then"
                " curl -fsSL https://rpm.nodesource.com/setup_20.x | bash - &&"
                " yum install -y nodejs;"
                " else"
                " echo 'No supported package manager found to install Node.js' >&2;"
                " exit 1;"
                " fi;"
                " fi"
            ),
            env=noninteractive,
        )

        # Install CodeWhale CLI via npm
        await self.exec_as_agent(
            environment,
            command="npm install -g codewhale",
        )

    @with_prompt_template
    async def run(
        self,
        instruction: str,
        environment: BaseEnvironment,
        context: AgentContext,
    ) -> None:
        """Run CodeWhale in non-interactive exec mode on the task.

        Args:
            instruction: Task prompt; shell-quoted and passed as the final
                positional argument to ``codewhale exec``.
            environment: Environment in which the command is executed.
            context: Harbor agent context; not used by this method.
        """
        escaped_instruction = shlex.quote(instruction)

        # Build CLI flags from agent config
        cli_flags = self.build_cli_flags()
        extra_flags = f"{cli_flags} " if cli_flags else ""

        # Forward only the API keys that are actually set on the host.
        env: dict[str, str] = {}
        for var_name in self._FORWARDED_ENV_VARS:
            value = os.environ.get(var_name, "")
            if value:
                env[var_name] = value

        # Build model flag if model_name is provided
        model_flag = ""
        if self.model_name:
            # Harbor passes model as "provider/model"; CodeWhale uses --model
            model_flag = f"--model {shlex.quote(self.model_name)} "

        output_path = f"/logs/agent/{self._OUTPUT_FILENAME}"

        # Run CodeWhale in non-interactive YOLO exec mode
        # --yolo enables full tool access (auto-approved)
        # --auto runs non-interactively and exits when done
        # --stream-json gives us structured output for trajectory parsing
        #
        # NOTE(review): because the output is piped through tee, the
        # pipeline's exit status is tee's rather than codewhale's unless the
        # executing shell has pipefail enabled.
        await self.exec_as_agent(
            environment,
            command=(
                f"codewhale exec --yolo --auto --stream-json "
                f"{model_flag}{extra_flags}"
                f"--workspace /workspace "
                f"{escaped_instruction} "
                f"2>&1 | tee {shlex.quote(output_path)}"
            ),
            env=env or None,
        )

    def populate_context_post_run(self, context: AgentContext) -> None:
        """Parse CodeWhale's output for any post-run metadata.

        Currently a no-op: ``context`` is left untouched.
        """
        # CodeWhale writes its results to the working tree as git diffs.
        # Harbor's eval harness inspects the workspace directly, so no
        # special trajectory parsing is needed for basic eval.
        pass