From b7798ba0f65f845d0450b909374957138fb80e83 Mon Sep 17 00:00:00 2001 From: Hunter B Date: Thu, 4 Jun 2026 19:38:46 -0700 Subject: [PATCH] feat(benchmarks): default PinchBench to direct MiMo routing, auto-read config PinchBench runner now defaults to direct Xiaomi API (no OpenRouter). Reads API key from ~/.codewhale/config.toml [providers.xiaomi_mimo] when XIAOMI_MIMO_API_KEY env var is not set. --openrouter flag for the old OpenRouter path. --- scripts/benchmarks/run-pinchbench.sh | 153 ++++++++++++++------------- 1 file changed, 81 insertions(+), 72 deletions(-) diff --git a/scripts/benchmarks/run-pinchbench.sh b/scripts/benchmarks/run-pinchbench.sh index 788f4028..2caddb9f 100755 --- a/scripts/benchmarks/run-pinchbench.sh +++ b/scripts/benchmarks/run-pinchbench.sh @@ -1,43 +1,29 @@ #!/usr/bin/env bash -# run-pinchbench.sh — Run PinchBench benchmarks with CodeWhale model routing. +# run-pinchbench.sh — Run PinchBench benchmarks with Xiaomi MiMo v2.5. # -# PinchBench evaluates agent performance on real-world tasks (calendar, email, -# coding, research, file management). It uses OpenClaw as the agent runtime and -# routes models through OpenRouter by default. -# -# Known issues with Xiaomi MiMo v2.5: -# 1. PinchBench validates models against OpenRouter's /models endpoint. -# MiMo models MUST use the openrouter/ prefix or validation is skipped. -# 2. PinchBench requires OPENROUTER_API_KEY even when using a direct provider. -# The --direct-mimo flag sets up a custom OpenAI-compatible endpoint in -# OpenClaw's models.json to bypass this. -# 3. MiMo v2.5 Pro has a 128K context window but PinchBench tasks are small. -# No special handling needed, but worth noting for cost estimates. -# 4. The Xiaomi Token Plan endpoint (token-plan-sgp.xiaomimimo.com) uses -# tp- prefixed keys. Pay-as-you-go (api.xiaomimimo.com) uses sk- keys. -# Make sure XIAOMI_MIMO_API_KEY matches the endpoint you're using. -# 5. 
OpenRouter model ID for MiMo: xiaomi/mimo-v2.5-pro (Pro) or -# xiaomi/mimo-v2.5 (Omni). PinchBench expects the full provider/model. +# Defaults to direct Xiaomi API routing (no OpenRouter needed). Reads the +# API key from ~/.codewhale/config.toml if not set via environment variables. # # Usage: # ./scripts/benchmarks/run-pinchbench.sh --help -# ./scripts/benchmarks/run-pinchbench.sh --model xiaomi/mimo-v2.5-pro -# ./scripts/benchmarks/run-pinchbench.sh --direct-mimo --suite task_calendar +# ./scripts/benchmarks/run-pinchbench.sh # direct MiMo (default) +# ./scripts/benchmarks/run-pinchbench.sh --openrouter # via OpenRouter +# ./scripts/benchmarks/run-pinchbench.sh --suite task_calendar # # Prerequisites: # - PinchBench cloned (or use --install) # - Python 3.10+ with uv -# - OPENROUTER_API_KEY (for OpenRouter routing) -# - OR XIAOMI_MIMO_API_KEY + --direct-mimo (for direct Xiaomi API) +# - Xiaomi MiMo API key (in env or ~/.codewhale/config.toml) # - A running OpenClaw instance set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +CODEWHALE_CONFIG="${HOME}/.codewhale/config.toml" -# Defaults — MiMo v2.5 Pro via OpenRouter -MODEL="openrouter/xiaomi/mimo-v2.5-pro" +# Defaults — direct MiMo v2.5 Pro (no OpenRouter) +MODEL="mimo-v2.5-pro" SUITE="all" PINCHBENCH_DIR="${PINCHBENCH_DIR:-/tmp/pinchbench}" RESULTS_DIR="./results/pinchbench" @@ -45,28 +31,28 @@ INSTALL_PINCHBENCH=false RUNS=1 JUDGE_MODEL="" NO_UPLOAD=true -DIRECT_MIMO=false +DIRECT_MIMO=true MIMO_BASE_URL="" +OPENROUTER_MODE=false EXTRA_ARGS=() usage() { cat </dev/null || true) + url=$(awk '/\[providers\.xiaomi_mimo\]/{f=1} f && /^base_url/{gsub(/.*= *"/,""); gsub(/".*/,""); print; exit}' "$config" 2>/dev/null || true) + fi + echo "$key|$url" +} - # Resolve API key from multiple env var names +# ── OpenRouter mode ───────────────────────────────────────────────────────── +if [[ "$OPENROUTER_MODE" == true ]]; then + MODEL="openrouter/xiaomi/mimo-v2.5-pro" + if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then + echo "Error: --openrouter requires OPENROUTER_API_KEY" >&2 + exit 1 + fi + echo "OpenRouter mode:" + echo " Model: $MODEL" + echo "" + +# ── Direct MiMo mode (default) ───────────────────────────────────────────── +elif [[ "$DIRECT_MIMO" == true ]]; then + # Resolve API key: env var > codewhale config.toml MIMO_KEY="${XIAOMI_MIMO_API_KEY:-${XIAOMI_API_KEY:-${MIMO_API_KEY:-}}}" + if [[ -z "$MIMO_KEY" ]]; then - echo "Error: --direct-mimo requires XIAOMI_MIMO_API_KEY (or XIAOMI_API_KEY / MIMO_API_KEY)" >&2 - echo " Token Plan keys (tp-...): https://token-plan-sgp.xiaomimimo.com/v1" >&2 - echo " Pay-as-you-go keys (sk-...): https://api.xiaomimimo.com/v1" >&2 + # Try reading from codewhale config + IFS='|' read -r cfg_key cfg_url <<< "$(read_codewhale_mimo_config "$CODEWHALE_CONFIG")" + if [[ -n "$cfg_key" ]]; then + MIMO_KEY="$cfg_key" + echo "Read MiMo API key from $CODEWHALE_CONFIG" + # Use config base_url if not overridden + if [[ -z "$MIMO_BASE_URL" && -n "$cfg_url" ]]; then + MIMO_BASE_URL="$cfg_url" + fi + fi + fi + + if [[ 
-z "$MIMO_KEY" ]]; then + echo "Error: No MiMo API key found." >&2 + echo " Set XIAOMI_MIMO_API_KEY env var, or configure [providers.xiaomi_mimo] in" >&2 + echo " ~/.codewhale/config.toml" >&2 exit 1 fi - # Determine base URL: flag > env > default (Token Plan Singapore) + # Determine base URL: flag > config (only when the key was read from config.toml) > env > default (Token Plan Singapore) if [[ -z "$MIMO_BASE_URL" ]]; then MIMO_BASE_URL="${XIAOMI_MIMO_BASE_URL:-https://token-plan-sgp.xiaomimimo.com/v1}" fi @@ -156,15 +174,6 @@ if [[ "$DIRECT_MIMO" == true ]]; then export OPENAI_BASE_URL="$MIMO_BASE_URL" fi -# ── Prereq checks ─────────────────────────────────────────────────────────── -if [[ "$DIRECT_MIMO" != true ]]; then - # OpenRouter mode — need the key - if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then - echo "Warning: OPENROUTER_API_KEY not set. PinchBench may fail model validation." >&2 - echo " Either set OPENROUTER_API_KEY or use --direct-mimo with XIAOMI_MIMO_API_KEY." >&2 - fi -fi - # ── Install PinchBench ────────────────────────────────────────────────────── if [[ "$INSTALL_PINCHBENCH" == true || ! -d "$PINCHBENCH_DIR" ]]; then echo "Installing PinchBench to $PINCHBENCH_DIR ..."
@@ -201,7 +210,7 @@ cat > "$METADATA_FILE" </dev/null || echo unknown)", "pinchbench_commit": "$(git -C "$PINCHBENCH_DIR" rev-parse HEAD 2>/dev/null || echo unknown)", "model": "$MODEL", - "routing": "$(if [[ "$DIRECT_MIMO" == true ]]; then echo "direct-xiaomi"; else echo "openrouter"; fi)", + "routing": "$(if [[ "$OPENROUTER_MODE" == true ]]; then echo "openrouter"; else echo "direct-xiaomi"; fi)", "suite": "$SUITE", "runs": $RUNS, "timestamp_utc": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", @@ -221,7 +230,7 @@ if [[ "$NO_UPLOAD" == true ]]; then PB_ARGS+=("--no-upload") fi -# Pass direct-mimo endpoint info via env for lib_agent.py's custom provider setup +# Pass direct-mimo endpoint info for lib_agent.py's custom provider setup if [[ "$DIRECT_MIMO" == true ]]; then PB_ARGS+=("--base-url" "$MIMO_BASE_URL") fi @@ -233,10 +242,10 @@ echo " Model: $MODEL" echo " Suite: $SUITE" echo " Runs: $RUNS" echo " Output: $RESULTS_DIR" -if [[ "$DIRECT_MIMO" == true ]]; then - echo " Routing: Direct Xiaomi API ($MIMO_BASE_URL)" -else +if [[ "$OPENROUTER_MODE" == true ]]; then echo " Routing: OpenRouter" +else + echo " Routing: Direct Xiaomi API ($MIMO_BASE_URL)" fi echo ""