#!/bin/bash
# verify_task.sh
# Runs the DeepSWE verifier inside the task's Docker container.
#
# Provenance: added as scripts/verify_task.sh by commit b96fa37e
# ("chore: add DeepSWE task verification script").
#
# Usage: verify_task.sh <TASK_ID> <IMAGE>
#   TASK_ID  Name of the task directory under the tasks directory.
#   IMAGE    Docker image that is pulled and used to run the verifier.
#
# Expects model.patch at /tmp/deep-swe-verify/<TASK_ID>/model.patch
# Also reads tests/test.patch and tests/test.sh from the task directory.
#
# Everything the container prints is written to
# /tmp/deep-swe-verify/<TASK_ID>/result.txt; the lines of that file that
# contain REWARD, FAILED or passed are echoed to stdout as a summary.
#
# Environment:
#   DEEPSWE_TASKS_DIR  Overrides the directory that holds the task folders.
#
# Exit status:
#   2  bad usage
#   1  a required input file is missing or the work dir cannot be created
#   0  the container was started (the verdict is in the output, not in the
#      exit status)

set -u

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ $# -lt 2 || -z "${1:-}" || -z "${2:-}" ]]; then
  printf 'Usage: %s <TASK_ID> <IMAGE>\n' "${0##*/}" >&2
  exit 2
fi

TASK_ID="$1"
IMAGE="$2"
TASKS_DIR="${DEEPSWE_TASKS_DIR:-/Volumes/VIXinSSD/whalebro/codewhale/deep-swe/tasks}"
WORK_DIR="/tmp/deep-swe-verify/$TASK_ID"
RESULT_FILE="$WORK_DIR/result.txt"

MODEL_PATCH="$WORK_DIR/model.patch"
TEST_PATCH="$TASKS_DIR/$TASK_ID/tests/test.patch"
TEST_SCRIPT="$TASKS_DIR/$TASK_ID/tests/test.sh"

mkdir -p "$WORK_DIR" || die "[$TASK_ID] Cannot create $WORK_DIR"

# "docker run -v" creates a directory at a host path that does not exist,
# so a missing input must be caught here instead of being mounted.
for required in "$MODEL_PATCH" "$TEST_PATCH" "$TEST_SCRIPT"; do
  [[ -f "$required" ]] || die "[$TASK_ID] Missing required file: $required"
done

echo "[$TASK_ID] Pulling image..."
# Best effort: a failed pull is not fatal here because the image may already
# be present locally; if it is not, "docker run" below reports the error.
docker pull "$IMAGE" 2>&1 | tail -1

echo "[$TASK_ID] Running verifier..."
# The container script is single-quoted so that nothing in it is expanded by
# the host shell (keep apostrophes out of its comments).
docker run --rm \
  --platform linux/amd64 \
  -v "$MODEL_PATCH:/model.patch:ro" \
  -v "$TEST_PATCH:/tests/test.patch:ro" \
  -v "$TEST_SCRIPT:/verify.sh:ro" \
  "$IMAGE" \
  bash -c '
    set -e
    mkdir -p /logs/verifier /logs/artifacts
    cd /app
    # git diagnostics go to the result file to explain a PATCH_FAILED.
    git apply --whitespace=nowarn /model.patch || { echo "PATCH_FAILED"; exit 2; }
    # Capture the status through "||": with "set -e" active a plain failing
    # command would end this shell before any REWARD line is printed.
    EC=0
    bash /verify.sh > /logs/verifier/output.txt 2>&1 || EC=$?
    echo "VERIFIER_EXIT=$EC"
    if [ -f /logs/verifier/reward.txt ]; then
      REWARD=$(cat /logs/verifier/reward.txt)
      echo "REWARD=$REWARD"
    else
      # No reward file: derive the reward from the verifier output.
      if grep -q "New tests exit code: 0" /logs/verifier/output.txt && \
         grep -q "Baseline exit code: 0" /logs/verifier/output.txt; then
        echo "REWARD=1"
      else
        echo "REWARD=0"
      fi
    fi
    echo "---OUTPUT_TAIL---"
    tail -30 /logs/verifier/output.txt
  ' > "$RESULT_FILE" 2>&1

echo "[$TASK_ID] Done. Result:"
# "FAILED" also matches PATCH_FAILED, so one alternative covers both.
if ! grep -E 'REWARD|FAILED|passed' "$RESULT_FILE"; then
  printf '[%s] No verdict line found; see %s\n' "$TASK_ID" "$RESULT_FILE" >&2
fi
echo ""