Files
Hunter B 5106ecfb2d test(fleet): CI-safe multi-worker dogfood smoke with injected failure (#3166)
Adds a verifiable dogfood smoke that drives several concurrent exec-style
workers (three healthy + one injected-failure that emits an error event and
exits non-zero) through the real host adapter, asserting distinct terminal
pass/fail outcomes — no external services, no model calls, no codewhale binary.
Documents the automated CI smoke vs the manual `codewhale fleet run` path in
the dogfood spec, and is honest that the manager run-loop cutover to drive real
FleetExecutor workers is still in progress.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 01:30:06 -07:00

64 lines
4.0 KiB
TOML

# Agent Fleet dogfood smoke spec (#3166)
#
# This spec exercises the fleet end-to-end: create a run with two local
# workers, run a workspace-check task and a protocol-review task, verify the
# ledger records receipts, and confirm the status surfaces work. Each worker is
# a headless `codewhale exec` run (see docs/AGENT_RUNTIME.md).
#
# Automated CI-safe smoke (no external services, no model calls):
#
#   cargo test -p codewhale-tui --bins fleet::executor
#
# It drives several concurrent exec-style workers (one with an injected
# failure) through the real host adapter and asserts terminal pass/fail
# outcomes.
#
# Manual run (intended to drive real `codewhale exec` workers, which need
# provider creds; see the NOTE below for the current state):
#
#   codewhale fleet run docs/examples/fleet-dogfood.toml --max-workers 2 --once
#
# Then check:
#
#   codewhale fleet status
#   codewhale fleet inspect <worker-id-from-status>
#   codewhale fleet logs <worker-id-from-status>
#
# NOTE: the cutover that wires the manager run loop to drive FleetExecutor for
# real workers is still in progress; until it lands, the manual run path uses
# the local simulation harness. The automated smoke above already proves the
# real exec-subprocess -> ledger-event path.
# Run-level metadata.
name = "dogfood smoke"
labels.milestone = "v0.8.60"
labels.class = "smoke"

# Run-wide security policy: local trust level, no secrets allowed, and
# identity verification switched off.
security_policy.default_trust_level = "local"
security_policy.allowed_secrets = []
security_policy.require_identity_verification = false

# Task 1: workspace compile check, scored by exit code.
[[tasks]]
id = "cargo-check"
name = "Workspace check"
description = "Run `cargo check --workspace` and report any compilation errors."
objective = "Verify the workspace compiles cleanly with zero errors."
# Folded with line-ending backslashes; parses to a single-line string.
instructions = """\
Run `cargo check --workspace` in the repo root. \
If it compiles cleanly, report success. \
If there are errors, list each file:line and the error message. \
Do NOT attempt to fix anything — just report what you found.\
"""
input_files = ["Cargo.toml"]
context = [
    "You are running in a fleet smoke test. Be concise. Only report the pass/fail and any specific errors.",
]
expected_artifacts = ["log", "report", "receipt"]
# Same value as budget.max_seconds below.
timeout_seconds = 300
tags = ["smoke", "check"]

[tasks.worker]
role = "release-checker"
tool_profile = "read-only"
tools = ["cargo"]
capabilities = ["rust"]

[tasks.workspace]
required_files = ["Cargo.toml"]
writable_paths = [".codewhale/fleet"]
environment.required = ["PATH"]

[tasks.budget]
max_tokens = 8_000
max_tool_calls = 12
max_seconds = 300

[tasks.scorer]
kind = "exit_code"

[tasks.retry_policy]
max_attempts = 2
initial_backoff_seconds = 5
max_backoff_seconds = 30

# Task 2: read-only review of the fleet protocol and secrets crates, scored by
# a verifier prompt.
[[tasks]]
id = "protocol-review"
name = "Protocol review"
description = "Review fleet protocol types for security and correctness."
objective = "Inspect crates/protocol/src/fleet.rs and crates/secrets/src/lib.rs. Report any missing serde defaults, unsafe wire changes, or security-sensitive fields lacking SecretRef."
# Folded with line-ending backslashes; parses to a single-line string.
instructions = """\
Read crates/protocol/src/fleet.rs and crates/secrets/src/lib.rs. \
Check for: \
(1) new fields without serde(default) or skip_serializing_if, \
(2) raw secrets in struct fields instead of FleetSecretRef, \
(3) missing Clone/Debug/PartialEq derives on new types. \
Write a concise report with file:line references for each finding. \
If there are no findings, report 'all clear'.\
"""
input_files = [
    "crates/protocol/src/fleet.rs",
    "crates/secrets/src/lib.rs",
]
context = [
    "You are a fleet protocol reviewer. Be thorough but concise. Reference specific lines.",
]
expected_artifacts = ["log", "report", "receipt"]
# Same value as budget.max_seconds below.
timeout_seconds = 600
tags = ["smoke", "review", "protocol"]

[tasks.worker]
role = "reviewer"
tool_profile = "read-only"
tools = ["rg", "git", "cargo"]
capabilities = ["rust"]

[tasks.workspace]
required_files = [
    "crates/protocol/src/fleet.rs",
    "crates/secrets/src/lib.rs",
]
writable_paths = [".codewhale/fleet"]
environment.required = ["PATH"]

[tasks.budget]
max_tokens = 10_000
max_tool_calls = 16
max_seconds = 600

[tasks.scorer]
kind = "code_whale_verifier_prompt"
prompt = "Verify the review includes at least one concrete file:line finding or explicitly says 'all clear'."

# Single attempt, so no max_backoff_seconds is given here.
[tasks.retry_policy]
max_attempts = 1
initial_backoff_seconds = 10