From 5106ecfb2d303808da76bc648683240c44304559 Mon Sep 17 00:00:00 2001
From: Hunter B
Date: Sat, 13 Jun 2026 01:30:06 -0700
Subject: [PATCH] test(fleet): CI-safe multi-worker dogfood smoke with injected failure (#3166)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a verifiable dogfood smoke that drives several concurrent
exec-style workers (three healthy + one injected-failure that emits an
error event and exits non-zero) through the real host adapter, asserting
distinct terminal pass/fail outcomes — no external services, no model
calls, no codewhale binary.

Documents the automated CI smoke vs the manual `codewhale fleet run`
path in the dogfood spec, and is honest that the manager run-loop
cutover to drive real FleetExecutor workers is still in progress.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 crates/tui/src/fleet/executor.rs | 70 ++++++++++++++++++++++++++++++++
 docs/examples/fleet-dogfood.toml | 17 +++++++--
 2 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/crates/tui/src/fleet/executor.rs b/crates/tui/src/fleet/executor.rs
index d544a35e..10ef1831 100644
--- a/crates/tui/src/fleet/executor.rs
+++ b/crates/tui/src/fleet/executor.rs
@@ -421,4 +421,74 @@ mod tests {
         );
         assert!(exec.all_terminal());
     }
+
+    /// Dogfood smoke (#3166): several concurrent exec-style workers with one
+    /// injected failure. Proves the executor drives a small fleet to terminal
+    /// outcomes and that a failing worker is classified distinctly from the
+    /// passing ones — all without the codewhale binary.
+    ///
+    /// Gated to Unix; every worker command here is `sh -c <script>`.
+    #[cfg(unix)]
+    #[test]
+    fn executor_drives_concurrent_workers_with_injected_failure() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let mut exec = FleetExecutor::new(tmp.path());
+
+        // Three healthy workers emit a tool_use + done; one injected-failure
+        // worker emits an error event and exits non-zero.
+        let ok = r#"printf '{"type":"tool_use","name":"grep_files","id":"c","input":{}}\n{"type":"done"}\n'"#;
+        let bad = r#"printf '{"type":"error","error":"injected failure"}\n'; exit 7"#;
+        for id in ["w1", "w2", "w3"] {
+            exec.start_worker(
+                id,
+                FleetWorkerCommand::new("sh", vec!["-c".to_string(), ok.to_string()]),
+                None,
+            )
+            .unwrap();
+        }
+        exec.start_worker(
+            "w-fail",
+            FleetWorkerCommand::new("sh", vec!["-c".to_string(), bad.to_string()]),
+            None,
+        )
+        .unwrap();
+
+        let ids = ["w1", "w2", "w3", "w-fail"];
+        let mut terminals: std::collections::BTreeMap<&str, FleetWorkerEventPayload> =
+            std::collections::BTreeMap::new();
+        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(8);
+        loop {
+            for id in ids {
+                // Drained events are discarded; only terminal outcomes are asserted.
+                let _ = exec.drain_events(id);
+                if let Some(term) = exec.poll_terminal(id) {
+                    terminals.insert(id, term);
+                }
+            }
+            // Check completion before the deadline so a fleet that finishes
+            // during the final sweep is not misreported as a timeout.
+            if terminals.len() == ids.len() {
+                break;
+            }
+            assert!(
+                std::time::Instant::now() < deadline,
+                "not all workers terminated: {terminals:?}"
+            );
+            std::thread::sleep(std::time::Duration::from_millis(20));
+        }
+
+        assert!(exec.all_terminal());
+        for id in ["w1", "w2", "w3"] {
+            assert!(
+                matches!(terminals[id], FleetWorkerEventPayload::Completed { .. }),
+                "{id} should pass, got {:?}",
+                terminals[id]
+            );
+        }
+        assert!(
+            matches!(terminals["w-fail"], FleetWorkerEventPayload::Failed { .. }),
+            "injected-failure worker should fail, got {:?}",
+            terminals["w-fail"]
+        );
+    }
 }
diff --git a/docs/examples/fleet-dogfood.toml b/docs/examples/fleet-dogfood.toml
index 14e9e047..fb361f61 100644
--- a/docs/examples/fleet-dogfood.toml
+++ b/docs/examples/fleet-dogfood.toml
@@ -1,16 +1,27 @@
 # Agent Fleet dogfood smoke spec (#3166)
 #
 # This spec exercises the fleet end-to-end: create a run with two local
-# workers, run a lint task and a review task, verify the ledger records
-# receipts, and confirm the status surfaces work.
+# workers, run a workspace-check task and a protocol-review task, verify the +# ledger records receipts, and confirm the status surfaces work. Each worker is +# a headless `codewhale exec` run (see docs/AGENT_RUNTIME.md). # +# Automated CI-safe smoke (Unix only; no external services, no model calls): +# cargo test -p codewhale-tui --bins fleet::executor +# It drives several concurrent exec-style workers (with one injected failure) +# through the real host adapter and asserts terminal pass/fail outcomes. +# +# Manual run (see NOTE below; real `codewhale exec` workers need provider creds): # codewhale fleet run docs/examples/fleet-dogfood.toml --max-workers 2 --once # # Then check: # codewhale fleet status # codewhale fleet inspect # codewhale fleet logs +# +# NOTE: the cutover that wires the manager run loop to drive FleetExecutor for +# real workers is still in progress; until it lands, the manual run path uses +# the local simulation harness instead. The automated smoke above already +# proves the real exec-subprocess -> ledger-event path. name = "dogfood smoke" labels = { milestone = "v0.8.60", class = "smoke" }