From e5974aa85014d0c88cbcf42a216a9febf94d5cb9 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Fri, 5 Jun 2026 22:17:02 -0700 Subject: [PATCH] feat(whaleflow): run dogfood workflow with mock executor (#2831) --- CHANGELOG.md | 5 +- crates/tui/CHANGELOG.md | 5 +- crates/whaleflow/src/starlark_authoring.rs | 64 +++++++++++++++++++++- workflows/rlm_cache_change.star | 54 ++++++++++++++---- 4 files changed, 113 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cdf2d71..2da0c8f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 command and live-provider replay fallback remain deferred. The crate also now has a model-agnostic role/capability registry with mock provider plumbing and fail-closed JSON repair parsing, so WhaleFlow can choose capable models for - roles without hardcoding provider-specific runtime paths (#2672). + roles without hardcoding provider-specific runtime paths (#2672). The + `rlm_cache_change.star` dogfood workflow now exercises candidate branches, + LoopUntil verification, tournament selection, teacher review, and mock + execution in CI-oriented crate tests (#2679). Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction. - Added a state-store v2 schema migration for WhaleFlow trace tables covering workflow, branch, leaf, control-node, and teacher-candidate runs. The diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index 6cdf2d71..2da0c8f2 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 command and live-provider replay fallback remain deferred. The crate also now has a model-agnostic role/capability registry with mock provider plumbing and fail-closed JSON repair parsing, so WhaleFlow can choose capable models for - roles without hardcoding provider-specific runtime paths (#2672). + roles without hardcoding provider-specific runtime paths (#2672). The + `rlm_cache_change.star` dogfood workflow now exercises candidate branches, + LoopUntil verification, tournament selection, teacher review, and mock + execution in CI-oriented crate tests (#2679). Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction. - Added a state-store v2 schema migration for WhaleFlow trace tables covering workflow, branch, leaf, control-node, and teacher-candidate runs. The diff --git a/crates/whaleflow/src/starlark_authoring.rs b/crates/whaleflow/src/starlark_authoring.rs index 7f395c78..2813ac41 100644 --- a/crates/whaleflow/src/starlark_authoring.rs +++ b/crates/whaleflow/src/starlark_authoring.rs @@ -413,7 +413,7 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) { #[cfg(test)] mod tests { use super::*; - use crate::{AgentType, ControlNodeKind}; + use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus}; #[test] fn starlark_compiles_to_ir() { @@ -426,7 +426,7 @@ mod tests { let WorkflowNode::BranchSet(branch) = &workflow.nodes[0] else { panic!("first node should be a branch set"); }; - assert_eq!(branch.id, "discover"); + assert_eq!(branch.id, "candidate-branches"); assert!(branch.parallel); let WorkflowNode::Leaf(leaf) = &branch.children[0] else { panic!("first branch child should be a leaf"); @@ -434,6 +434,33 @@ mod tests { assert_eq!(leaf.agent_type, AgentType::ToolAgent); } + #[test] + fn rlm_cache_change_workflow_runs_with_mock_provider() { + let source = include_str!("../../../workflows/rlm_cache_change.star"); + let workflow = compile_starlark_workflow("rlm_cache_change.star", source) + .expect("example should compile"); + let mut executor = MockWorkflowExecutor::new() + .with_predicate_results("implement-until-tests-pass", vec![true]); + + let execution = executor + .run(&workflow) + .expect("dogfood workflow should run with mock leaves"); + + assert_eq!(execution.status, WorkflowRunStatus::Succeeded); + assert!( + execution + .leaf_results + .iter() + .any(|result| result.leaf_id == "regression-tests") + ); + assert!( + execution + .control_node_results + .iter() + .any(|result| result.node_id == "teacher-review") + ); + } + #[test] fn starlark_repair_loop() { let source = r#" @@ -455,6 +482,39 @@ workflow( assert!(matches!(workflow.nodes[0], WorkflowNode::BranchSet(_))); } + #[test] + fn starlark_generated_workflow_repairs_then_runs() { + let source = r#" +workflow( + id = "generated-repair-run", + goal = "repair generated workflow aliases", + nodes = [ + ctx.parallel(id = "discover", children = [ + agent(id = "scan", prompt = "scan repo"), + ]), + ctx.loop_until( + id = "verify", + condition = "checks pass", + max_iterations = 1, + children = [ + test(id = "run-tests", command = "cargo test -p codewhale-whaleflow --locked"), + ], + ), + ], +) +"#; + let workflow = compile_starlark_workflow_with_repair("generated.star", source) + .expect("repair should produce runnable IR"); + let mut executor = MockWorkflowExecutor::new().with_predicate_results("verify", vec![true]); + + let execution = executor + .run(&workflow) + .expect("repaired workflow should run with mock leaves"); + + assert_eq!(execution.status, WorkflowRunStatus::Succeeded); + assert_eq!(execution.leaf_results.len(), 2); + } + #[test] fn invalid_workflow_rejected() { let source = r#" diff --git a/workflows/rlm_cache_change.star b/workflows/rlm_cache_change.star index c68a86c9..32a62982 100644 --- a/workflows/rlm_cache_change.star +++ b/workflows/rlm_cache_change.star @@ -3,7 +3,7 @@ workflow( goal = "Evaluate an RLM/cache routing change with safe mock WhaleFlow IR", nodes = [ branch( - id = "discover", + id = "candidate-branches", parallel = True, children = [ search( @@ -12,27 +12,59 @@ workflow( file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"], ), agent( - id = "inspect-provider-cache", - prompt = "Inspect provider cache behavior without editing files.", + id = "minimal-patch", + prompt = "Draft the smallest safe cache-routing patch using shared ARMH context.", + agent_type = "implementer", + mode = "read_write", + isolation = "worktree", + file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"], + ), + agent( + id = "architecture-review", + prompt = "Review cache routing boundaries and identify replay or provider risks.", agent_type = "explore", - file_scope = ["crates/tui/src/providers/**"], + file_scope = [ + "crates/tui/src/providers/**", + "crates/tui/src/rlm/**", + ], ), ], ), sequence( - id = "verify-and-summarize", + id = "verify-select-and-summarize", children = [ - test( - id = "run-rlm-tests", - command = "cargo test -p codewhale-tui rlm --locked", - file_scope = ["crates/tui/src/rlm/**"], + loop_until( + id = "implement-until-tests-pass", + condition = "regression tests pass", + max_iterations = 2, + children = [ + test( + id = "regression-tests", + command = "cargo test -p codewhale-tui rlm --locked", + file_scope = ["crates/tui/src/rlm/**"], + ), + ], + ), + tournament( + id = "select-maintainer-slice", + candidates = [ + "minimal-patch", + "regression-tests", + "architecture-review", + ], + ), + teacher_review( + id = "teacher-review", + candidates = ["select-maintainer-slice"], ), reduce( id = "summarize-cache-change", inputs = [ "find-cache-surfaces", - "inspect-provider-cache", - "run-rlm-tests", + "minimal-patch", + "architecture-review", + "regression-tests", + "teacher-review", ], prompt = "Summarize the smallest safe cache-routing patch.", ),