From e5974aa85014d0c88cbcf42a216a9febf94d5cb9 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Fri, 5 Jun 2026 22:17:02 -0700
Subject: [PATCH] feat(whaleflow): run dogfood workflow with mock executor
 (#2831)

---
 CHANGELOG.md                               |  5 +-
 crates/tui/CHANGELOG.md                    |  5 +-
 crates/whaleflow/src/starlark_authoring.rs | 64 +++++++++++++++++++++-
 workflows/rlm_cache_change.star            | 54 ++++++++++++++----
 4 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6cdf2d71..2da0c8f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   command and live-provider replay fallback remain deferred. The crate also now
   has a model-agnostic role/capability registry with mock provider plumbing and
   fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
-  roles without hardcoding provider-specific runtime paths (#2672).
+  roles without hardcoding provider-specific runtime paths (#2672). The
+  `rlm_cache_change.star` dogfood workflow now exercises candidate branches,
+  LoopUntil verification, tournament selection, teacher review, and mock
+  execution in CI-oriented crate tests (#2679).
   Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
   workflow, branch, leaf, control-node, and teacher-candidate runs. The
diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md
index 6cdf2d71..2da0c8f2 100644
--- a/crates/tui/CHANGELOG.md
+++ b/crates/tui/CHANGELOG.md
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   command and live-provider replay fallback remain deferred. The crate also now
   has a model-agnostic role/capability registry with mock provider plumbing and
   fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
-  roles without hardcoding provider-specific runtime paths (#2672).
+  roles without hardcoding provider-specific runtime paths (#2672). The
+  `rlm_cache_change.star` dogfood workflow now exercises candidate branches,
+  LoopUntil verification, tournament selection, teacher review, and mock
+  execution in CI-oriented crate tests (#2679).
   Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
   workflow, branch, leaf, control-node, and teacher-candidate runs. The
diff --git a/crates/whaleflow/src/starlark_authoring.rs b/crates/whaleflow/src/starlark_authoring.rs
index 7f395c78..2813ac41 100644
--- a/crates/whaleflow/src/starlark_authoring.rs
+++ b/crates/whaleflow/src/starlark_authoring.rs
@@ -413,7 +413,7 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{AgentType, ControlNodeKind};
+    use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
 
     #[test]
     fn starlark_compiles_to_ir() {
@@ -426,7 +426,7 @@ mod tests {
         let WorkflowNode::BranchSet(branch) = &workflow.nodes[0] else {
             panic!("first node should be a branch set");
         };
-        assert_eq!(branch.id, "discover");
+        assert_eq!(branch.id, "candidate-branches");
         assert!(branch.parallel);
         let WorkflowNode::Leaf(leaf) = &branch.children[0] else {
             panic!("first branch child should be a leaf");
@@ -434,6 +434,33 @@ mod tests {
         assert_eq!(leaf.agent_type, AgentType::ToolAgent);
     }
 
+    #[test] // Compiles workflows/rlm_cache_change.star and runs it on MockWorkflowExecutor.
+    fn rlm_cache_change_workflow_runs_with_mock_provider() {
+        let source = include_str!("../../../workflows/rlm_cache_change.star");
+        let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
+            .expect("example should compile");
+        let mut executor = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true]); // loop_until id
+
+        let execution = executor
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+
+        assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
+        assert!(
+            execution
+                .leaf_results
+                .iter()
+                .any(|result| result.leaf_id == "regression-tests")
+        ); // "regression-tests" is the test() leaf nested in the workflow's loop_until.
+        assert!(
+            execution
+                .control_node_results
+                .iter()
+                .any(|result| result.node_id == "teacher-review")
+        ); // "teacher-review" is the id of the workflow's teacher_review() node.
+    }
+
     #[test]
     fn starlark_repair_loop() {
         let source = r#"
@@ -455,6 +482,39 @@ workflow(
         assert!(matches!(workflow.nodes[0], WorkflowNode::BranchSet(_)));
     }
 
+    #[test] // Compiles an inline workflow via the repair entry point, then mock-executes it.
+    fn starlark_generated_workflow_repairs_then_runs() {
+        let source = r#"
+workflow(
+    id = "generated-repair-run",
+    goal = "repair generated workflow aliases",
+    nodes = [
+        ctx.parallel(id = "discover", children = [
+            agent(id = "scan", prompt = "scan repo"),
+        ]),
+        ctx.loop_until(
+            id = "verify",
+            condition = "checks pass",
+            max_iterations = 1,
+            children = [
+                test(id = "run-tests", command = "cargo test -p codewhale-whaleflow --locked"),
+            ],
+        ),
+    ],
+)
+"#; // NOTE(review): the `ctx.`-prefixed calls are presumably the aliases being repaired; confirm.
+        let workflow = compile_starlark_workflow_with_repair("generated.star", source)
+            .expect("repair should produce runnable IR");
+        let mut executor = MockWorkflowExecutor::new().with_predicate_results("verify", vec![true]);
+
+        let execution = executor
+            .run(&workflow)
+            .expect("repaired workflow should run with mock leaves");
+
+        assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
+        assert_eq!(execution.leaf_results.len(), 2); // presumably one each: "scan", "run-tests"
+    }
+
     #[test]
     fn invalid_workflow_rejected() {
         let source = r#"
diff --git a/workflows/rlm_cache_change.star b/workflows/rlm_cache_change.star
index c68a86c9..32a62982 100644
--- a/workflows/rlm_cache_change.star
+++ b/workflows/rlm_cache_change.star
@@ -3,7 +3,7 @@ workflow(
     goal = "Evaluate an RLM/cache routing change with safe mock WhaleFlow IR",
     nodes = [
         branch(
-            id = "discover",
+            id = "candidate-branches",
             parallel = True,
             children = [
                 search(
@@ -12,27 +12,59 @@ workflow(
                     file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
                 ),
                 agent(
-                    id = "inspect-provider-cache",
-                    prompt = "Inspect provider cache behavior without editing files.",
+                    id = "minimal-patch",
+                    prompt = "Draft the smallest safe cache-routing patch using shared ARMH context.",
+                    agent_type = "implementer",
+                    mode = "read_write",
+                    isolation = "worktree",
+                    file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
+                ),
+                agent(
+                    id = "architecture-review",
+                    prompt = "Review cache routing boundaries and identify replay or provider risks.",
                     agent_type = "explore",
-                    file_scope = ["crates/tui/src/providers/**"],
+                    file_scope = [
+                        "crates/tui/src/providers/**",
+                        "crates/tui/src/rlm/**",
+                    ],
                 ),
             ],
         ),
         sequence(
-            id = "verify-and-summarize",
+            id = "verify-select-and-summarize",
             children = [
-                test(
-                    id = "run-rlm-tests",
-                    command = "cargo test -p codewhale-tui rlm --locked",
-                    file_scope = ["crates/tui/src/rlm/**"],
+                loop_until(
+                    id = "implement-until-tests-pass",
+                    condition = "regression tests pass",
+                    max_iterations = 2,
+                    children = [
+                        test(
+                            id = "regression-tests",
+                            command = "cargo test -p codewhale-tui rlm --locked",
+                            file_scope = ["crates/tui/src/rlm/**"],
+                        ),
+                    ],
+                ),
+                tournament(
+                    id = "select-maintainer-slice",
+                    candidates = [
+                        "minimal-patch",
+                        "regression-tests",
+                        "architecture-review",
+                    ],
+                ),
+                teacher_review(
+                    id = "teacher-review",
+                    candidates = ["select-maintainer-slice"],
                 ),
                 reduce(
                     id = "summarize-cache-change",
                     inputs = [
                         "find-cache-surfaces",
-                        "inspect-provider-cache",
-                        "run-rlm-tests",
+                        "minimal-patch",
+                        "architecture-review",
+                        "regression-tests",
+                        "teacher-review",
                     ],
                     prompt = "Summarize the smallest safe cache-routing patch.",
                 ),