feat(whaleflow): run dogfood workflow with mock executor (#2831)

2026-06-05 22:17:02 -07:00
parent 5044a29db8
commit e5974aa850
4 changed files with 113 additions and 15 deletions
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  command and live-provider replay fallback remain deferred. The crate also now
  has a model-agnostic role/capability registry with mock provider plumbing and
  fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
-  roles without hardcoding provider-specific runtime paths (#2672).
+  roles without hardcoding provider-specific runtime paths (#2672). The
+  `rlm_cache_change.star` dogfood workflow now exercises candidate branches,
+  LoopUntil verification, tournament selection, teacher review, and mock
+  execution in CI-oriented crate tests (#2679).
  Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
  workflow, branch, leaf, control-node, and teacher-candidate runs. The
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  command and live-provider replay fallback remain deferred. The crate also now
  has a model-agnostic role/capability registry with mock provider plumbing and
  fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
-  roles without hardcoding provider-specific runtime paths (#2672).
+  roles without hardcoding provider-specific runtime paths (#2672). The
+  `rlm_cache_change.star` dogfood workflow now exercises candidate branches,
+  LoopUntil verification, tournament selection, teacher review, and mock
+  execution in CI-oriented crate tests (#2679).
  Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
  workflow, branch, leaf, control-node, and teacher-candidate runs. The
@@ -413,7 +413,7 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::{AgentType, ControlNodeKind};
+    use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};

    #[test]
    fn starlark_compiles_to_ir() {
@@ -426,7 +426,7 @@ mod tests {
        let WorkflowNode::BranchSet(branch) = &workflow.nodes[0] else {
            panic!("first node should be a branch set");
        };
-        assert_eq!(branch.id, "discover");
+        assert_eq!(branch.id, "candidate-branches");
        assert!(branch.parallel);
        let WorkflowNode::Leaf(leaf) = &branch.children[0] else {
            panic!("first branch child should be a leaf");
@@ -434,6 +434,33 @@ mod tests {
        assert_eq!(leaf.agent_type, AgentType::ToolAgent);
    }

+    #[test]
+    fn rlm_cache_change_workflow_runs_with_mock_provider() { // compiles the dogfood example, then runs it on the mock executor
+        let source = include_str!("../../../workflows/rlm_cache_change.star"); // embedded at compile time
+        let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
+            .expect("example should compile");
+        let mut executor = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true]);
+        // NOTE(review): the predicate id is presumably the workflow's loop_until node id — confirm against the .star file.
+        let execution = executor
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+        // Expect success, a leaf result with id `regression-tests`, and a control-node result with id `teacher-review`.
+        assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
+        assert!(
+            execution
+                .leaf_results
+                .iter()
+                .any(|result| result.leaf_id == "regression-tests")
+        );
+        assert!(
+            execution
+                .control_node_results
+                .iter()
+                .any(|result| result.node_id == "teacher-review")
+        );
+    }
+
    #[test]
    fn starlark_repair_loop() {
        let source = r#"
@@ -455,6 +482,39 @@ workflow(
        assert!(matches!(workflow.nodes[0], WorkflowNode::BranchSet(_)));
    }

+    #[test]
+    fn starlark_generated_workflow_repairs_then_runs() { // compiles via the repair entry point, then runs the result on the mock executor
+        let source = r#"
+workflow(
+    id = "generated-repair-run",
+    goal = "repair generated workflow aliases",
+    nodes = [
+        ctx.parallel(id = "discover", children = [
+            agent(id = "scan", prompt = "scan repo"),
+        ]),
+        ctx.loop_until(
+            id = "verify",
+            condition = "checks pass",
+            max_iterations = 1,
+            children = [
+                test(id = "run-tests", command = "cargo test -p codewhale-whaleflow --locked"),
+            ],
+        ),
+    ],
+)
+"#;
+        let workflow = compile_starlark_workflow_with_repair("generated.star", source)
+            .expect("repair should produce runnable IR"); // NOTE(review): presumably the ctx.* calls are what repair rewrites — confirm
+        let mut executor = MockWorkflowExecutor::new().with_predicate_results("verify", vec![true]);
+        // "verify" matches the loop_until id in the source above; a single `true` predicate result is supplied for it.
+        let execution = executor
+            .run(&workflow)
+            .expect("repaired workflow should run with mock leaves");
+        // The source declares two leaves (`scan` and `run-tests`), and exactly two leaf results are expected.
+        assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
+        assert_eq!(execution.leaf_results.len(), 2);
+    }
+
    #[test]
    fn invalid_workflow_rejected() {
        let source = r#"
@@ -3,7 +3,7 @@ workflow(
    goal = "Evaluate an RLM/cache routing change with safe mock WhaleFlow IR",
    nodes = [
        branch(
-            id = "discover",
+            id = "candidate-branches",
            parallel = True,
            children = [
                search(
@@ -12,27 +12,59 @@ workflow(
                    file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
                ),
                agent(
-                    id = "inspect-provider-cache",
-                    prompt = "Inspect provider cache behavior without editing files.",
+                    id = "minimal-patch",
+                    prompt = "Draft the smallest safe cache-routing patch using shared ARMH context.",
+                    agent_type = "implementer",
+                    mode = "read_write",
+                    isolation = "worktree",
+                    file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
+                ),
+                agent(
+                    id = "architecture-review",
+                    prompt = "Review cache routing boundaries and identify replay or provider risks.",
                    agent_type = "explore",
-                    file_scope = ["crates/tui/src/providers/**"],
+                    file_scope = [
+                        "crates/tui/src/providers/**",
+                        "crates/tui/src/rlm/**",
+                    ],
                ),
            ],
        ),
        sequence(
-            id = "verify-and-summarize",
+            id = "verify-select-and-summarize",
            children = [
-                test(
-                    id = "run-rlm-tests",
-                    command = "cargo test -p codewhale-tui rlm --locked",
-                    file_scope = ["crates/tui/src/rlm/**"],
+                loop_until(
+                    id = "implement-until-tests-pass",
+                    condition = "regression tests pass",
+                    max_iterations = 2,
+                    children = [
+                        test(
+                            id = "regression-tests",
+                            command = "cargo test -p codewhale-tui rlm --locked",
+                            file_scope = ["crates/tui/src/rlm/**"],
+                        ),
+                    ],
+                ),
+                tournament(
+                    id = "select-maintainer-slice",
+                    candidates = [
+                        "minimal-patch",
+                        "regression-tests",
+                        "architecture-review",
+                    ],
+                ),
+                teacher_review(
+                    id = "teacher-review",
+                    candidates = ["select-maintainer-slice"],
                ),
                reduce(
                    id = "summarize-cache-change",
                    inputs = [
                        "find-cache-surfaces",
-                        "inspect-provider-cache",
-                        "run-rlm-tests",
+                        "minimal-patch",
+                        "architecture-review",
+                        "regression-tests",
+                        "teacher-review",
                    ],
                    prompt = "Summarize the smallest safe cache-routing patch.",
                ),