feat(whaleflow): run dogfood workflow with mock executor (#2831)

This commit is contained in:
Hunter Bown
2026-06-05 22:17:02 -07:00
committed by GitHub
parent 5044a29db8
commit e5974aa850
4 changed files with 113 additions and 15 deletions
+4 -1
View File
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
command and live-provider replay fallback remain deferred. The crate also now
has a model-agnostic role/capability registry with mock provider plumbing and
fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
roles without hardcoding provider-specific runtime paths (#2672).
roles without hardcoding provider-specific runtime paths (#2672). The
`rlm_cache_change.star` dogfood workflow now exercises candidate branches,
LoopUntil verification, tournament selection, teacher review, and mock
execution in CI-oriented crate tests (#2679).
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
workflow, branch, leaf, control-node, and teacher-candidate runs. The
+4 -1
View File
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
command and live-provider replay fallback remain deferred. The crate also now
has a model-agnostic role/capability registry with mock provider plumbing and
fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
roles without hardcoding provider-specific runtime paths (#2672).
roles without hardcoding provider-specific runtime paths (#2672). The
`rlm_cache_change.star` dogfood workflow now exercises candidate branches,
LoopUntil verification, tournament selection, teacher review, and mock
execution in CI-oriented crate tests (#2679).
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
workflow, branch, leaf, control-node, and teacher-candidate runs. The
+62 -2
View File
@@ -413,7 +413,7 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
#[cfg(test)]
mod tests {
use super::*;
use crate::{AgentType, ControlNodeKind};
use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
#[test]
fn starlark_compiles_to_ir() {
@@ -426,7 +426,7 @@ mod tests {
let WorkflowNode::BranchSet(branch) = &workflow.nodes[0] else {
panic!("first node should be a branch set");
};
assert_eq!(branch.id, "discover");
assert_eq!(branch.id, "candidate-branches");
assert!(branch.parallel);
let WorkflowNode::Leaf(leaf) = &branch.children[0] else {
panic!("first branch child should be a leaf");
@@ -434,6 +434,33 @@ mod tests {
assert_eq!(leaf.agent_type, AgentType::ToolAgent);
}
#[test]
fn rlm_cache_change_workflow_runs_with_mock_provider() {
    // Compile the checked-in dogfood workflow script; it is embedded at build
    // time, so this test needs no filesystem access when it runs.
    let workflow = compile_starlark_workflow(
        "rlm_cache_change.star",
        include_str!("../../../workflows/rlm_cache_change.star"),
    )
    .expect("example should compile");

    // Script a single `true` predicate result for the loop node.
    // NOTE(review): presumably this makes the loop's condition pass on its
    // first check -- confirm against `MockWorkflowExecutor`.
    let mut mock = MockWorkflowExecutor::new()
        .with_predicate_results("implement-until-tests-pass", vec![true]);
    let run = mock
        .run(&workflow)
        .expect("dogfood workflow should run with mock leaves");

    assert_eq!(run.status, WorkflowRunStatus::Succeeded);

    // A leaf result must have been recorded for the regression-test leaf.
    let saw_regression_leaf = run
        .leaf_results
        .iter()
        .any(|leaf| leaf.leaf_id == "regression-tests");
    assert!(saw_regression_leaf);

    // A control-node result must have been recorded for the teacher review.
    let saw_teacher_review = run
        .control_node_results
        .iter()
        .any(|node| node.node_id == "teacher-review");
    assert!(saw_teacher_review);
}
#[test]
fn starlark_repair_loop() {
let source = r#"
@@ -455,6 +482,39 @@ workflow(
assert!(matches!(workflow.nodes[0], WorkflowNode::BranchSet(_)));
}
#[test]
fn starlark_generated_workflow_repairs_then_runs() {
    // Workflow script written with `ctx.parallel` / `ctx.loop_until` call
    // spellings; it is fed to the repairing compile entry point below.
    let generated = r#"
workflow(
id = "generated-repair-run",
goal = "repair generated workflow aliases",
nodes = [
ctx.parallel(id = "discover", children = [
agent(id = "scan", prompt = "scan repo"),
]),
ctx.loop_until(
id = "verify",
condition = "checks pass",
max_iterations = 1,
children = [
test(id = "run-tests", command = "cargo test -p codewhale-whaleflow --locked"),
],
),
],
)
"#;

    let repaired = compile_starlark_workflow_with_repair("generated.star", generated)
        .expect("repair should produce runnable IR");

    // Script a single `true` predicate result for the `verify` loop node.
    let mut mock = MockWorkflowExecutor::new().with_predicate_results("verify", vec![true]);
    let outcome = mock
        .run(&repaired)
        .expect("repaired workflow should run with mock leaves");

    assert_eq!(outcome.status, WorkflowRunStatus::Succeeded);
    // The script declares two leaves (`scan` and `run-tests`); presumably each
    // contributes exactly one leaf result.
    assert_eq!(outcome.leaf_results.len(), 2);
}
#[test]
fn invalid_workflow_rejected() {
let source = r#"
+43 -11
View File
@@ -3,7 +3,7 @@ workflow(
goal = "Evaluate an RLM/cache routing change with safe mock WhaleFlow IR",
nodes = [
branch(
id = "discover",
id = "candidate-branches",
parallel = True,
children = [
search(
@@ -12,27 +12,59 @@ workflow(
file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
),
agent(
id = "inspect-provider-cache",
prompt = "Inspect provider cache behavior without editing files.",
id = "minimal-patch",
prompt = "Draft the smallest safe cache-routing patch using shared ARMH context.",
agent_type = "implementer",
mode = "read_write",
isolation = "worktree",
file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
),
agent(
id = "architecture-review",
prompt = "Review cache routing boundaries and identify replay or provider risks.",
agent_type = "explore",
file_scope = ["crates/tui/src/providers/**"],
file_scope = [
"crates/tui/src/providers/**",
"crates/tui/src/rlm/**",
],
),
],
),
sequence(
id = "verify-and-summarize",
id = "verify-select-and-summarize",
children = [
test(
id = "run-rlm-tests",
command = "cargo test -p codewhale-tui rlm --locked",
file_scope = ["crates/tui/src/rlm/**"],
loop_until(
id = "implement-until-tests-pass",
condition = "regression tests pass",
max_iterations = 2,
children = [
test(
id = "regression-tests",
command = "cargo test -p codewhale-tui rlm --locked",
file_scope = ["crates/tui/src/rlm/**"],
),
],
),
tournament(
id = "select-maintainer-slice",
candidates = [
"minimal-patch",
"regression-tests",
"architecture-review",
],
),
teacher_review(
id = "teacher-review",
candidates = ["select-maintainer-slice"],
),
reduce(
id = "summarize-cache-change",
inputs = [
"find-cache-surfaces",
"inspect-provider-cache",
"run-rlm-tests",
"minimal-patch",
"architecture-review",
"regression-tests",
"teacher-review",
],
prompt = "Summarize the smallest safe cache-routing patch.",
),