feat(whaleflow): run dogfood workflow with mock executor (#2831)
This commit is contained in:
+4
-1
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
command and live-provider replay fallback remain deferred. The crate also now
|
||||
has a model-agnostic role/capability registry with mock provider plumbing and
|
||||
fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
|
||||
roles without hardcoding provider-specific runtime paths (#2672).
|
||||
roles without hardcoding provider-specific runtime paths (#2672). The
|
||||
`rlm_cache_change.star` dogfood workflow now exercises candidate branches,
|
||||
LoopUntil verification, tournament selection, teacher review, and mock
|
||||
execution in CI-oriented crate tests (#2679).
|
||||
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
|
||||
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
|
||||
workflow, branch, leaf, control-node, and teacher-candidate runs. The
|
||||
|
||||
@@ -56,7 +56,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
command and live-provider replay fallback remain deferred. The crate also now
|
||||
has a model-agnostic role/capability registry with mock provider plumbing and
|
||||
fail-closed JSON repair parsing, so WhaleFlow can choose capable models for
|
||||
roles without hardcoding provider-specific runtime paths (#2672).
|
||||
roles without hardcoding provider-specific runtime paths (#2672). The
|
||||
`rlm_cache_change.star` dogfood workflow now exercises candidate branches,
|
||||
LoopUntil verification, tournament selection, teacher review, and mock
|
||||
execution in CI-oriented crate tests (#2679).
|
||||
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
|
||||
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
|
||||
workflow, branch, leaf, control-node, and teacher-candidate runs. The
|
||||
|
||||
@@ -413,7 +413,7 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{AgentType, ControlNodeKind};
|
||||
use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
|
||||
|
||||
#[test]
|
||||
fn starlark_compiles_to_ir() {
|
||||
@@ -426,7 +426,7 @@ mod tests {
|
||||
let WorkflowNode::BranchSet(branch) = &workflow.nodes[0] else {
|
||||
panic!("first node should be a branch set");
|
||||
};
|
||||
assert_eq!(branch.id, "discover");
|
||||
assert_eq!(branch.id, "candidate-branches");
|
||||
assert!(branch.parallel);
|
||||
let WorkflowNode::Leaf(leaf) = &branch.children[0] else {
|
||||
panic!("first branch child should be a leaf");
|
||||
@@ -434,6 +434,33 @@ mod tests {
|
||||
assert_eq!(leaf.agent_type, AgentType::ToolAgent);
|
||||
}
|
||||
|
||||
// Dogfood test: compiles the checked-in `rlm_cache_change.star` workflow and
// runs it through `MockWorkflowExecutor`, then checks the run status and that
// specific leaf / control-node results were recorded.
#[test]
|
||||
fn rlm_cache_change_workflow_runs_with_mock_provider() {
|
||||
// `include_str!` embeds the workflow text at compile time; the path is
// resolved relative to this source file.
let source = include_str!("../../../workflows/rlm_cache_change.star");
|
||||
let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
|
||||
.expect("example should compile");
|
||||
// "implement-until-tests-pass" is the id of the `loop_until` node in the
// workflow file. NOTE(review): presumably `vec![true]` scripts that node's
// predicate to pass on its first evaluation — confirm against
// `MockWorkflowExecutor::with_predicate_results`.
let mut executor = MockWorkflowExecutor::new()
|
||||
.with_predicate_results("implement-until-tests-pass", vec![true]);
|
||||
|
||||
let execution = executor
|
||||
.run(&workflow)
|
||||
.expect("dogfood workflow should run with mock leaves");
|
||||
|
||||
assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
|
||||
// "regression-tests" is the `test` leaf declared inside the loop_until node,
// so a recorded result for it shows the loop body was executed.
assert!(
|
||||
execution
|
||||
.leaf_results
|
||||
.iter()
|
||||
.any(|result| result.leaf_id == "regression-tests")
|
||||
);
|
||||
// "teacher-review" is the id of the workflow's `teacher_review` node; it is
// expected among the control-node results rather than the leaf results.
assert!(
|
||||
execution
|
||||
.control_node_results
|
||||
.iter()
|
||||
.any(|result| result.node_id == "teacher-review")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn starlark_repair_loop() {
|
||||
let source = r#"
|
||||
@@ -455,6 +482,39 @@ workflow(
|
||||
assert!(matches!(workflow.nodes[0], WorkflowNode::BranchSet(_)));
|
||||
}
|
||||
|
||||
// End-to-end check of the repair path: an inline Starlark workflow is compiled
// via `compile_starlark_workflow_with_repair` and the resulting IR is then run
// through `MockWorkflowExecutor`.
#[test]
|
||||
fn starlark_generated_workflow_repairs_then_runs() {
|
||||
// NOTE(review): the source below uses `ctx.parallel(...)` / `ctx.loop_until(...)`
// rather than the `branch(...)` / `loop_until(...)` forms; presumably these are
// the "generated workflow aliases" the repair step rewrites — confirm against
// the repair implementation.
let source = r#"
|
||||
workflow(
|
||||
id = "generated-repair-run",
|
||||
goal = "repair generated workflow aliases",
|
||||
nodes = [
|
||||
ctx.parallel(id = "discover", children = [
|
||||
agent(id = "scan", prompt = "scan repo"),
|
||||
]),
|
||||
ctx.loop_until(
|
||||
id = "verify",
|
||||
condition = "checks pass",
|
||||
max_iterations = 1,
|
||||
children = [
|
||||
test(id = "run-tests", command = "cargo test -p codewhale-whaleflow --locked"),
|
||||
],
|
||||
|
||||
),
|
||||
],
|
||||
)
|
||||
"#;
|
||||
let workflow = compile_starlark_workflow_with_repair("generated.star", source)
|
||||
.expect("repair should produce runnable IR");
|
||||
// "verify" is the id of the loop node declared in the source above.
// NOTE(review): presumably `vec![true]` makes its predicate pass on the first
// evaluation — confirm against `MockWorkflowExecutor::with_predicate_results`.
let mut executor = MockWorkflowExecutor::new().with_predicate_results("verify", vec![true]);
|
||||
|
||||
let execution = executor
|
||||
.run(&workflow)
|
||||
.expect("repaired workflow should run with mock leaves");
|
||||
|
||||
assert_eq!(execution.status, WorkflowRunStatus::Succeeded);
|
||||
// NOTE(review): the expected count of 2 presumably corresponds to the `scan`
// agent and the `run-tests` test, each executed once — verify.
assert_eq!(execution.leaf_results.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_workflow_rejected() {
|
||||
let source = r#"
|
||||
|
||||
@@ -3,7 +3,7 @@ workflow(
|
||||
goal = "Evaluate an RLM/cache routing change with safe mock WhaleFlow IR",
|
||||
nodes = [
|
||||
branch(
|
||||
id = "discover",
|
||||
id = "candidate-branches",
|
||||
parallel = True,
|
||||
children = [
|
||||
search(
|
||||
@@ -12,27 +12,59 @@ workflow(
|
||||
file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
|
||||
),
|
||||
agent(
|
||||
id = "inspect-provider-cache",
|
||||
prompt = "Inspect provider cache behavior without editing files.",
|
||||
id = "minimal-patch",
|
||||
prompt = "Draft the smallest safe cache-routing patch using shared ARMH context.",
|
||||
agent_type = "implementer",
|
||||
mode = "read_write",
|
||||
isolation = "worktree",
|
||||
file_scope = ["crates/tui/src/rlm/**", "crates/tui/src/core/**"],
|
||||
),
|
||||
agent(
|
||||
id = "architecture-review",
|
||||
prompt = "Review cache routing boundaries and identify replay or provider risks.",
|
||||
agent_type = "explore",
|
||||
file_scope = ["crates/tui/src/providers/**"],
|
||||
file_scope = [
|
||||
"crates/tui/src/providers/**",
|
||||
"crates/tui/src/rlm/**",
|
||||
],
|
||||
),
|
||||
],
|
||||
),
|
||||
sequence(
|
||||
id = "verify-and-summarize",
|
||||
id = "verify-select-and-summarize",
|
||||
children = [
|
||||
test(
|
||||
id = "run-rlm-tests",
|
||||
command = "cargo test -p codewhale-tui rlm --locked",
|
||||
file_scope = ["crates/tui/src/rlm/**"],
|
||||
loop_until(
|
||||
id = "implement-until-tests-pass",
|
||||
condition = "regression tests pass",
|
||||
max_iterations = 2,
|
||||
children = [
|
||||
test(
|
||||
id = "regression-tests",
|
||||
command = "cargo test -p codewhale-tui rlm --locked",
|
||||
file_scope = ["crates/tui/src/rlm/**"],
|
||||
),
|
||||
],
|
||||
),
|
||||
tournament(
|
||||
id = "select-maintainer-slice",
|
||||
candidates = [
|
||||
"minimal-patch",
|
||||
"regression-tests",
|
||||
"architecture-review",
|
||||
],
|
||||
),
|
||||
teacher_review(
|
||||
id = "teacher-review",
|
||||
candidates = ["select-maintainer-slice"],
|
||||
),
|
||||
reduce(
|
||||
id = "summarize-cache-change",
|
||||
inputs = [
|
||||
"find-cache-surfaces",
|
||||
"inspect-provider-cache",
|
||||
"run-rlm-tests",
|
||||
"minimal-patch",
|
||||
"architecture-review",
|
||||
"regression-tests",
|
||||
"teacher-review",
|
||||
],
|
||||
prompt = "Summarize the smallest safe cache-routing patch.",
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user