test(whaleflow): replay dogfood workflow from recorded trace
Add recorded mock-trace replay coverage for workflows/rlm_cache_change.star and prove missing dogfood records produce ReplayDiverged instead of live fallback.\n\nVerification:\n- cargo test -p codewhale-whaleflow rlm_cache_change --locked\n- cargo fmt --all --check\n- git diff --check\n- cmp -s CHANGELOG.md crates/tui/CHANGELOG.md\n- ./scripts/release/check-versions.sh\n- ./scripts/release/check-ohos-deps.sh
This commit is contained in:
+4
-1
@@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
|
||||
docs alignment. Live workflow execution, provider calls, TraceStore writes,
|
||||
and mutation-oriented GUI endpoints remain deferred until their atomicity and
|
||||
replay contracts are tested.
|
||||
replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
|
||||
now be replayed from recorded mock leaf/control records, and missing dogfood
|
||||
records produce `ReplayDiverged` instead of falling back to live execution
|
||||
(#2679).
|
||||
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
|
||||
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
|
||||
workflow, branch, leaf, control-node, and teacher-candidate runs. The
|
||||
|
||||
@@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
|
||||
docs alignment. Live workflow execution, provider calls, TraceStore writes,
|
||||
and mutation-oriented GUI endpoints remain deferred until their atomicity and
|
||||
replay contracts are tested.
|
||||
replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
|
||||
now be replayed from recorded mock leaf/control records, and missing dogfood
|
||||
records produce `ReplayDiverged` instead of falling back to live execution
|
||||
(#2679).
|
||||
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
|
||||
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
|
||||
workflow, branch, leaf, control-node, and teacher-candidate runs. The
|
||||
|
||||
@@ -416,7 +416,13 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use crate::{
|
||||
AgentType, ControlNodeKind, LeafResult, MockWorkflowExecutor, ReplayControlRecord,
|
||||
ReplayLeafRecord, WorkflowReplayExecutor, WorkflowReplayTrace, WorkflowRunStatus,
|
||||
compute_leaf_input_hash,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn starlark_compiles_to_ir() {
|
||||
@@ -464,6 +470,187 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rlm_cache_change_workflow_replays_from_recorded_mock_trace() {
|
||||
let source = include_str!("../../../workflows/rlm_cache_change.star");
|
||||
let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
|
||||
.expect("example should compile");
|
||||
let execution = MockWorkflowExecutor::new()
|
||||
.with_predicate_results("implement-until-tests-pass", vec![true])
|
||||
.run(&workflow)
|
||||
.expect("dogfood workflow should run with mock leaves");
|
||||
let trace = replay_trace_from_execution("trace-rlm-cache", &workflow, &execution);
|
||||
|
||||
let replayed = WorkflowReplayExecutor::new(trace)
|
||||
.run(&workflow)
|
||||
.expect("recorded dogfood trace should replay");
|
||||
|
||||
assert_eq!(replayed.status, WorkflowRunStatus::Succeeded);
|
||||
assert!(
|
||||
replayed
|
||||
.leaf_results
|
||||
.iter()
|
||||
.any(|result| result.leaf_id == "regression-tests")
|
||||
);
|
||||
assert!(
|
||||
replayed
|
||||
.control_node_results
|
||||
.iter()
|
||||
.any(|result| result.node_id == "teacher-review")
|
||||
);
|
||||
assert!(
|
||||
replayed
|
||||
.control_node_results
|
||||
.iter()
|
||||
.any(|result| result.node_id == "summarize-cache-change")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rlm_cache_change_replay_diverges_when_record_missing() {
|
||||
let source = include_str!("../../../workflows/rlm_cache_change.star");
|
||||
let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
|
||||
.expect("example should compile");
|
||||
let execution = MockWorkflowExecutor::new()
|
||||
.with_predicate_results("implement-until-tests-pass", vec![true])
|
||||
.run(&workflow)
|
||||
.expect("dogfood workflow should run with mock leaves");
|
||||
let mut trace =
|
||||
replay_trace_from_execution("trace-rlm-cache-missing", &workflow, &execution);
|
||||
trace
|
||||
.leaf_records
|
||||
.retain(|record| record.leaf_id != "regression-tests");
|
||||
|
||||
let replayed = WorkflowReplayExecutor::new(trace)
|
||||
.run(&workflow)
|
||||
.expect("missing dogfood leaf record should be a replay result");
|
||||
|
||||
assert_eq!(replayed.status, WorkflowRunStatus::ReplayDiverged);
|
||||
assert!(replayed.leaf_results.iter().any(|result| {
|
||||
result.leaf_id == "regression-tests"
|
||||
&& result.status == WorkflowRunStatus::ReplayDiverged
|
||||
}));
|
||||
}
|
||||
|
||||
fn replay_trace_from_execution(
|
||||
trace_id: &str,
|
||||
workflow: &WorkflowSpec,
|
||||
execution: &crate::WorkflowExecution,
|
||||
) -> WorkflowReplayTrace {
|
||||
let mut resolved_outputs = BTreeMap::new();
|
||||
let mut leaf_records = Vec::new();
|
||||
collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&workflow.nodes,
|
||||
&execution.leaf_results,
|
||||
&mut resolved_outputs,
|
||||
&mut leaf_records,
|
||||
);
|
||||
let control_records = execution
|
||||
.control_node_results
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|result| ReplayControlRecord {
|
||||
trace_id: trace_id.to_string(),
|
||||
node_id: result.node_id.clone(),
|
||||
kind: result.kind,
|
||||
result,
|
||||
generated_nodes: Vec::new(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
WorkflowReplayTrace {
|
||||
trace_id: trace_id.to_string(),
|
||||
leaf_records,
|
||||
control_records,
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_leaf_records(
|
||||
trace_id: &str,
|
||||
workflow: &WorkflowSpec,
|
||||
nodes: &[WorkflowNode],
|
||||
results: &[LeafResult],
|
||||
resolved_outputs: &mut BTreeMap<String, Option<String>>,
|
||||
records: &mut Vec<ReplayLeafRecord>,
|
||||
) {
|
||||
for node in nodes {
|
||||
match node {
|
||||
WorkflowNode::BranchSet(branch) => collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&branch.children,
|
||||
results,
|
||||
resolved_outputs,
|
||||
records,
|
||||
),
|
||||
WorkflowNode::Leaf(leaf) => {
|
||||
let result = results
|
||||
.iter()
|
||||
.find(|result| result.leaf_id == leaf.id)
|
||||
.expect("mock execution should record every declared leaf")
|
||||
.clone();
|
||||
let resolved_inputs = leaf
|
||||
.depends_on_results
|
||||
.iter()
|
||||
.map(|dependency| {
|
||||
(
|
||||
dependency.clone(),
|
||||
resolved_outputs.get(dependency).cloned().unwrap_or(None),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
records.push(ReplayLeafRecord {
|
||||
trace_id: trace_id.to_string(),
|
||||
leaf_id: leaf.id.clone(),
|
||||
input_hash: compute_leaf_input_hash(workflow, leaf, &resolved_inputs)
|
||||
.expect("leaf input hash should serialize"),
|
||||
result: result.clone(),
|
||||
});
|
||||
resolved_outputs.insert(leaf.id.clone(), result.output);
|
||||
}
|
||||
WorkflowNode::Sequence(sequence) => collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&sequence.children,
|
||||
results,
|
||||
resolved_outputs,
|
||||
records,
|
||||
),
|
||||
WorkflowNode::LoopUntil(loop_until) => collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&loop_until.children,
|
||||
results,
|
||||
resolved_outputs,
|
||||
records,
|
||||
),
|
||||
WorkflowNode::Cond(cond) => {
|
||||
collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&cond.then_nodes,
|
||||
results,
|
||||
resolved_outputs,
|
||||
records,
|
||||
);
|
||||
collect_leaf_records(
|
||||
trace_id,
|
||||
workflow,
|
||||
&cond.else_nodes,
|
||||
results,
|
||||
resolved_outputs,
|
||||
records,
|
||||
);
|
||||
}
|
||||
WorkflowNode::Expand(_)
|
||||
| WorkflowNode::Reduce(_)
|
||||
| WorkflowNode::TeacherReview(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn starlark_repair_loop() {
|
||||
let source = r#"
|
||||
|
||||
@@ -59,7 +59,7 @@ config source, result, and follow-up issue or PR.
|
||||
|
||||
| Gate | Owner | Ship/defer decision | Evidence |
|
||||
| --- | --- | --- | --- |
|
||||
| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. |
|
||||
| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. The `rlm_cache_change.star` dogfood workflow now has recorded mock-trace replay coverage, including a missing-record divergence check. |
|
||||
| Live `workflow_run`, worktree application, provider calls, and TraceStore writes are deferred until cancellation/replay/atomicity semantics pass | WhaleFlow steward | defer | #2669 and #2679 remain open for live runtime execution, provider calls, TraceStore writes, Arcee/student replay, and CLI/TUI workflow mode; current v0.9 branch ships mock executor/replay foundations only. |
|
||||
| Model Lab / Hugging Face MVP is included or deferred with release-note wording | model-lab steward | decide | |
|
||||
| HarnessProfile runtime MVP is deferred; schema/resolver foundation ships with release-note wording | harness steward | ship foundation / defer runtime | #2844 (`efbcc681a`) documents the cutline; `HarnessPosture` / `HarnessProfile` config schema and strict validation are present; a pure resolver matches provider/model routes without changing runtime behavior; seed-profile runtime selection, telemetry, and status display remain follow-up work. |
|
||||
|
||||
Reference in New Issue
Block a user