From 14d14f56deeec478e1ff35c174503b54d3a481ea Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Fri, 5 Jun 2026 23:08:10 -0700 Subject: [PATCH] feat(whaleflow): add teacher candidate artifacts (#2839) --- CHANGELOG.md | 6 +- crates/tui/CHANGELOG.md | 6 +- crates/whaleflow/src/lib.rs | 297 ++++++++++++++++++++++++++++++++++++ 3 files changed, 307 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b73d00dd..241e530c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,7 +78,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Starlark and typed-IR gates now also reject unknown leaf dependencies, reducer inputs, and teacher-review candidates before mock execution or replay, keeping generated workflows fail-closed while runtime/worktree semantics stay - deferred. + deferred. TeacherReview now has serializable GEPA-style candidate artifacts + for notes, workflow recipes, skills, regression tests, cache policy, branch + heuristics, and Starlark authoring prompt patches, plus an offline helper + that proposes candidates from recorded execution traces without promoting + them or training model weights (#2674). Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction. - Added a state-store v2 schema migration for WhaleFlow trace tables covering workflow, branch, leaf, control-node, and teacher-candidate runs. The diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index b73d00dd..241e530c 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -78,7 +78,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Starlark and typed-IR gates now also reject unknown leaf dependencies, reducer inputs, and teacher-review candidates before mock execution or replay, keeping generated workflows fail-closed while runtime/worktree semantics stay - deferred. + deferred. 
TeacherReview now has serializable GEPA-style candidate artifacts + for notes, workflow recipes, skills, regression tests, cache policy, branch + heuristics, and Starlark authoring prompt patches, plus an offline helper + that proposes candidates from recorded execution traces without promoting + them or training model weights (#2674). Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction. - Added a state-store v2 schema migration for WhaleFlow trace tables covering workflow, branch, leaf, control-node, and teacher-candidate runs. The diff --git a/crates/whaleflow/src/lib.rs b/crates/whaleflow/src/lib.rs index 68ef4ac2..7540afdc 100644 --- a/crates/whaleflow/src/lib.rs +++ b/crates/whaleflow/src/lib.rs @@ -911,6 +911,202 @@ pub struct BranchCandidate { pub diversity_key: Option, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TeacherCandidateKind { + Note, + WorkflowRecipe, + SkillPatch, + RegressionTest, + CachePolicyPatch, + BranchHeuristic, + StarlarkAuthoringPromptPatch, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "snake_case")] +pub enum TeacherCandidateStatus { + #[default] + Proposed, + Accepted, + Rejected, + Promoted, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TeacherCandidate { + pub candidate_id: String, + pub kind: TeacherCandidateKind, + #[serde(default)] + pub status: TeacherCandidateStatus, + pub source_node_id: String, + #[serde(default)] + pub source_branch_id: Option, + pub summary: String, + #[serde(default)] + pub evidence: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct TeacherReviewReport { + pub review_node_id: String, + #[serde(default)] + pub candidates: Vec, +} + +impl TeacherReviewReport { + pub fn from_execution(review: &TeacherReviewSpec, execution: &WorkflowExecution) -> Self { + let candidates = 
teacher_candidates_from_execution(review, execution); + Self { + review_node_id: review.id.clone(), + candidates, + } + } +} + +pub fn teacher_candidates_from_execution( + review: &TeacherReviewSpec, + execution: &WorkflowExecution, +) -> Vec { + let mut candidates = Vec::new(); + for source in &review.candidates { + if let Some(branch) = execution + .branch_results + .iter() + .find(|branch| branch.branch_id == *source || branch.task_id == *source) + { + candidates.push(teacher_candidate_from_branch(review, branch)); + continue; + } + if let Some(leaf) = execution + .leaf_results + .iter() + .find(|leaf| leaf.leaf_id == *source || leaf.task_id == *source) + { + candidates.push(teacher_candidate_from_leaf(review, leaf)); + continue; + } + if let Some(control) = execution + .control_node_results + .iter() + .find(|control| control.node_id == *source) + { + candidates.push(teacher_candidate_from_control(review, control)); + } + } + candidates +} + +fn teacher_candidate_from_branch( + review: &TeacherReviewSpec, + branch: &BranchResult, +) -> TeacherCandidate { + let kind = + if branch.memo_usage.armh_hits > 0 || branch.memo_usage.provider_prompt_cache_hits > 0 { + TeacherCandidateKind::CachePolicyPatch + } else if branch.status == WorkflowRunStatus::Succeeded { + TeacherCandidateKind::WorkflowRecipe + } else { + TeacherCandidateKind::BranchHeuristic + }; + let mut evidence = vec![format!("status={:?}", branch.status)]; + if branch.usage.total_tokens() > 0 || branch.usage.cost_microusd > 0 { + evidence.push(format!( + "tokens={}, cost_microusd={}", + branch.usage.total_tokens(), + branch.usage.cost_microusd + )); + } + if branch.memo_usage.armh_hits > 0 || branch.memo_usage.provider_prompt_cache_hits > 0 { + evidence.push(format!( + "armh_hits={}, provider_prompt_cache_hits={}", + branch.memo_usage.armh_hits, branch.memo_usage.provider_prompt_cache_hits + )); + } + if let Some(notes) = branch.notes.as_deref() { + evidence.push(format!("notes={notes}")); + } + 
TeacherCandidate { + candidate_id: format!("{}:{}", review.id, branch.branch_id), + kind, + status: TeacherCandidateStatus::Proposed, + source_node_id: branch.task_id.clone(), + source_branch_id: Some(branch.branch_id.clone()), + summary: format!( + "TeacherReview candidate from branch `{}` with {:?} status.", + branch.branch_id, branch.status + ), + evidence, + } +} + +fn teacher_candidate_from_leaf(review: &TeacherReviewSpec, leaf: &LeafResult) -> TeacherCandidate { + let kind = if leaf.status == WorkflowRunStatus::Failed { + TeacherCandidateKind::RegressionTest + } else if leaf.memo_usage.armh_hits > 0 || leaf.memo_usage.provider_prompt_cache_hits > 0 { + TeacherCandidateKind::CachePolicyPatch + } else { + TeacherCandidateKind::Note + }; + let mut evidence = vec![format!("status={:?}", leaf.status)]; + if let Some(output) = leaf.output.as_deref() { + evidence.push(format!("output={}", truncate_evidence(output))); + } + TeacherCandidate { + candidate_id: format!("{}:{}", review.id, leaf.leaf_id), + kind, + status: TeacherCandidateStatus::Proposed, + source_node_id: leaf.leaf_id.clone(), + source_branch_id: None, + summary: format!( + "TeacherReview candidate from leaf `{}` with {:?} status.", + leaf.leaf_id, leaf.status + ), + evidence, + } +} + +fn teacher_candidate_from_control( + review: &TeacherReviewSpec, + control: &ControlNodeResult, +) -> TeacherCandidate { + let mut evidence = vec![format!("status={:?}", control.status)]; + if !control.selected_children.is_empty() { + evidence.push(format!( + "selected_children={}", + control.selected_children.join(",") + )); + } + if let Some(summary) = control.summary.as_deref() { + evidence.push(format!("summary={}", truncate_evidence(summary))); + } + TeacherCandidate { + candidate_id: format!("{}:{}", review.id, control.node_id), + kind: TeacherCandidateKind::StarlarkAuthoringPromptPatch, + status: TeacherCandidateStatus::Proposed, + source_node_id: control.node_id.clone(), + source_branch_id: None, + summary: 
/// Bounds an evidence string to at most 240 characters.
///
/// Values that already fit are returned unchanged. Longer values are cut on a
/// character boundary and suffixed with `"..."`; the suffix counts toward the
/// limit, so the result is exactly 240 characters long.
fn truncate_evidence(value: &str) -> String {
    const MAX_EVIDENCE_CHARS: usize = 240;
    const ELLIPSIS: &str = "...";
    // `ELLIPSIS` is ASCII, so its byte length equals its character count.
    const KEPT_CHARS: usize = MAX_EVIDENCE_CHARS - ELLIPSIS.len();

    // A character at index MAX_EVIDENCE_CHARS means the value is too long.
    if value.chars().nth(MAX_EVIDENCE_CHARS).is_none() {
        return value.to_string();
    }

    // Byte offset at which the first dropped character starts; slicing here is
    // always on a UTF-8 boundary.
    let cut = value
        .char_indices()
        .nth(KEPT_CHARS)
        .map_or(value.len(), |(offset, _)| offset);

    let mut truncated = String::with_capacity(cut + ELLIPSIS.len());
    truncated.push_str(&value[..cut]);
    truncated.push_str(ELLIPSIS);
    truncated
}
/// A failed leaf referenced by the review must surface as a proposed
/// `RegressionTest` candidate whose evidence carries the leaf output.
#[test]
fn failed_leaf_becomes_regression_test_candidate() {
    let failing_leaf = LeafResult {
        leaf_id: "verify-failure".to_string(),
        task_id: "verify-failure".to_string(),
        status: WorkflowRunStatus::Failed,
        usage: WorkflowUsage::default(),
        memo_usage: WorkflowMemoUsage::default(),
        output: Some("cargo test failed with a replay mismatch".to_string()),
        artifacts: Vec::new(),
    };
    let execution = WorkflowExecution {
        leaf_results: vec![failing_leaf],
        ..WorkflowExecution::default()
    };
    let review = TeacherReviewSpec {
        id: "teacher-review".to_string(),
        candidates: vec!["verify-failure".to_string()],
        promotion_policy: PromotionPolicy::default(),
    };

    let candidates = teacher_candidates_from_execution(&review, &execution);

    assert_eq!(candidates.len(), 1);
    let candidate = &candidates[0];
    assert_eq!(candidate.kind, TeacherCandidateKind::RegressionTest);
    assert_eq!(candidate.status, TeacherCandidateStatus::Proposed);
    let mentions_failure = candidate
        .evidence
        .iter()
        .any(|line| line.contains("cargo test failed with a replay mismatch"));
    assert!(mentions_failure);
}