From ad3d61936bc62039ca5358e35986fc8a18d66e39 Mon Sep 17 00:00:00 2001 From: Hunter B Date: Wed, 3 Jun 2026 23:27:20 -0700 Subject: [PATCH] feat(subagent): preserve checkpoints for timeout continuation Refs #2029. Reported by @qiyuanlicn. This lands live per-step API-timeout checkpoint continuation and preserves checkpoint metadata through projections, transcripts, and persistence; cold-restart child-task rehydration remains out of scope. --- CHANGELOG.md | 9 +- crates/tui/src/tools/subagent/mod.rs | 358 ++++++++++++++++++++++++- crates/tui/src/tools/subagent/tests.rs | 313 +++++++++++++++++++++ crates/tui/src/tui/ui/tests.rs | 1 + crates/tui/src/tui/views/mod.rs | 2 + docs/V0_9_0_EXECUTION_MAP.md | 3 +- 6 files changed, 680 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e730cb41..139c39e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 and Agents rows. Mouse hover opens a bordered, wrapping popover with the full underlying row text, long turn/agent ids, and current sub-agent progress instead of repeating the already-ellipsized sidebar label (#2694, #2734). +- Sub-agents now preserve checkpoint metadata around long model calls. A + per-step API timeout marks the child as interrupted with a continuable + checkpoint instead of ending as a null failed result, and `agent_eval` can + explicitly continue a live checkpointed interrupted child while normal + completed/failed/cancelled follow-up behavior stays unchanged (#2029). - Auto-generated project instructions now reuse the bounded Project Context Pack data instead of running an unbounded summary/tree scan when no `.codewhale/instructions.md` file exists. The fallback keeps later @@ -97,7 +102,9 @@ HarmonyOS/OpenHarmony port and MatePad Edge validation trail (#2634), dense tool-call transcript collapse/sidebar detail direction (#2738, #2734, #2692, #2694), and **@h3c-hexin** for the tool-agent model inheritance and configured -`skills_dir` fixes (#2736, #2737). Thanks also to **@NASLXTO** and +`skills_dir` fixes (#2736, #2737). Thanks also to **@qiyuanlicn** for the +checkpoint/resume report that shaped the sub-agent recovery slice (#2029), +and to **@NASLXTO** and **@wuxixing** for the large-workspace startup reports (#697, #1827), and to **@linzhiqin2003** and **@merchloubna70-dot** for earlier context-cap and startup-diagnosis work that shaped this bounded fallback. diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs index 80bdf3e3..9e528d1f 100644 --- a/crates/tui/src/tools/subagent/mod.rs +++ b/crates/tui/src/tools/subagent/mod.rs @@ -594,6 +594,8 @@ pub struct SubAgentResult { pub status: SubAgentStatus, pub result: Option, pub steps_taken: u32, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint: Option, pub duration_ms: u64, /// `true` when this agent was loaded from a prior-session persisted /// state file rather than spawned in the current session (#405). @@ -691,6 +693,21 @@ struct AssignRequest { interrupt: bool, } +/// Durable recovery point for an interrupted sub-agent session. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SubAgentCheckpoint { + pub checkpoint_id: String, + pub agent_id: String, + pub continuation_handle: String, + pub reason: String, + pub continuable: bool, + pub steps_taken: u32, + pub message_count: usize, + pub created_at_ms: u64, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub messages: Vec, +} + #[derive(Debug, Clone, Serialize, Deserialize)] struct PersistedSubAgent { id: String, @@ -708,6 +725,8 @@ struct PersistedSubAgent { status: SubAgentStatus, result: Option, steps_taken: u32, + #[serde(default, skip_serializing_if = "Option::is_none")] + checkpoint: Option, duration_ms: u64, allowed_tools: Vec, updated_at_ms: u64, @@ -1034,6 +1053,7 @@ pub struct SubAgent { pub status: SubAgentStatus, pub result: Option, pub steps_taken: u32, + pub checkpoint: Option, pub started_at: Instant, pub last_activity_at: Instant, /// `None` = full registry inheritance, with approval-gated tools still @@ -1078,6 +1098,7 @@ impl SubAgent { status: SubAgentStatus::Running, result: None, steps_taken: 0, + checkpoint: None, started_at, last_activity_at: started_at, allowed_tools, @@ -1102,6 +1123,7 @@ impl SubAgent { status: self.status.clone(), result: self.result.clone(), steps_taken: self.steps_taken, + checkpoint: self.checkpoint.clone(), duration_ms: u64::try_from(self.started_at.elapsed().as_millis()).unwrap_or(u64::MAX), // Snapshots from the agent itself don't know the manager's // current boot id, so default to false. The manager fills @@ -1199,6 +1221,7 @@ impl SubAgentManager { status: agent.status.clone(), result: agent.result.clone(), steps_taken: agent.steps_taken, + checkpoint: agent.checkpoint.clone(), duration_ms: u64::try_from(agent.started_at.elapsed().as_millis()) .unwrap_or(u64::MAX), // Backward-compat: Vec on disk. None → empty vec; Some(list) → list. @@ -1278,6 +1301,7 @@ impl SubAgentManager { status, result: persisted.result, steps_taken: persisted.steps_taken, + checkpoint: persisted.checkpoint, started_at, last_activity_at: started_at, allowed_tools, @@ -1848,6 +1872,7 @@ impl SubAgentManager { agent.assignment = result.assignment; agent.result = result.result; agent.steps_taken = result.steps_taken; + agent.checkpoint = result.checkpoint; agent.task_handle = None; changed = true; } @@ -1868,6 +1893,83 @@ impl SubAgentManager { self.persist_state_best_effort(); } } + + fn update_checkpoint(&mut self, agent_id: &str, checkpoint: SubAgentCheckpoint) -> bool { + let Some(agent) = self.agents.get_mut(agent_id) else { + return false; + }; + agent.steps_taken = checkpoint.steps_taken; + agent.checkpoint = Some(checkpoint); + agent.last_activity_at = Instant::now(); + self.persist_state_best_effort(); + true + } + + fn interrupt_with_checkpoint( + &mut self, + agent_id: &str, + reason: String, + checkpoint: SubAgentCheckpoint, + ) -> Result { + let snapshot = { + let agent = self + .agents + .get_mut(agent_id) + .ok_or_else(|| anyhow!("Agent {agent_id} not found"))?; + agent.status = SubAgentStatus::Interrupted(reason.clone()); + agent.result = Some(reason); + agent.steps_taken = checkpoint.steps_taken; + agent.checkpoint = Some(checkpoint); + agent.last_activity_at = Instant::now(); + release_resident_leases_for(agent_id); + agent.snapshot() + }; + self.persist_state_best_effort(); + Ok(snapshot) + } + + fn continue_checkpointed( + &mut self, + agent_id: &str, + message: Option, + interrupt: bool, + ) -> Result { + let snapshot = { + let agent = self + .agents + .get_mut(agent_id) + .ok_or_else(|| anyhow!("Agent {agent_id} not found"))?; + if !matches!(agent.status, SubAgentStatus::Interrupted(_)) { + return Err(anyhow!( + "Agent {agent_id} is not interrupted; checkpoint continuation is only available for interrupted sessions" + )); + } + let checkpoint = agent + .checkpoint + .as_ref() + .ok_or_else(|| anyhow!("Agent {agent_id} has no checkpoint to continue"))?; + if !checkpoint.continuable || checkpoint.messages.is_empty() { + return Err(anyhow!("Agent {agent_id} checkpoint is not continuable")); + } + let tx = agent.input_tx.as_ref().ok_or_else(|| { + anyhow!( + "Agent {agent_id} checkpoint is persisted, but no live child task is available to continue" + ) + })?; + tx.send(SubAgentInput { + text: message.unwrap_or_default(), + interrupt, + }) + .map_err(|_| anyhow!("Failed to continue checkpointed agent {agent_id}"))?; + + agent.status = SubAgentStatus::Running; + agent.result = None; + agent.last_activity_at = Instant::now(); + agent.snapshot() + }; + self.persist_state_best_effort(); + Ok(snapshot) + } } /// Thread-safe wrapper for `SubAgentManager`. @@ -1885,6 +1987,10 @@ pub struct SubAgentSessionProjection { pub prefix_cache: SubAgentPrefixCacheProjection, pub transcript_handle: VarHandle, pub snapshot: SubAgentResult, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint: Option, + #[serde(default, skip_serializing_if = "is_false")] + pub continuable: bool, #[serde(default, skip_serializing_if = "is_false")] pub timed_out: bool, } @@ -1912,6 +2018,14 @@ fn subagent_prefix_cache_projection(snapshot: &SubAgentResult) -> SubAgentPrefix } } +fn subagent_checkpoint_is_continuable(snapshot: &SubAgentResult) -> bool { + matches!(snapshot.status, SubAgentStatus::Interrupted(_)) + && snapshot + .checkpoint + .as_ref() + .is_some_and(|checkpoint| checkpoint.continuable && !checkpoint.messages.is_empty()) +} + async fn subagent_session_projection( snapshot: SubAgentResult, timed_out: bool, @@ -1929,6 +2043,7 @@ async fn subagent_session_projection( "steps_taken": snapshot.steps_taken, "duration_ms": snapshot.duration_ms, "assignment": snapshot.assignment.clone(), + "checkpoint": snapshot.checkpoint.clone(), "snapshot": snapshot.clone(), }); let transcript_handle = { @@ -1960,6 +2075,8 @@ async fn subagent_session_projection( fork_context: snapshot.fork_context, prefix_cache: subagent_prefix_cache_projection(&snapshot), transcript_handle, + checkpoint: snapshot.checkpoint.clone(), + continuable: subagent_checkpoint_is_continuable(&snapshot), snapshot, timed_out, } @@ -2609,7 +2726,7 @@ impl ToolSpec for AgentEvalTool { } fn description(&self) -> &'static str { - "Fetch or wait on a child sub-agent session. Optionally deliver a message/items to a running session, then return the latest session projection. With block=true (default), waits for the session to reach a terminal boundary; block=false is a non-blocking status fetch. Terminal projections expose a handle_read-compatible transcript_handle for the full child transcript." + "Fetch or wait on a child sub-agent session. Optionally deliver a message/items to a running session, then return the latest session projection. With continue=true, resume only a checkpointed interrupted session. With block=true (default), waits for the session to reach a terminal boundary; block=false is a non-blocking status fetch. Terminal projections expose a handle_read-compatible transcript_handle for the full child transcript." } fn input_schema(&self) -> Value { @@ -2649,6 +2766,14 @@ impl ToolSpec for AgentEvalTool { "type": "boolean", "description": "When sending input, prioritize it over pending inputs" }, + "continue": { + "type": "boolean", + "description": "Resume a checkpointed interrupted session. Only valid when the projection has continuable=true." + }, + "resume": { + "type": "boolean", + "description": "Alias for continue" + }, "block": { "type": "boolean", "description": "Wait for a terminal boundary before returning (default true)" @@ -2677,6 +2802,8 @@ impl ToolSpec for AgentEvalTool { .ok_or_else(|| ToolError::missing_field("name"))?; let message = parse_optional_text_or_items(&input, &["message", "input"], "items")?; let interrupt = optional_bool(&input, "interrupt", false); + let continue_from_checkpoint = + optional_bool(&input, "continue", false) || optional_bool(&input, "resume", false); let block = optional_bool(&input, "block", true); let timeout_ms = optional_u64(&input, "timeout_ms", DEFAULT_RESULT_TIMEOUT_MS) .clamp(1000, MAX_RESULT_TIMEOUT_MS); @@ -2696,7 +2823,16 @@ impl ToolSpec for AgentEvalTool { // completed session returns 'not running', no way to recover the full // child output". let mut message_delivery: Option = None; - if let Some(message) = message { + if continue_from_checkpoint { + let mut manager = self.manager.write().await; + manager + .continue_checkpointed(&agent_id, message, interrupt) + .map_err(|e| ToolError::execution_failed(e.to_string()))?; + message_delivery = Some(json!({ + "delivered": true, + "continued_from_checkpoint": true + })); + } else if let Some(message) = message { let terminal = { let manager = self.manager.read().await; manager @@ -3846,6 +3982,7 @@ async fn insert_subagent_full_transcript_handle( assignment: &SubAgentAssignment, status: &SubAgentStatus, result: Option<&String>, + checkpoint: Option<&SubAgentCheckpoint>, messages: &[Message], steps_taken: u32, duration_ms: u64, @@ -3862,12 +3999,50 @@ async fn insert_subagent_full_transcript_handle( "steps_taken": steps_taken, "duration_ms": duration_ms, "assignment": assignment, + "checkpoint": checkpoint, "messages": messages, }); let mut store = runtime.context.runtime.handle_store.lock().await; store.insert_json(format!("agent:{agent_id}"), "full_transcript", payload) } +fn build_subagent_checkpoint( + agent_id: &str, + reason: impl Into, + messages: &[Message], + steps_taken: u32, + continuable: bool, +) -> SubAgentCheckpoint { + let created_at_ms = epoch_millis_now(); + let checkpoint_id = format!("{agent_id}:step:{steps_taken}:ts:{created_at_ms}"); + SubAgentCheckpoint { + checkpoint_id: checkpoint_id.clone(), + agent_id: agent_id.to_string(), + continuation_handle: format!("agent:{agent_id}:checkpoint:{checkpoint_id}"), + reason: reason.into(), + continuable, + steps_taken, + message_count: messages.len(), + created_at_ms, + messages: messages.to_vec(), + } +} + +async fn checkpoint_subagent_progress( + runtime: &SubAgentRuntime, + agent_id: &str, + reason: impl Into, + messages: &[Message], + steps_taken: u32, + continuable: bool, +) -> SubAgentCheckpoint { + let checkpoint = + build_subagent_checkpoint(agent_id, reason, messages, steps_taken, continuable); + let mut manager = runtime.manager.write().await; + manager.update_checkpoint(agent_id, checkpoint.clone()); + checkpoint +} + fn record_agent_progress(runtime: &SubAgentRuntime, agent_id: &str, message: impl Into) { if let Ok(mut manager) = runtime.manager.try_write() { manager.touch(agent_id); @@ -3934,8 +4109,9 @@ async fn run_subagent( let mut final_result: Option = None; let mut pending_inputs: VecDeque = VecDeque::new(); let mut consecutive_truncated_responses = 0; + let mut latest_checkpoint: Option = None; - for _step in 0..max_steps { + 'steps_loop: for _step in 0..max_steps { // Cooperative cancellation: bail if this session's token was cancelled // while we were between steps. Top-level model-visible sub-agents use // a detached token so parent turn cancellation does not stop them. @@ -3959,6 +4135,7 @@ async fn run_subagent( &assignment, &status, None, + latest_checkpoint.as_ref(), &messages, steps, duration_ms, @@ -3982,6 +4159,7 @@ async fn run_subagent( status, result: None, steps_taken: steps, + checkpoint: latest_checkpoint.clone(), duration_ms, from_prior_session: false, }); @@ -4027,6 +4205,17 @@ async fn run_subagent( temperature: None, top_p: None, }; + latest_checkpoint = Some( + checkpoint_subagent_progress( + runtime, + &agent_id, + "before_api_request", + &messages, + steps, + true, + ) + .await, + ); // Race the API call against the cancellation token so a parent // cancel during a long thinking turn doesn't have to wait for the @@ -4053,6 +4242,7 @@ async fn run_subagent( &assignment, &status, None, + latest_checkpoint.as_ref(), &messages, steps, duration_ms, @@ -4071,12 +4261,130 @@ async fn run_subagent( status, result: None, steps_taken: steps, + checkpoint: latest_checkpoint.clone(), duration_ms, from_prior_session: false, }); } api = tokio::time::timeout(runtime.step_api_timeout, runtime.client.create_message(request)) => { - api.map_err(|_| anyhow!("API call timed out after {}s", runtime.step_api_timeout.as_secs()))?? + match api { + Ok(response) => response?, + Err(_) => { + let reason = format!( + "API call timed out after {}ms; checkpoint preserved for continuation", + runtime.step_api_timeout.as_millis() + ); + let checkpoint = checkpoint_subagent_progress( + runtime, + &agent_id, + "api_timeout", + &messages, + steps, + true, + ) + .await; + latest_checkpoint = Some(checkpoint.clone()); + record_agent_progress( + runtime, + &agent_id, + format!("step {steps}/{max_steps}: interrupted; {reason}"), + ); + let status = SubAgentStatus::Interrupted(reason.clone()); + let duration_ms = + u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX); + insert_subagent_full_transcript_handle( + runtime, + &agent_id, + &agent_type, + &assignment, + &status, + Some(&reason), + Some(&checkpoint), + &messages, + steps, + duration_ms, + fork_context_enabled, + ) + .await; + let interrupted_snapshot = { + let mut manager = runtime.manager.write().await; + manager.interrupt_with_checkpoint( + &agent_id, + reason.clone(), + checkpoint.clone(), + )? + }; + + let next_input = tokio::select! { + biased; + () = runtime.cancel_token.cancelled() => { + record_agent_progress( + runtime, + &agent_id, + format!("step {steps}/{max_steps}: cancelled while interrupted"), + ); + if let Some(mb) = runtime.mailbox.as_ref() { + let _ = mb.send(MailboxMessage::Cancelled { + agent_id: agent_id.clone(), + }); + } + let status = SubAgentStatus::Cancelled; + let duration_ms = u64::try_from(started_at.elapsed().as_millis()) + .unwrap_or(u64::MAX); + insert_subagent_full_transcript_handle( + runtime, + &agent_id, + &agent_type, + &assignment, + &status, + None, + latest_checkpoint.as_ref(), + &messages, + steps, + duration_ms, + fork_context_enabled, + ) + .await; + return Ok(SubAgentResult { + name: agent_id.clone(), + agent_id: agent_id.clone(), + context_mode: if fork_context_enabled { "forked" } else { "fresh" }.to_string(), + fork_context: fork_context_enabled, + agent_type: agent_type.clone(), + assignment: assignment.clone(), + model: runtime.model.clone(), + nickname: None, + status, + result: None, + steps_taken: steps, + checkpoint: latest_checkpoint.clone(), + duration_ms, + from_prior_session: false, + }); + } + input = input_rx.recv() => input, + }; + let Some(input) = next_input else { + return Ok(interrupted_snapshot); + }; + if input.interrupt { + pending_inputs.clear(); + } + pending_inputs.push_back(input); + latest_checkpoint = Some( + checkpoint_subagent_progress( + runtime, + &agent_id, + "continued_after_api_timeout", + &messages, + steps, + true, + ) + .await, + ); + continue 'steps_loop; + } + } } }; @@ -4109,6 +4417,17 @@ async fn run_subagent( role: "assistant".to_string(), content: response.content.clone(), }); + latest_checkpoint = Some( + checkpoint_subagent_progress( + runtime, + &agent_id, + "after_model_response", + &messages, + steps, + true, + ) + .await, + ); if response_was_truncated(&response) { final_result = None; @@ -4134,6 +4453,17 @@ async fn run_subagent( truncated_response_tool_results(&tool_uses) }, }); + latest_checkpoint = Some( + checkpoint_subagent_progress( + runtime, + &agent_id, + "after_truncated_response_retry_message", + &messages, + steps, + true, + ) + .await, + ); continue; } reset_truncated_subagent_responses(&mut consecutive_truncated_responses); @@ -4217,12 +4547,30 @@ async fn run_subagent( role: "user".to_string(), content: tool_results, }); + latest_checkpoint = Some( + checkpoint_subagent_progress( + runtime, + &agent_id, + "after_tool_results", + &messages, + steps, + true, + ) + .await, + ); } } release_resident_leases_for(&agent_id); let status = SubAgentStatus::Completed; let duration_ms = u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX); + latest_checkpoint = Some(build_subagent_checkpoint( + &agent_id, + "completed", + &messages, + steps, + false, + )); insert_subagent_full_transcript_handle( runtime, &agent_id, @@ -4230,6 +4578,7 @@ async fn run_subagent( &assignment, &status, final_result.as_ref(), + latest_checkpoint.as_ref(), &messages, steps, duration_ms, @@ -4254,6 +4603,7 @@ async fn run_subagent( status, result: final_result, steps_taken: steps, + checkpoint: latest_checkpoint, duration_ms, from_prior_session: false, }) diff --git a/crates/tui/src/tools/subagent/tests.rs b/crates/tui/src/tools/subagent/tests.rs index 0ed21884..2b536d6e 100644 --- a/crates/tui/src/tools/subagent/tests.rs +++ b/crates/tui/src/tools/subagent/tests.rs @@ -1,4 +1,6 @@ use super::*; +use axum::{Json, Router, routing::post}; +use std::sync::atomic::{AtomicUsize, Ordering}; use tempfile::tempdir; fn make_assignment() -> SubAgentAssignment { @@ -18,11 +20,26 @@ fn make_snapshot(status: SubAgentStatus) -> SubAgentResult { status, result: None, steps_taken: 0, + checkpoint: None, duration_ms: 0, from_prior_session: false, } } +fn text_message(role: &str, text: &str) -> Message { + Message { + role: role.to_string(), + content: vec![ContentBlock::Text { + text: text.to_string(), + cache_control: None, + }], + } +} + +fn make_checkpoint(agent_id: &str, steps_taken: u32, messages: Vec) -> SubAgentCheckpoint { + build_subagent_checkpoint(agent_id, "test_checkpoint", &messages, steps_taken, true) +} + fn message_text(message: &Message) -> &str { match message.content.first() { Some(ContentBlock::Text { text, .. }) => text.as_str(), @@ -30,6 +47,74 @@ fn message_text(message: &Message) -> &str { } } +async fn delayed_chat_client( + first_delay: Duration, + response_text: &str, +) -> ( + DeepSeekClient, + Arc, + Arc>>, +) { + let calls = Arc::new(AtomicUsize::new(0)); + let bodies = Arc::new(std::sync::Mutex::new(Vec::new())); + let response_text = response_text.to_string(); + let app = Router::new().route( + "/{*path}", + post({ + let calls = Arc::clone(&calls); + let bodies = Arc::clone(&bodies); + move |Json(body): Json| { + let calls = Arc::clone(&calls); + let bodies = Arc::clone(&bodies); + let response_text = response_text.clone(); + async move { + let attempt = calls.fetch_add(1, Ordering::SeqCst) + 1; + bodies + .lock() + .expect("request body recorder mutex poisoned") + .push(body); + if attempt == 1 { + tokio::time::sleep(first_delay).await; + } + Json(json!({ + "id": format!("chatcmpl-test-{attempt}"), + "model": "deepseek-v4-flash", + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": response_text + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2 + } + })) + } + } + }), + ); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("bind fake chat server"); + let addr = listener.local_addr().expect("fake chat server addr"); + tokio::spawn(async move { + let _ = axum::serve(listener, app).await; + }); + + let config = crate::config::Config { + api_key: Some("test-key".to_string()), + base_url: Some(format!("http://{addr}/v1")), + ..crate::config::Config::default() + }; + let client = DeepSeekClient::new(&config).expect("fake chat client"); + (client, calls, bodies) +} + fn estimate_tool_description_tokens_conservative(text: &str) -> usize { text.chars().count().div_ceil(3) } @@ -426,6 +511,51 @@ async fn terminal_session_projection_prefers_full_transcript_handle() { assert_eq!(projection.transcript_handle.name, "full_transcript"); } +#[tokio::test] +async fn interrupted_projection_exposes_checkpoint_metadata_and_messages() { + let mut snapshot = make_snapshot(SubAgentStatus::Interrupted( + "API call timed out after 10ms".to_string(), + )); + let checkpoint = make_checkpoint( + &snapshot.agent_id, + 1, + vec![text_message("user", "inspect checkpoint recovery")], + ); + snapshot.steps_taken = checkpoint.steps_taken; + snapshot.checkpoint = Some(checkpoint.clone()); + + let ctx = ToolContext::new("."); + let projection = subagent_session_projection(snapshot, false, &ctx).await; + + assert_eq!(projection.status, "interrupted"); + assert!(projection.terminal); + assert!(projection.continuable); + assert_eq!( + projection + .checkpoint + .as_ref() + .expect("checkpoint projected") + .continuation_handle, + checkpoint.continuation_handle + ); + assert_eq!( + projection + .snapshot + .checkpoint + .as_ref() + .map(|cp| cp.message_count), + Some(1) + ); + assert_eq!( + projection + .checkpoint + .as_ref() + .and_then(|cp| cp.messages.first()) + .map(message_text), + Some("inspect checkpoint recovery") + ); +} + #[test] fn test_delegate_defaults_to_fork_context() { let input = with_default_fork_context(json!({ "prompt": "review current work" }), true); @@ -1089,6 +1219,143 @@ async fn agent_eval_resolves_session_via_agent_name_alias() { assert_eq!(projection.status, "completed"); } +#[tokio::test] +async fn api_timeout_preserves_checkpoint_and_agent_eval_continues_from_it() { + let tmp = tempdir().expect("tempdir"); + let manager = Arc::new(RwLock::new(SubAgentManager::new( + tmp.path().to_path_buf(), + 2, + ))); + let agent_id = "agent_checkpoint_timeout".to_string(); + let (task_input_tx, task_input_rx) = mpsc::unbounded_channel(); + let agent = SubAgent::new( + agent_id.clone(), + SubAgentType::General, + "Inspect checkpoint behavior".to_string(), + make_assignment(), + "deepseek-v4-flash".to_string(), + Some("Blue".to_string()), + Some(vec![]), + task_input_tx, + "boot_test".to_string(), + ); + manager.write().await.agents.insert(agent_id.clone(), agent); + + let (client, calls, bodies) = + delayed_chat_client(Duration::from_millis(80), "resumed answer").await; + let mut runtime = stub_runtime().with_step_api_timeout(Duration::from_millis(10)); + runtime.client = client; + runtime.manager = Arc::clone(&manager); + runtime.context = ToolContext::new(tmp.path()); + + let task = SubAgentTask { + manager_handle: Arc::clone(&manager), + runtime: runtime.clone(), + agent_id: agent_id.clone(), + agent_type: SubAgentType::General, + prompt: "Inspect checkpoint behavior".to_string(), + assignment: make_assignment(), + allowed_tools: Some(vec![]), + fork_context: false, + started_at: Instant::now(), + max_steps: 3, + input_rx: task_input_rx, + }; + let task_handle = tokio::spawn(run_subagent_task(task)); + + let interrupted = tokio::time::timeout(Duration::from_secs(2), async { + loop { + let snapshot = { + let manager = manager.read().await; + manager + .get_result(&agent_id) + .expect("agent should stay registered") + }; + if matches!(snapshot.status, SubAgentStatus::Interrupted(_)) { + return snapshot; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + }) + .await + .expect("agent should become interrupted after API timeout"); + + let checkpoint = interrupted + .checkpoint + .as_ref() + .expect("timeout should preserve checkpoint"); + assert!(checkpoint.continuable); + assert_eq!(checkpoint.steps_taken, 1); + assert!( + checkpoint + .messages + .iter() + .any(|message| message_text(message).contains("Inspect checkpoint behavior")), + "checkpoint should preserve local child prompt: {checkpoint:?}" + ); + + let ctx = runtime.context.clone(); + let tool = AgentEvalTool::new(Arc::clone(&manager)); + let result = tool + .execute(json!({ "agent_id": agent_id, "block": false }), &ctx) + .await + .expect("agent_eval should project interrupted checkpoint"); + let projection: SubAgentSessionProjection = + serde_json::from_str(&result.content).expect("projection deserializes"); + assert_eq!(projection.status, "interrupted"); + assert!(projection.continuable); + assert!(projection.checkpoint.is_some()); + + let result = tool + .execute( + json!({ + "agent_id": agent_id, + "continue": true, + "message": "Please continue with the prior checkpoint.", + "timeout_ms": 2000 + }), + &ctx, + ) + .await + .expect("agent_eval should continue checkpointed interrupted session"); + let meta = result.metadata.expect("metadata present"); + assert_eq!( + meta["message_delivery"]["continued_from_checkpoint"], + json!(true) + ); + let projection: SubAgentSessionProjection = + serde_json::from_str(&result.content).expect("projection deserializes"); + assert_eq!(projection.status, "completed"); + assert_eq!( + projection.snapshot.result.as_deref(), + Some("resumed answer") + ); + assert!( + projection + .checkpoint + .as_ref() + .expect("completed projection keeps latest checkpoint") + .messages + .iter() + .any(|message| message_text(message) + .contains("Please continue with the prior checkpoint.")), + "continuation instruction should be part of resumed transcript" + ); + + task_handle.await.expect("sub-agent task should finish"); + assert!( + calls.load(Ordering::SeqCst) >= 2, + "continuation should make a second API request" + ); + let bodies = bodies + .lock() + .expect("request body recorder mutex poisoned") + .clone(); + let second_request = serde_json::to_string(&bodies[1]).expect("second request body serializes"); + assert!(second_request.contains("Inspect checkpoint behavior")); + assert!(second_request.contains("Please continue with the prior checkpoint.")); +} + #[tokio::test] async fn spawn_duplicate_session_name_error_names_conflicting_agent() { // #2656: the duplicate-name error must identify the conflicting agent so a @@ -1468,6 +1735,52 @@ fn test_persist_and_reload_marks_running_agent_as_interrupted() { )); } +#[test] +fn persist_and_reload_preserves_checkpoint_for_interrupted_running_agent() { + let tmp = tempdir().expect("tempdir"); + let workspace = tmp.path().to_path_buf(); + let state_path = default_state_path(tmp.path()); + + let mut manager = SubAgentManager::new(workspace.clone(), 2).with_state_path(state_path); + let (input_tx, _input_rx) = mpsc::unbounded_channel(); + let mut running = SubAgent::new( + "test_agent_checkpoint_reload".to_string(), + SubAgentType::General, + "work".to_string(), + make_assignment(), + "deepseek-v4-flash".to_string(), + Some("Blue".to_string()), + Some(vec!["read_file".to_string()]), + input_tx, + "boot_test".to_string(), + ); + running.checkpoint = Some(make_checkpoint( + &running.id, + 2, + vec![ + text_message("user", "initial task"), + text_message("assistant", "partial progress"), + ], + )); + let running_id = running.id.clone(); + manager.agents.insert(running_id.clone(), running); + manager.persist_state().expect("persist state"); + + let mut reloaded = + SubAgentManager::new(workspace, 2).with_state_path(default_state_path(tmp.path())); + reloaded.load_state().expect("load state"); + let snapshot = reloaded + .get_result(&running_id) + .expect("reloaded agent should exist"); + + assert!(matches!(snapshot.status, SubAgentStatus::Interrupted(_))); + let checkpoint = snapshot.checkpoint.expect("checkpoint should reload"); + assert!(checkpoint.continuable); + assert_eq!(checkpoint.steps_taken, 2); + assert_eq!(checkpoint.messages.len(), 2); + assert_eq!(message_text(&checkpoint.messages[1]), "partial progress"); +} + #[test] fn test_interrupted_status_name_and_summary() { let snapshot = make_snapshot(SubAgentStatus::Interrupted( diff --git a/crates/tui/src/tui/ui/tests.rs b/crates/tui/src/tui/ui/tests.rs index 38d3c650..2c100a5c 100644 --- a/crates/tui/src/tui/ui/tests.rs +++ b/crates/tui/src/tui/ui/tests.rs @@ -3112,6 +3112,7 @@ fn make_subagent( status, result: None, steps_taken: 0, + checkpoint: None, duration_ms: 0, from_prior_session: false, } diff --git a/crates/tui/src/tui/views/mod.rs b/crates/tui/src/tui/views/mod.rs index d041885a..042357e8 100644 --- a/crates/tui/src/tui/views/mod.rs +++ b/crates/tui/src/tui/views/mod.rs @@ -1778,6 +1778,7 @@ fn live_subagent_result( status, result: None, steps_taken: 0, + checkpoint: None, duration_ms: 0, from_prior_session: false, } @@ -2276,6 +2277,7 @@ mod tests { status, result: None, steps_taken: 1, + checkpoint: None, duration_ms: 10, from_prior_session: false, } diff --git a/docs/V0_9_0_EXECUTION_MAP.md b/docs/V0_9_0_EXECUTION_MAP.md index 3d009c38..678e52cc 100644 --- a/docs/V0_9_0_EXECUTION_MAP.md +++ b/docs/V0_9_0_EXECUTION_MAP.md @@ -52,6 +52,7 @@ harvest/stewardship commits: | #2738 dense tool-call transcript collapse | Locally harvested with expansion, cache-key, and safety fixes. | Successful read/search/list-style tool runs collapse by default once they cross the density threshold; failures, running cells, shell/exec, patch/write/edit/delete, diff preview, plan update, and review cells stay visible. Users can expand a group with Enter/Space/mouse and can set `tool_collapse = "compact" | "expanded" | "calm"`. Credit @idling11 and issue #2692; comment/close the original after the integration branch is public. | | #2734 sidebar detail popovers | Locally harvested as the mouse-hover slice for #2694. | Work/Tasks/Agents hover metadata now stores row hitboxes, compact display text, and full source text so truncated checklist items, task/turn ids, and sub-agent ids/progress expand into a bordered wrapping popover. The harvest fixes reviewer risks from the PR by treating row metadata as authoritative, sizing by display width instead of bytes, and keeping source text untruncated. `cargo test -p codewhale-tui --bin codewhale-tui --locked sidebar_hover -- --nocapture`, `... work_hover_text_preserves_full_checklist_item ...`, and `... subagent_hover_text_preserves_full_agent_id_and_progress ...` passed. Credit @idling11; keep #2694 open for keyboard access, richer Work/Tasks/Agents metadata, redaction expansion, and clipping/snapshot coverage. | | #2532 pending-input delivery-mode labels | Locally re-harvested for #2054. | Pending-input preview rows now label steer-pending, rejected-steer, and queued-follow-up delivery modes, and wrapped continuation rows align under the label. `cargo test -p codewhale-tui --bin codewhale-tui --locked pending_input_preview -- --nocapture` passed. Credit @cyq1017; #2054 remains open for cancel/edit-mode affordance clarity. | +| #2029 sub-agent checkpoint continuation | Locally implemented as the live-timeout recovery slice. | Sub-agents now persist `SubAgentCheckpoint` metadata through state, results, projections, and transcript handles. The runner checkpoints local messages before API calls and after model/tool cycles; per-step API timeout marks the child interrupted with `continuable=true`; `agent_eval { continue: true }` resumes only live checkpointed interrupted children. Reload preserves checkpoint metadata, but cold-restart continuation is intentionally not claimed because the child task/input channel is not rehydrated yet. `cargo test -p codewhale-tui --bin codewhale-tui --locked subagent -- --nocapture`, `cargo fmt --all -- --check`, `git diff --check`, and `cargo clippy -p codewhale-tui --locked -- -D warnings` passed. Credit @qiyuanlicn for the recovery report; keep #2029 open only if cold-restart continuation or broader checkpoint UX remains required. | | #697/#1827 bounded auto-generated project context | Locally implemented from the stabilization audit. | When no project instructions exist, startup now writes `.codewhale/instructions.md` from the bounded Project Context Pack data instead of an unbounded summary/tree scan. The generated file avoids the dynamic `` marker when that setting is disabled, keeps later top-level folders visible, and omits noisy directory tails. `cargo test -p codewhale-tui --bin codewhale-tui --locked auto_generated_context_is_bounded_for_many_file_workspace -- --nocapture` and `cargo test -p codewhale-tui --bin codewhale-tui --locked project_context_pack -- --nocapture` passed. Credit reporters @NASLXTO and @wuxixing, plus earlier context-cap/startup work from @linzhiqin2003 and @merchloubna70-dot; leave #697/#1827 open pending real massive-repo/manual startup verification. | | #2636 project-context mtime cache | Defer direct merge; harvest only after cache key/signature is widened. | Must include constitution changes, auto-generated context deletion, canonical path equivalence, and overwrite detection before landing. | | #2634 HarmonyOS port | Locally harvested with additional Nix-chain clearance; keep credited and do not close until the integration branch is public. | User-supplied MatePad Edge demo (`https://bilibili.com/video/av116689597368905`) confirms real-device interest. Added env-driven OpenHarmony SDK setup, OHOS platform guards/fallbacks, self-update disablement, and OHOS target gating for Starlark execpolicy parsing plus PTY support so published OHOS builds do not pull `nix` 0.28 through `rustyline` or `portable-pty`. `cargo check --workspace --all-features --locked`, focused PTY/clipboard tests, and `cargo tree --locked -p codewhale-tui --target aarch64-unknown-linux-ohos -i nix@0.28.0` passed; full OHOS target check is blocked on this host because `OHOS_NATIVE_SDK`/target CC/sysroot are not configured and `ring` cannot find `assert.h`. | @@ -75,7 +76,7 @@ v0.9 branch so the remaining Windows/manual checks are explicit. | Windows shell descendant hangs (#2498, #1812 class) | Partially fixed and already harvested. | Foreground orphan-pipe regression passed locally with `cargo test -p codewhale-tui --all-features --locked foreground_shell_does_not_block_on_orphaned_subprocess_pipe -- --nocapture`. PR #2498 should close as harvested, but #1812 remains open for broader input-poll freeze modes and Windows CI/manual confirmation. | | Large-repo context startup (#697/#1827 class) | Partially covered. | Project-context pack ordering/budget/noise tests passed, and the auto-generated fallback now has a synthetic 1000-file startup smoke with `cargo test -p codewhale-tui --bin codewhale-tui --locked auto_generated_context_is_bounded_for_many_file_workspace -- --nocapture`. Still needs a real massive-repo/manual startup benchmark before closing #697 or #1827. | | Sub-agent timeout and trust model (#1806, #719) | Fixed or covered in current branch. | `heartbeat_timeout_secs` clamp/default test passed, and `agent_open_description_explains_fresh_vs_forked_context_and_trust_model` asserts that sub-agent results are self-reports. | -| Sub-agent checkpoint/resume (#2029) | Still release-blocking. | Session projection/transcript handles exist, but no checkpoint/continue status or resume contract has landed. Needs a child checkpoint/timeout/resume test that preserves policy and completes. | +| Sub-agent checkpoint/resume (#2029) | Partially covered. | Live per-step API timeout now preserves a continuable checkpoint and `agent_eval { continue: true }` resumes the parked child; `cargo test -p codewhale-tui --bin codewhale-tui --locked subagent -- --nocapture` passed with checkpoint/projection/persistence/continuation coverage. Cold-restart continuation is not implemented because persisted child tasks are not rehydrated; decide whether #2029 can close as live-timeout recovery or should remain open for restart-resume UX. | | Live shell/session liveness (#1786) | Partially fixed, still release-blocking. | Shell containment and turn-liveness tests exist, but orphaned PID/session-load reaping and long-running shell LIVE-state recovery remain open. Needs stale PID reaping and live-state regression coverage. | | Queued/live input feedback (#2054) | Partially covered; UX clarity still blocking. | Queued-message recovery/editing and pending-input delivery-mode labels are covered by `queued` and `pending_input_preview` focused tests. Still needs cancel/edit-mode affordance clarity and a repro for accidentally entering queued-draft edit while a turn is loading. | | Prompt/UI calmness (#1191) | Defer or narrow. | No release-blocking regression evidence yet; keep as polish unless a current user-facing prompt/UI failure is identified. |