fix(tasks): fail stale running tasks after restart
Refs #1786. Reported by @bevis-wong. This lands the durable restart-safety slice: persisted running tasks and running tool rows are marked failed with a recovery note instead of being requeued as live work after a prior process exits.
This commit is contained in:
@@ -80,6 +80,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
checkpoint instead of ending as a null failed result, and `agent_eval` can
|
||||
explicitly continue a live checkpointed interrupted child while normal
|
||||
completed/failed/cancelled follow-up behavior stays unchanged (#2029).
|
||||
- Durable task recovery no longer requeues tasks that were `running` when the
|
||||
previous CodeWhale process exited. On restart those records are marked failed
|
||||
with a recovery note, and any running tool-call summaries are marked failed
|
||||
too, so stale shell/task state cannot silently become live work again (#1786).
|
||||
- Auto-generated project instructions now reuse the bounded Project Context
|
||||
Pack data instead of running an unbounded summary/tree scan when no
|
||||
`.codewhale/instructions.md` file exists. The fallback keeps later
|
||||
@@ -104,6 +108,7 @@ dense tool-call transcript collapse/sidebar detail direction (#2738, #2734,
|
||||
**@h3c-hexin** for the tool-agent model inheritance and configured
|
||||
`skills_dir` fixes (#2736, #2737). Thanks also to **@qiyuanlicn** for the
|
||||
checkpoint/resume report that shaped the sub-agent recovery slice (#2029),
|
||||
to **@bevis-wong** for the long-running shell/task liveness report (#1786),
|
||||
and to **@NASLXTO** and
|
||||
**@wuxixing** for the large-workspace startup reports (#697, #1827), and to
|
||||
**@linzhiqin2003** and **@merchloubna70-dot** for earlier context-cap and
|
||||
|
||||
@@ -1512,14 +1512,34 @@ fn load_state(
|
||||
);
|
||||
}
|
||||
if task.status == TaskStatus::Running {
|
||||
task.status = TaskStatus::Queued;
|
||||
task.started_at = None;
|
||||
task.ended_at = None;
|
||||
task.duration_ms = None;
|
||||
let now = Utc::now();
|
||||
let duration_ms = task.started_at.and_then(|started| {
|
||||
u64::try_from(now.signed_duration_since(started).num_milliseconds()).ok()
|
||||
});
|
||||
task.status = TaskStatus::Failed;
|
||||
task.ended_at = Some(now);
|
||||
task.duration_ms = duration_ms;
|
||||
task.error = Some(
|
||||
"Interrupted by process restart; prior process is not attached".to_string(),
|
||||
);
|
||||
for tool in &mut task.tool_calls {
|
||||
if tool.status == TaskToolStatus::Running {
|
||||
tool.status = TaskToolStatus::Failed;
|
||||
tool.ended_at = Some(now);
|
||||
tool.duration_ms = duration_ms.or_else(|| {
|
||||
u64::try_from(
|
||||
now.signed_duration_since(tool.started_at)
|
||||
.num_milliseconds(),
|
||||
)
|
||||
.ok()
|
||||
});
|
||||
}
|
||||
}
|
||||
task.timeline.push(TaskTimelineEntry {
|
||||
timestamp: Utc::now(),
|
||||
timestamp: now,
|
||||
kind: "recovered".to_string(),
|
||||
summary: "Recovered from restart and re-queued".to_string(),
|
||||
summary: "Interrupted by process restart; prior process is not attached"
|
||||
.to_string(),
|
||||
detail_path: None,
|
||||
});
|
||||
}
|
||||
@@ -1790,6 +1810,98 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Regression test for #1786: a task persisted as `Running` by a previous
/// process must come back from `load_state` as `Failed`, must not be placed
/// back on the queue, and must carry an explanation of why it is terminal.
///
/// The fixture is one task record (with one `Running` tool call) written to a
/// unique directory under the system temp dir, plus a queue file that still
/// lists the task id.
#[test]
fn running_tasks_are_not_requeued_after_restart() -> Result<()> {
    let root = std::env::temp_dir().join(format!("deepseek-task-test-{}", Uuid::new_v4()));
    let tasks_dir = root.join("tasks");
    fs::create_dir_all(&tasks_dir)?;
    let queue_path = root.join("queue.json");
    let task_id = "task_stale_running".to_string();
    // Backdate the start so the recovered duration is computed from a
    // timestamp that is clearly in the past.
    let started_at = Utc::now() - chrono::Duration::seconds(30);
    let task = TaskRecord {
        schema_version: CURRENT_TASK_SCHEMA_VERSION,
        id: task_id.clone(),
        prompt: "long-running shell work".to_string(),
        model: "deepseek-v4-flash".to_string(),
        workspace: PathBuf::from("."),
        mode: "agent".to_string(),
        allow_shell: true,
        trust_mode: false,
        auto_approve: false,
        status: TaskStatus::Running,
        created_at: started_at,
        started_at: Some(started_at),
        ended_at: None,
        duration_ms: None,
        result_summary: None,
        result_detail_path: None,
        error: None,
        thread_id: Some("thr_stale".to_string()),
        turn_id: Some("turn_stale".to_string()),
        runtime_event_count: 0,
        checklist: TaskChecklistState::default(),
        gates: Vec::new(),
        attempts: Vec::new(),
        artifacts: Vec::new(),
        github_events: Vec::new(),
        tool_calls: vec![TaskToolCallSummary {
            id: "tool_shell".to_string(),
            name: "task_shell_start".to_string(),
            status: TaskToolStatus::Running,
            started_at,
            ended_at: None,
            duration_ms: None,
            input_summary: Some("shell: sleep 999".to_string()),
            output_summary: None,
            detail_path: None,
            patch_ref: None,
        }],
        timeline: vec![TaskTimelineEntry {
            timestamp: started_at,
            kind: "running".to_string(),
            summary: "Task started".to_string(),
            detail_path: None,
        }],
    };
    fs::write(
        tasks_dir.join(format!("{task_id}.json")),
        serde_json::to_string_pretty(&task)?,
    )?;
    fs::write(
        &queue_path,
        serde_json::to_string_pretty(&QueueFile {
            queue: vec![task_id.clone()],
        })?,
    )?;

    let loaded = load_state(&tasks_dir, &queue_path);
    // Best-effort cleanup, done before any assertion can panic so a failing
    // run does not leave the fixture directory behind in the temp dir.
    let _ = fs::remove_dir_all(&root);
    let (tasks, queue) = loaded?;
    let recovered = tasks.get(&task_id).expect("task loaded");

    assert!(queue.is_empty(), "stale running task must not be requeued");
    assert_eq!(recovered.status, TaskStatus::Failed);
    assert!(
        recovered
            .error
            .as_deref()
            .is_some_and(|err| err.contains("prior process is not attached")),
        "recovered task should explain stale process ownership: {recovered:?}"
    );
    assert!(recovered.ended_at.is_some());
    assert!(recovered.duration_ms.is_some());
    assert_eq!(recovered.tool_calls[0].status, TaskToolStatus::Failed);
    assert!(recovered.tool_calls[0].ended_at.is_some());
    // The recovery path also stamps a duration on the stale tool row.
    assert!(
        recovered.tool_calls[0].duration_ms.is_some(),
        "stale running tool call should record a duration: {:?}",
        recovered.tool_calls[0].duration_ms
    );
    assert!(
        recovered
            .timeline
            .iter()
            .any(|entry| entry.kind == "recovered"
                && entry.summary.contains("prior process is not attached")),
        "recovery timeline should explain why the task is terminal: {:?}",
        recovered.timeline
    );
    Ok(())
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn default_workspace_updates_for_future_tasks() -> Result<()> {
|
||||
let root = std::env::temp_dir().join(format!("deepseek-task-test-{}", Uuid::new_v4()));
|
||||
|
||||
@@ -53,6 +53,7 @@ harvest/stewardship commits:
|
||||
| #2734 sidebar detail popovers | Locally harvested as the mouse-hover slice for #2694. | Work/Tasks/Agents hover metadata now stores row hitboxes, compact display text, and full source text so truncated checklist items, task/turn ids, and sub-agent ids/progress expand into a bordered wrapping popover. The harvest fixes reviewer risks from the PR by treating row metadata as authoritative, sizing by display width instead of bytes, and keeping source text untruncated. `cargo test -p codewhale-tui --bin codewhale-tui --locked sidebar_hover -- --nocapture`, `... work_hover_text_preserves_full_checklist_item ...`, and `... subagent_hover_text_preserves_full_agent_id_and_progress ...` passed. Credit @idling11; keep #2694 open for keyboard access, richer Work/Tasks/Agents metadata, redaction expansion, and clipping/snapshot coverage. |
|
||||
| #2532 pending-input delivery-mode labels | Locally re-harvested for #2054. | Pending-input preview rows now label steer-pending, rejected-steer, and queued-follow-up delivery modes, and wrapped continuation rows align under the label. `cargo test -p codewhale-tui --bin codewhale-tui --locked pending_input_preview -- --nocapture` passed. Credit @cyq1017; #2054 remains open for cancel/edit-mode affordance clarity. |
|
||||
| #2029 sub-agent checkpoint continuation | Locally implemented as the live-timeout recovery slice. | Sub-agents now persist `SubAgentCheckpoint` metadata through state, results, projections, and transcript handles. The runner checkpoints local messages before API calls and after model/tool cycles; per-step API timeout marks the child interrupted with `continuable=true`; `agent_eval { continue: true }` resumes only live checkpointed interrupted children. Reload preserves checkpoint metadata, but cold-restart continuation is intentionally not claimed because the child task/input channel is not rehydrated yet. `cargo test -p codewhale-tui --bin codewhale-tui --locked subagent -- --nocapture`, `cargo fmt --all -- --check`, `git diff --check`, and `cargo clippy -p codewhale-tui --locked -- -D warnings` passed. Credit @qiyuanlicn for the recovery report; keep #2029 open only if cold-restart continuation or broader checkpoint UX remains required. |
|
||||
| #1786 stale running task recovery | Locally implemented as the durable restart-safety slice. | `TaskManager::load_state` now marks tasks that were persisted as `running` in a prior process as failed with an explicit restart/interrupted error instead of requeueing them. Running tool-call summaries inside those stale tasks are also marked failed. `cargo test -p codewhale-tui --bin codewhale-tui --locked running_tasks_are_not_requeued_after_restart -- --nocapture` and `cargo test -p codewhale-tui --bin codewhale-tui --locked task_manager -- --nocapture` passed. Credit @bevis-wong; keep #1786 open for foreground shell hang root cause and careful LIVE-state watchdog work that does not abort legitimate foreground commands. |
|
||||
| #697/#1827 bounded auto-generated project context | Locally implemented from the stabilization audit. | When no project instructions exist, startup now writes `.codewhale/instructions.md` from the bounded Project Context Pack data instead of an unbounded summary/tree scan. The generated file avoids the dynamic `<project_context_pack>` marker when that setting is disabled, keeps later top-level folders visible, and omits noisy directory tails. `cargo test -p codewhale-tui --bin codewhale-tui --locked auto_generated_context_is_bounded_for_many_file_workspace -- --nocapture` and `cargo test -p codewhale-tui --bin codewhale-tui --locked project_context_pack -- --nocapture` passed. Credit reporters @NASLXTO and @wuxixing, plus earlier context-cap/startup work from @linzhiqin2003 and @merchloubna70-dot; leave #697/#1827 open pending real massive-repo/manual startup verification. |
|
||||
| #2636 project-context mtime cache | Defer direct merge; harvest only after cache key/signature is widened. | Must include constitution changes, auto-generated context deletion, canonical path equivalence, and overwrite detection before landing. |
|
||||
| #2634 HarmonyOS port | Locally harvested with additional Nix-chain clearance; keep credited and do not close until the integration branch is public. | User-supplied MatePad Edge demo (`https://bilibili.com/video/av116689597368905`) confirms real-device interest. Added env-driven OpenHarmony SDK setup, OHOS platform guards/fallbacks, self-update disablement, and OHOS target gating for Starlark execpolicy parsing plus PTY support so published OHOS builds do not pull `nix` 0.28 through `rustyline` or `portable-pty`. `cargo check --workspace --all-features --locked`, focused PTY/clipboard tests, and `cargo tree --locked -p codewhale-tui --target aarch64-unknown-linux-ohos -i nix@0.28.0` passed; full OHOS target check is blocked on this host because `OHOS_NATIVE_SDK`/target CC/sysroot are not configured and `ring` cannot find `assert.h`. |
|
||||
@@ -77,7 +78,7 @@ v0.9 branch so the remaining Windows/manual checks are explicit.
|
||||
| Large-repo context startup (#697/#1827 class) | Partially covered. | Project-context pack ordering/budget/noise tests passed, and the auto-generated fallback now has a synthetic 1000-file startup smoke with `cargo test -p codewhale-tui --bin codewhale-tui --locked auto_generated_context_is_bounded_for_many_file_workspace -- --nocapture`. Still needs a real massive-repo/manual startup benchmark before closing #697 or #1827. |
|
||||
| Sub-agent timeout and trust model (#1806, #719) | Fixed or covered in current branch. | `heartbeat_timeout_secs` clamp/default test passed, and `agent_open_description_explains_fresh_vs_forked_context_and_trust_model` asserts that sub-agent results are self-reports. |
|
||||
| Sub-agent checkpoint/resume (#2029) | Partially covered. | Live per-step API timeout now preserves a continuable checkpoint and `agent_eval { continue: true }` resumes the parked child; `cargo test -p codewhale-tui --bin codewhale-tui --locked subagent -- --nocapture` passed with checkpoint/projection/persistence/continuation coverage. Cold-restart continuation is not implemented because persisted child tasks are not rehydrated; decide whether #2029 can close as live-timeout recovery or should remain open for restart-resume UX. |
|
||||
| Live shell/session liveness (#1786) | Partially fixed, still release-blocking. | Shell containment and turn-liveness tests exist, but orphaned PID/session-load reaping and long-running shell LIVE-state recovery remain open. Needs stale PID reaping and live-state regression coverage. |
|
||||
| Live shell/session liveness (#1786) | Partially fixed, still release-blocking. | Durable task restart recovery now fails stale persisted `running` tasks instead of requeueing them, covered by `running_tasks_are_not_requeued_after_restart` and broader `task_manager` tests. Foreground shell hang root cause and LIVE-state watchdog recovery remain open; avoid aborting legitimate foreground `exec_shell` commands while adding stale-card recovery. |
|
||||
| Queued/live input feedback (#2054) | Partially covered; UX clarity still blocking. | Queued-message recovery/editing and pending-input delivery-mode labels are covered by `queued` and `pending_input_preview` focused tests. Still needs cancel/edit-mode affordance clarity and a repro for accidentally entering queued-draft edit while a turn is loading. |
|
||||
| Prompt/UI calmness (#1191) | Defer or narrow. | No release-blocking regression evidence yet; keep as polish unless a current user-facing prompt/UI failure is identified. |
|
||||
|
||||
|
||||
Reference in New Issue
Block a user