fix(engine): recover from turn panics instead of killing the loop (#2583, #1269)

A panic inside `handle_deepseek_turn` unwound through `engine.run()` and was
caught by `spawn_supervised("engine-event-loop")`, which wrote a crash dump
and let the whole engine task exit. The UI never received `TurnComplete`, so
it sat on "working" forever and every subsequent turn was dead too — matching
the user reports of "the engine have stopped" and of the UI stuck on "working".

Wrap the turn call in `catch_unwind` so a panic now surfaces as a failed
`TurnComplete` (with a clear, actionable message) and the engine keeps
running. The crash dump is still written via a new `record_caught_panic`
helper so maintainers retain the `~/.codewhale/crashes/` diagnostics.

Also dedupes the panic-message extraction in `spawn_supervised` /
`spawn_blocking_supervised` into a shared `panic_message` helper.

https://claude.ai/code/session_01MQrnh6wHfrEYN5BBdMarC1
This commit is contained in:
Claude
2026-06-03 01:08:30 +00:00
parent fd69f4c806
commit 5249723e18
2 changed files with 57 additions and 24 deletions
+29 -10
View File
@@ -1679,16 +1679,35 @@ impl Engine {
.as_ref()
.map(|client| client.base_url().to_string());
// Main turn loop
let (status, error) = self
.handle_deepseek_turn(
&mut turn,
tool_registry.as_ref(),
tools,
mode,
force_update_plan_first,
)
.await;
// Main turn loop. Catch panics here so an internal error surfaces as a
// failed TurnComplete instead of unwinding through `engine.run()` and
// killing the whole engine-event-loop task — which left the UI stuck
// on "working" forever with the engine silently dead (#2583, #1269).
use futures_util::FutureExt as _;
let turn_result = std::panic::AssertUnwindSafe(self.handle_deepseek_turn(
&mut turn,
tool_registry.as_ref(),
tools,
mode,
force_update_plan_first,
))
.catch_unwind()
.await;
let (status, error) = match turn_result {
Ok(outcome) => outcome,
Err(panic) => {
let detail = crate::utils::panic_message(&*panic);
crate::utils::record_caught_panic("engine-event-loop", &detail);
(
TurnOutcomeStatus::Failed,
Some(format!(
"The engine hit an internal error and stopped this turn: {detail}. \
Your session is intact — send your message again to retry. \
A crash report was saved to ~/.codewhale/crashes/."
)),
)
}
};
// Update session usage
self.session.total_usage.add(&turn.usage);
+28 -14
View File
@@ -282,13 +282,7 @@ where
use futures_util::FutureExt;
let result = std::panic::AssertUnwindSafe(future).catch_unwind().await;
if let Err(panic_info) = result {
let msg = if let Some(s) = panic_info.downcast_ref::<&str>() {
s.to_string()
} else if let Some(s) = panic_info.downcast_ref::<String>() {
s.clone()
} else {
"unknown panic".to_string()
};
let msg = panic_message(&*panic_info);
tracing::error!(
target: "panic",
"Task '{name}' panicked at {}: {msg}",
@@ -300,6 +294,32 @@ where
})
}
/// Turn a caught panic payload (the `Err` value produced by `catch_unwind`)
/// into displayable text.
///
/// The payload is probed for the two types that carry text, in this order:
///
/// 1. `&str` — the text is copied into a new `String`;
/// 2. `String` — the owned text is cloned.
///
/// Any other payload type yields the fixed fallback `"unknown panic"`.
#[must_use]
pub fn panic_message(panic: &(dyn std::any::Any + Send)) -> String {
    panic
        .downcast_ref::<&str>()
        .map(|text| (*text).to_string())
        .or_else(|| panic.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic".to_string())
}
/// Record a panic that was caught at a call site (via `catch_unwind`) rather
/// than by a task supervisor.
///
/// Emits an error-level `tracing` event on the `panic` target and then passes
/// the same details to `write_panic_dump`, which writes a dump file to
/// `~/.codewhale/crashes/`. Diagnostics therefore land in the same place
/// `spawn_supervised` writes them even when the caller recovers and keeps
/// running.
///
/// # Arguments
///
/// * `name` - label of the task or loop that panicked; it appears in the log
///   line and is forwarded to the dump writer.
/// * `message` - human-readable panic text (for example the output of
///   `panic_message`).
///
/// # Notes
///
/// Because of `#[track_caller]`, the recorded location is the place where
/// this function was called (the recovery site), not the place where the
/// panic was originally raised.
#[track_caller]
pub fn record_caught_panic(name: &'static str, message: &str) {
// Resolved through `#[track_caller]`: this is the caller's source location.
let location = std::panic::Location::caller();
tracing::error!(target: "panic", "Task '{name}' panicked at {location}: {message}");
// Best-effort: whatever `write_panic_dump` returns is deliberately discarded,
// so a failed dump is not reported to the caller.
let _ = write_panic_dump(name, location, message);
}
/// Write a panic dump file to `~/.codewhale/crashes/`.
///
/// Creates the directory if needed and writes a timestamped log
@@ -362,13 +382,7 @@ where
tokio::task::spawn_blocking(move || {
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(f));
if let Err(panic_info) = result {
let msg = if let Some(s) = panic_info.downcast_ref::<&str>() {
s.to_string()
} else if let Some(s) = panic_info.downcast_ref::<String>() {
s.clone()
} else {
"unknown panic".to_string()
};
let msg = panic_message(&*panic_info);
tracing::error!(
target: "panic",
"Blocking task '{name}' panicked at {location}: {msg}",