From 899c703d811e88f17a4647156d0c2f5fade7bfcb Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Sat, 2 May 2026 10:02:28 -0500 Subject: [PATCH] fix(tui): convert remaining tokio::spawn sites to spawn_supervised + restore terminal on panic (#346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the panic-safety work #346 started in a8be33b3. Converts every trivial production tokio::spawn site to spawn_supervised so a panicking task writes a crash dump to ~/.deepseek/crashes/ and the parent process stays alive. Sites converted: - tools/rlm.rs:190 — RLM progress drain - tools/subagent/mod.rs:888 — run_subagent_task spawn - tools/subagent/mod.rs:988 — run_subagent_task resume - core/engine.rs:744 — sub-agent mailbox drainer - core/engine.rs:1601 — engine event-loop spawn - lsp/client.rs:127 — LSP writer - lsp/client.rs:129 — LSP reader - lsp/client.rs:135 — LSP dispatcher - rlm/bridge.rs:188 — bridge progress drain - task_manager.rs:790 — task worker loop - automation_manager.rs:822 — automation scheduler Sites left as-is (already panic-safe with their own catch_unwind): - runtime_threads.rs:1242, 1462 — custom AssertUnwindSafe + catch_unwind - mcp.rs:322 — MCP SSE loop with custom catch_unwind Sites that don't need conversion: - runtime_api.rs:287 — axum::serve runs in the parent task, not spawned - runtime_api.rs:1583+ — test-helper spawn_test_server inside #[cfg(test)] - All other spawn calls are in #[cfg(test)] modules where panics are expected to propagate. Also: - main.rs panic hook now restores the terminal (LeaveAlternateScreen + disable_raw_mode) before invoking the original hook, so a panicked TUI doesn't leave the user's shell stuck in alt-screen mode. - Adds spawn_supervised_tests::panicking_task_writes_crash_dump_and_does_not_kill_parent that proves a panicking task produces a dated crash log under ~/.deepseek/crashes/.log and the parent task completes Ok. Closes #346. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/automation_manager.rs | 9 +++- crates/tui/src/core/engine.rs | 41 +++++++++------- crates/tui/src/lsp/client.rs | 19 ++++++-- crates/tui/src/main.rs | 11 ++++- crates/tui/src/rlm/bridge.rs | 7 ++- crates/tui/src/task_manager.rs | 11 +++-- crates/tui/src/tools/rlm.rs | 7 ++- crates/tui/src/tools/subagent/mod.rs | 13 +++++- crates/tui/src/utils.rs | 70 ++++++++++++++++++++++++++++ 9 files changed, 159 insertions(+), 29 deletions(-) diff --git a/crates/tui/src/automation_manager.rs b/crates/tui/src/automation_manager.rs index 290d56e0..5aaf85dc 100644 --- a/crates/tui/src/automation_manager.rs +++ b/crates/tui/src/automation_manager.rs @@ -18,6 +18,7 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use crate::task_manager::{NewTaskRequest, SharedTaskManager, TaskStatus}; +use crate::utils::spawn_supervised; const CURRENT_AUTOMATION_SCHEMA_VERSION: u32 = 1; const CURRENT_RUN_SCHEMA_VERSION: u32 = 1; @@ -819,7 +820,10 @@ pub fn spawn_scheduler( cancel: CancellationToken, config: AutomationSchedulerConfig, ) -> tokio::task::JoinHandle<()> { - tokio::spawn(async move { + spawn_supervised( + "automation-scheduler", + std::panic::Location::caller(), + async move { let interval = config.tick_interval_secs.max(5); loop { if cancel.is_cancelled() { @@ -841,7 +845,8 @@ pub fn spawn_scheduler( _ = sleep(std::time::Duration::from_secs(interval)) => {} } } - }) + }, + ) } #[cfg(test)] diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 71024d72..caf7dc24 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -51,6 +51,7 @@ use crate::tools::todo::{SharedTodoList, new_shared_todo_list}; use crate::tools::user_input::{UserInputRequest, UserInputResponse}; use crate::tools::{ToolContext, ToolRegistryBuilder}; use crate::tui::app::AppMode; +use crate::utils::spawn_supervised; use super::capacity::{ CapacityController, CapacityControllerConfig, CapacityDecision, CapacityObservationInput, @@ -741,20 +742,24 @@ impl Engine { let cancel_token = self.cancel_token.child_token(); let (mailbox, mut receiver) = Mailbox::new(cancel_token.clone()); let tx_event_clone = self.tx_event.clone(); - tokio::spawn(async move { - while let Some(envelope) = receiver.recv().await { - if tx_event_clone - .send(Event::SubAgentMailbox { - seq: envelope.seq, - message: envelope.message, - }) - .await - .is_err() - { - break; + spawn_supervised( + "subagent-mailbox-drainer", + std::panic::Location::caller(), + async move { + while let Some(envelope) = receiver.recv().await { + if tx_event_clone + .send(Event::SubAgentMailbox { + seq: envelope.seq, + message: envelope.message, + }) + .await + .is_err() + { + break; + } } - } - }); + }, + ); Some((mailbox, cancel_token)) } else { None @@ -1598,9 +1603,13 @@ impl Engine { pub fn spawn_engine(config: EngineConfig, api_config: &Config) -> EngineHandle { let (engine, handle) = Engine::new(config, api_config); - tokio::spawn(async move { - engine.run().await; - }); + spawn_supervised( + "engine-event-loop", + std::panic::Location::caller(), + async move { + engine.run().await; + }, + ); handle } diff --git a/crates/tui/src/lsp/client.rs b/crates/tui/src/lsp/client.rs index 9db64635..7cd10e6b 100644 --- a/crates/tui/src/lsp/client.rs +++ b/crates/tui/src/lsp/client.rs @@ -40,6 +40,7 @@ use tokio::time::timeout; use super::diagnostics::{Diagnostic, Severity}; use super::registry::Language; +use crate::utils::spawn_supervised; /// Trait the LSP manager talks to. A real LSP server speaks this via stdio; /// tests use an in-process fake. @@ -124,15 +125,27 @@ impl StdioLspTransport { let (tx_diag, rx_diag) = mpsc::channel::<(PathBuf, Vec)>(64); // Writer task: drain outbound channel, frame with Content-Length, write to stdin. - tokio::spawn(writer_task(stdin, rx_outbound)); + spawn_supervised( + "lsp-writer", + std::panic::Location::caller(), + writer_task(stdin, rx_outbound), + ); // Reader task: parse Content-Length frames from stdout, push to inbound queue. - tokio::spawn(reader_task(stdout, tx_inbound)); + spawn_supervised( + "lsp-reader", + std::panic::Location::caller(), + reader_task(stdout, tx_inbound), + ); // Inbound dispatcher: routes notifications to `tx_diag`, replies to a // pending map. We keep the pending map for completeness even though // diagnostics polling itself does not reuse it. let pending: Arc>>> = Arc::new(AsyncMutex::new(HashMap::new())); - tokio::spawn(dispatcher_task(rx_inbound, tx_diag, pending.clone())); + spawn_supervised( + "lsp-dispatcher", + std::panic::Location::caller(), + dispatcher_task(rx_inbound, tx_diag, pending.clone()), + ); // Send `initialize` and wait for `initialized`. We synthesize id=1. let init_payload = json!({ diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index 8307aa44..6c130626 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -506,9 +506,18 @@ enum SandboxCommand { #[tokio::main] async fn main() -> Result<()> { // Set up process panic hook before anything else — writes crash dumps - // to ~/.deepseek/crashes/ even if the panic happens before tokio is up. + // to ~/.deepseek/crashes/ even if the panic happens before tokio is up, + // and restores the terminal so a panicked TUI doesn't leave the user's + // shell stuck in alt-screen mode. let orig_hook = std::panic::take_hook(); std::panic::set_hook(Box::new(move |panic_info| { + // Restore the terminal first so the panic message itself, plus the + // user's shell after exit, are visible. Best-effort — we may not be + // in raw / alt-screen mode if the panic happens pre-TUI. + use crossterm::terminal::{LeaveAlternateScreen, disable_raw_mode}; + let _ = disable_raw_mode(); + let _ = crossterm::execute!(std::io::stdout(), LeaveAlternateScreen); + let msg = if let Some(s) = panic_info.payload().downcast_ref::<&str>() { s.to_string() } else if let Some(s) = panic_info.payload().downcast_ref::() { diff --git a/crates/tui/src/rlm/bridge.rs b/crates/tui/src/rlm/bridge.rs index 6338762b..36d8a9dc 100644 --- a/crates/tui/src/rlm/bridge.rs +++ b/crates/tui/src/rlm/bridge.rs @@ -22,6 +22,7 @@ use tokio::sync::Mutex; use crate::llm_client::LlmClient; use crate::models::{ContentBlock, Message, MessageRequest, MessageResponse, SystemPrompt, Usage}; use crate::repl::runtime::{BatchResp, RpcDispatcher, RpcRequest, RpcResponse, SingleResp}; +use crate::utils::spawn_supervised; /// Per-child completion timeout — same as the previous sidecar default. const CHILD_TIMEOUT_SECS: u64 = 120; @@ -185,7 +186,11 @@ impl RlmBridge { // turn (we don't surface them; this dispatch is invisible to the // outer agent stream). let (tx, mut rx) = tokio::sync::mpsc::channel(64); - let drain = tokio::spawn(async move { while rx.recv().await.is_some() {} }); + let drain = spawn_supervised( + "rlm-bridge-drain", + std::panic::Location::caller(), + async move { while rx.recv().await.is_some() {} }, + ); let child_model = model .filter(|m| !m.is_empty()) diff --git a/crates/tui/src/task_manager.rs b/crates/tui/src/task_manager.rs index f1aa8507..e19d146e 100644 --- a/crates/tui/src/task_manager.rs +++ b/crates/tui/src/task_manager.rs @@ -27,6 +27,7 @@ use crate::runtime_threads::{ CreateThreadRequest, RuntimeThreadManager, RuntimeThreadManagerConfig, RuntimeTurnStatus, SharedRuntimeThreadManager, StartTurnRequest, }; +use crate::utils::spawn_supervised; const DEFAULT_WORKERS: usize = 2; const MAX_WORKERS: usize = 8; @@ -787,9 +788,13 @@ impl TaskManager { for _ in 0..workers { let manager_clone = Arc::clone(&manager); - tokio::spawn(async move { - manager_clone.worker_loop().await; - }); + spawn_supervised( + "task-manager-worker", + std::panic::Location::caller(), + async move { + manager_clone.worker_loop().await; + }, + ); } Ok(manager) diff --git a/crates/tui/src/tools/rlm.rs b/crates/tui/src/tools/rlm.rs index c4e513c5..cfc1f0a9 100644 --- a/crates/tui/src/tools/rlm.rs +++ b/crates/tui/src/tools/rlm.rs @@ -21,6 +21,7 @@ use crate::rlm::turn::{RlmTermination, run_rlm_turn_with_root}; use crate::tools::spec::{ ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, }; +use crate::utils::spawn_supervised; /// Default child model — cheap and fast. const DEFAULT_CHILD_MODEL: &str = "deepseek-v4-flash"; @@ -187,7 +188,11 @@ impl ToolSpec for RlmTool { // we don't want RLM's progress events to interleave with the // parent agent's stream. Drain into a no-op channel. let (tx, mut rx) = tokio::sync::mpsc::channel(64); - let drain = tokio::spawn(async move { while rx.recv().await.is_some() {} }); + let drain = spawn_supervised( + "rlm-progress-drain", + std::panic::Location::caller(), + async move { while rx.recv().await.is_some() {} }, + ); // The big body lives only in the REPL as `context`. The small // `task` rides along as `root_prompt` and is shown to the root diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs index fb71a89a..58432df7 100644 --- a/crates/tui/src/tools/subagent/mod.rs +++ b/crates/tui/src/tools/subagent/mod.rs @@ -31,6 +31,7 @@ use crate::tools::spec::{ optional_bool, optional_u64, required_str, }; use crate::tools::todo::{SharedTodoList, TodoList}; +use crate::utils::spawn_supervised; pub mod mailbox; #[allow(unused_imports)] @@ -885,7 +886,11 @@ impl SubAgentManager { max_steps, input_rx, }; - let handle = tokio::spawn(run_subagent_task(task)); + let handle = spawn_supervised( + "subagent-task", + std::panic::Location::caller(), + run_subagent_task(task), + ); agent.task_handle = Some(handle); self.agents.insert(agent_id.clone(), agent); self.persist_state_best_effort(); @@ -985,7 +990,11 @@ impl SubAgentManager { max_steps: self.max_steps, input_rx, }; - let handle = tokio::spawn(run_subagent_task(task)); + let handle = spawn_supervised( + "subagent-task-resume", + std::panic::Location::caller(), + run_subagent_task(task), + ); agent.status = SubAgentStatus::Running; agent.result = None; diff --git a/crates/tui/src/utils.rs b/crates/tui/src/utils.rs index 51759e2f..7220e516 100644 --- a/crates/tui/src/utils.rs +++ b/crates/tui/src/utils.rs @@ -530,6 +530,76 @@ mod atomic_write_tests { } } +#[cfg(test)] +mod spawn_supervised_tests { + use super::*; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; + + /// A spawned task that panics produces a crash dump in + /// `~/.deepseek/crashes/` and the panic does not propagate to the + /// parent task — `spawn_supervised` catches it. + #[tokio::test] + async fn panicking_task_writes_crash_dump_and_does_not_kill_parent() { + // Redirect HOME so we don't pollute the real ~/.deepseek/crashes/. + let tmp = tempfile::tempdir().expect("tempdir"); + let prev_home = std::env::var_os("HOME"); + // SAFETY: tests in this crate run with single-threaded env mutation + // by harness convention; we restore on exit. + unsafe { std::env::set_var("HOME", tmp.path()) }; + + // Spawn a task that immediately panics. + let parent_alive = Arc::new(AtomicBool::new(false)); + let parent_alive_clone = parent_alive.clone(); + + let handle = spawn_supervised( + "panic-test-fixture", + std::panic::Location::caller(), + async move { + parent_alive_clone.store(true, Ordering::SeqCst); + panic!("deliberate panic for crash-dump test"); + }, + ); + + // The handle resolves to () because spawn_supervised swallows the + // panic. Awaiting must not return Err — the caller must not see + // the panic. + let result = handle.await; + + // Restore HOME before any assertions can panic. + match prev_home { + Some(v) => unsafe { std::env::set_var("HOME", v) }, + None => unsafe { std::env::remove_var("HOME") }, + } + + assert!( + result.is_ok(), + "spawn_supervised must convert panic to a normal completion" + ); + assert!( + parent_alive.load(Ordering::SeqCst), + "fixture task must have run before panicking" + ); + + // A crash dump file must exist under /.deepseek/crashes/. + let crash_dir = tmp.path().join(".deepseek").join("crashes"); + let entries: Vec<_> = std::fs::read_dir(&crash_dir) + .expect("crashes dir exists") + .flatten() + .collect(); + assert_eq!(entries.len(), 1, "exactly one crash dump expected"); + let dump = std::fs::read_to_string(entries[0].path()).expect("read dump"); + assert!( + dump.contains("panic-test-fixture"), + "dump must include the task name; got: {dump}" + ); + assert!( + dump.contains("deliberate panic for crash-dump test"), + "dump must include the panic message; got: {dump}" + ); + } +} + #[cfg(test)] mod project_mapping_tests { use super::{project_tree, summarize_project};