diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index 7558856b..5a2c053c 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -263,8 +263,12 @@ const STREAM_CHUNK_TIMEOUT_SECS: u64 = 90;
 const STREAM_MAX_CONTENT_BYTES: usize = 10 * 1024 * 1024; // 10 MB
 /// Maximum wall-clock duration for a single streaming response.
 const STREAM_MAX_DURATION_SECS: u64 = 300; // 5 minutes
-/// Max output tokens requested for normal agent turns.
-const TURN_MAX_OUTPUT_TOKENS: u32 = 4096;
+/// Max output tokens requested for normal agent turns. Bumped from 4096 to
+/// 32768: V4 thinking models can consume 8-15K reasoning tokens on hard
+/// prompts; the old 4K ceiling exhausted the budget, the API closed the
+/// SSE stream with `finish_reason: "length"`, and the visible reply ended
+/// up empty (surfaced as the assistant "stopping mid-response").
+const TURN_MAX_OUTPUT_TOKENS: u32 = 32768;
 /// Keep this many most recent messages when emergency trimming is required.
 const MIN_RECENT_MESSAGES_TO_KEEP: usize = 4;
 /// Allow a few emergency recovery attempts before failing the turn.
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index dd1000a8..80c446af 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -262,7 +262,7 @@ fn detects_context_length_errors_from_provider_payloads() {
 fn context_budget_reserves_output_and_headroom() {
     let budget = context_input_budget("deepseek-v3.2-128k", TURN_MAX_OUTPUT_TOKENS)
         .expect("deepseek models should have known context window");
-    let expected = 128_000usize - 4_096usize - 1_024usize;
+    let expected = 128_000usize - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize;
     assert_eq!(budget, expected);
 }
 
diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs
index fd776377..a58d6564 100644
--- a/crates/tui/src/tui/ui.rs
+++ b/crates/tui/src/tui/ui.rs
@@ -314,7 +314,15 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
         trust_mode: app.trust_mode,
         notes_path: config.notes_path(),
         mcp_config_path: config.mcp_config_path(),
-        max_steps: 100,
+        // Effectively unlimited. V4 has a 1M context window and the user
+        // wants the model running until it's actually done. The previous cap
+        // of 100 hit the ceiling on long multi-step plans (wide refactors,
+        // sub-agent orchestration) and presented as the agent "giving up
+        // mid-task". `u32::MAX` is the type ceiling; users can still
+        // interrupt with Ctrl+C / Esc, and a turn naturally ends when the
+        // model stops emitting tool calls. A real runaway is rare and
+        // human-noticeable; we trust the operator over a hard step cap.
+        max_steps: u32::MAX,
         max_subagents: app.max_subagents,
         features: config.features(),
         compaction: app.compaction_config(),