diff --git a/crates/tui/src/core/capacity.rs b/crates/tui/src/core/capacity.rs index 437075fb..55547c9b 100644 --- a/crates/tui/src/core/capacity.rs +++ b/crates/tui/src/core/capacity.rs @@ -28,17 +28,18 @@ impl Default for CapacityControllerConfig { model_priors.insert("deepseek_v4_flash".to_string(), 4.2); Self { - enabled: true, - // Tuning history (#63 follow-up): the previous defaults - // (low_risk_max=0.34, refresh_cooldown_turns=2, min_turns=2) - // fired `TargetedContextRefresh` every couple of turns whenever - // p_fail crept above 0.34. Each refresh runs `compact_messages_safe` - // which rewrites the conversation history — visually that looked - // like the agent "restarting" mid-session. Bumping the floor to - // 0.50 (still well below the medium ceiling of 0.62) and - // lengthening the cooldown to 6 turns reduces interventions - // ~3-4x without disabling the controller; it keeps firing on - // genuine risk while ignoring routine noise. + // OFF BY DEFAULT. The capacity controller's main intervention, + // `TargetedContextRefresh`, runs `compact_messages_safe` which + // rewrites the live conversation — visually identical to the + // agent "restarting" mid-turn. Power users running V4 on a 1M + // context window simply don't need this guardrail; the failure + // mode it protects against (context overflow) is rare in + // practice and self-correcting (the model surfaces a clear + // error). Users who do want the controller back can enable it + // via `capacity.enabled = true` in `~/.deepseek/config.toml`. + enabled: false, + // Thresholds retained for the opt-in path; tuning notes live + // in git history (#63 follow-up). low_risk_max: 0.50, medium_risk_max: 0.62, severe_min_slack: -0.25, @@ -693,7 +694,13 @@ mod tests { #[test] fn cooldown_blocks_repeated_action() { - let mut controller = CapacityController::new(CapacityControllerConfig::default()); + // Capacity controller is opt-in (off by default since v0.6.2). 
This + // test exercises the cooldown logic, so explicitly enable it. + let config = CapacityControllerConfig { + enabled: true, + ..CapacityControllerConfig::default() + }; + let mut controller = CapacityController::new(config); let turn_index = 5; controller.mark_turn_start(turn_index); controller.mark_intervention_applied(turn_index, GuardrailAction::TargetedContextRefresh); diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 5a2c053c..ce9765eb 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -263,12 +263,13 @@ const STREAM_CHUNK_TIMEOUT_SECS: u64 = 90; const STREAM_MAX_CONTENT_BYTES: usize = 10 * 1024 * 1024; // 10 MB /// Maximum wall-clock duration for a single streaming response. const STREAM_MAX_DURATION_SECS: u64 = 300; // 5 minutes -/// Max output tokens requested for normal agent turns. Bumped from 4096 to -/// 32768: V4 thinking models can consume 8-15K reasoning tokens on hard -/// prompts; the old 4K ceiling exhausted the budget, the API closed the -/// SSE stream with `finish_reason: "length"`, and the visible reply ended -/// up empty (surfaced as the assistant "stopping mid-response"). -const TURN_MAX_OUTPUT_TOKENS: u32 = 32768; +/// Max output tokens requested for normal agent turns. Generous on purpose: +/// V4 thinking models can produce tens of thousands of reasoning tokens on +/// hard prompts before the visible reply, and DeepSeek V4 ships with a 1M +/// context window. 256K (262,144 tokens) leaves output effectively +/// unconstrained; the earlier, smaller per-turn cap surfaced as the assistant +/// "stopping mid-response" whenever reasoning consumed the whole budget. +const TURN_MAX_OUTPUT_TOKENS: u32 = 262_144; /// Keep this many most recent messages when emergency trimming is required. const MIN_RECENT_MESSAGES_TO_KEEP: usize = 4; /// Allow a few emergency recovery attempts before failing the turn. 
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index 80c446af..3f030d97 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -260,9 +260,12 @@ fn detects_context_length_errors_from_provider_payloads() { #[test] fn context_budget_reserves_output_and_headroom() { - let budget = context_input_budget("deepseek-v3.2-128k", TURN_MAX_OUTPUT_TOKENS) - .expect("deepseek models should have known context window"); - let expected = 128_000usize - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize; + // V4 has a 1M context window — the only family that comfortably hosts + // a 256K output reservation without saturating the input budget to 0. + let budget = context_input_budget("deepseek-v4-pro", TURN_MAX_OUTPUT_TOKENS) + .expect("deepseek-v4-pro should have a known context window"); + let v4_window: usize = 1_000_000; + let expected = v4_window - (TURN_MAX_OUTPUT_TOKENS as usize) - 1_024usize; assert_eq!(budget, expected); }