codewhale/crates/tui/src/core/engine/context.rs

//! Context budgeting and prompt-shaping helpers for the engine.
//!
//! These functions are shared by the streaming turn loop, capacity flow, and
//! engine session maintenance code. Keeping them here prevents the top-level
//! engine module from accumulating unrelated context-policy details.

use crate::compaction::estimate_tokens;
use crate::error_taxonomy::ErrorCategory;
use crate::models::{Message, SystemPrompt, context_window_for_model};
use crate::tools::spec::ToolResult;
use serde_json::Value;

/// Max output tokens requested for normal agent turns. Generous on purpose:
/// V4 thinking models can produce tens of thousands of reasoning tokens on
/// hard prompts before the visible reply, and DeepSeek V4 ships with a 1M
/// context window. v0.7.5 keeps this cap fixed instead of silently lowering
/// `max_tokens` near pressure; hard-cycle/preflight checks reserve this budget
/// plus safety headroom before sending the next request.
pub(super) const TURN_MAX_OUTPUT_TOKENS: u32 = 262_144;

/// Safe max output tokens sent in the API request. This must be low enough to
/// work with providers that have smaller context limits than the model's native
/// window (e.g., self-hosted vLLM/SGLang with `--max-model-len 131072`).
/// DeepSeek's API will still produce as many tokens as needed for thinking;
/// this cap just prevents HTTP 400 from providers with tight limits.
const API_MAX_OUTPUT_TOKENS: u32 = 65_536;

/// Compute the effective `max_tokens` to send in the API request for a given
/// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
/// limits (128K+ total). For non-V4 models with smaller context windows, caps
/// at half the context window.
///
/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
/// integer, this function returns that value directly. Use this for self-hosted
/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
/// model-table heuristic above would over-allocate. Example: vLLM serving
/// Qwen3.6 with `--max-model-len 65536` should set
/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
/// provider's hard limit.
pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
    if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
        && let Ok(n) = raw.trim().parse::<u32>()
        && n > 0
    {
        return n;
    }
    let window = context_window_for_model(model).unwrap_or(128_000);
    if window >= 500_000 {
        // V4-class models on large-context providers: use 64K which is safe
        // for most deployments while still allowing substantial output.
        API_MAX_OUTPUT_TOKENS
    } else {
        // Smaller models: cap at half the context window (leave room for input)
        let capped = window / 2;
        capped.min(API_MAX_OUTPUT_TOKENS)
    }
}
/// Keep this many most recent messages when emergency trimming is required.
pub(super) const MIN_RECENT_MESSAGES_TO_KEEP: usize = 4;
/// Allow a few emergency recovery attempts before failing the turn.
pub(super) const MAX_CONTEXT_RECOVERY_ATTEMPTS: u8 = 2;
/// Reserve additional headroom to avoid hitting provider hard limits.
const CONTEXT_HEADROOM_TOKENS: usize = 1024;
/// Hard cap for any tool output inserted into model context.
const TOOL_RESULT_CONTEXT_HARD_LIMIT_CHARS: usize = 12_000;
/// Soft cap for known noisy tools inserted into model context.
const TOOL_RESULT_CONTEXT_SOFT_LIMIT_CHARS: usize = 2_000;
/// Snippet length kept when compacting tool output for model context.
const TOOL_RESULT_CONTEXT_SNIPPET_CHARS: usize = 900;
/// Hard cap for tool output inserted into a large-context model.
const LARGE_CONTEXT_TOOL_RESULT_HARD_LIMIT_CHARS: usize = 180_000;
/// Soft cap for known noisy tools inserted into a large-context model.
const LARGE_CONTEXT_TOOL_RESULT_SOFT_LIMIT_CHARS: usize = 60_000;
/// Snippet length kept when compacting large-context tool output.
const LARGE_CONTEXT_TOOL_RESULT_SNIPPET_CHARS: usize = 40_000;
/// Context window size at which tool output limits can be relaxed.
const LARGE_CONTEXT_WINDOW_TOKENS: u32 = 500_000;
/// Max chars to keep from metadata-provided output summaries.
const TOOL_RESULT_METADATA_SUMMARY_CHARS: usize = 320;

pub(super) const COMPACTION_SUMMARY_MARKER: &str = "Conversation Summary (Auto-Generated)";

#[derive(Debug, Clone, Copy)]
struct ToolResultContextLimits {
    hard_limit_chars: usize,
    noisy_soft_limit_chars: usize,
    snippet_chars: usize,
}

pub(super) fn summarize_text(text: &str, limit: usize) -> String {
    if text.chars().count() <= limit {
        return text.to_string();
    }
    let take = limit.saturating_sub(3);
    let mut out: String = text.chars().take(take).collect();
    out.push_str("...");
    out
}

fn summarize_text_head_tail(text: &str, limit: usize) -> String {
    let total = text.chars().count();
    if total <= limit {
        return text.to_string();
    }
    if limit <= 20 {
        return summarize_text(text, limit);
    }

    let marker = "\n\n[... output truncated for context ...]\n\n";
    let marker_len = marker.chars().count();
    if limit <= marker_len + 20 {
        return summarize_text(text, limit);
    }

    let remaining = limit - marker_len;
    let head_len = remaining.saturating_mul(2) / 3;
    let tail_len = remaining.saturating_sub(head_len);
    let head: String = text.chars().take(head_len).collect();
    let tail_vec: Vec<char> = text.chars().rev().take(tail_len).collect();
    let tail: String = tail_vec.into_iter().rev().collect();
    format!("{head}{marker}{tail}")
}

fn tool_result_is_noisy(tool_name: &str) -> bool {
    matches!(
        tool_name,
        "exec_shell"
            | "exec_shell_wait"
            | "exec_shell_interact"
            | "exec_shell_cancel"
            | "task_shell_start"
            | "task_shell_wait"
            | "run_tests"
            | "run_verifiers"
            | "task_gate_run"
            | "multi_tool_use.parallel"
            | "web_search"
    )
}

fn tool_result_metadata_summary(metadata: Option<&serde_json::Value>) -> Option<String> {
    let obj = metadata?.as_object()?;
    for key in ["summary", "stdout_summary", "stderr_summary", "message"] {
        if let Some(text) = obj.get(key).and_then(serde_json::Value::as_str) {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                return Some(summarize_text(trimmed, TOOL_RESULT_METADATA_SUMMARY_CHARS));
            }
        }
    }
    None
}

fn summarize_subagent_status(status: &serde_json::Value) -> String {
    if let Some(raw) = status.as_str() {
        return raw.to_string();
    }
    if let Some(obj) = status.as_object()
        && let Some((kind, value)) = obj.iter().next()
    {
        if let Some(reason) = value.as_str().filter(|s| !s.trim().is_empty()) {
            return format!("{kind}({})", summarize_text(reason.trim(), 120));
        }
        return kind.to_string();
    }
    status.to_string()
}

fn summarize_subagent_snapshot(snapshot: &serde_json::Value, index: usize) -> String {
    if let Some(inner) = snapshot.get("snapshot") {
        return summarize_subagent_snapshot(inner, index);
    }

    let Some(obj) = snapshot.as_object() else {
        return format!(
            "- item {index}: {}",
            summarize_text(&snapshot.to_string(), 240)
        );
    };

    let agent_id = obj
        .get("agent_id")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("unknown");
    let agent_type = obj
        .get("agent_type")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("agent");
    let status = obj
        .get("status")
        .map(summarize_subagent_status)
        .unwrap_or_else(|| "unknown".to_string());
    let objective = obj
        .get("assignment")
        .and_then(|assignment| assignment.get("objective"))
        .and_then(serde_json::Value::as_str)
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(|s| summarize_text(s, 220));
    let result = obj
        .get("result")
        .and_then(serde_json::Value::as_str)
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(|s| summarize_text(s, 1_600));
    let steps = obj.get("steps_taken").and_then(serde_json::Value::as_u64);
    let duration_ms = obj.get("duration_ms").and_then(serde_json::Value::as_u64);

    let mut lines = vec![format!("- {agent_id} ({agent_type}) status={status}")];
    if let Some(objective) = objective {
        lines.push(format!("  objective: {objective}"));
    }
    match result {
        Some(result) => lines.push(format!("  result: {result}")),
        None => lines.push("  result: not available yet".to_string()),
    }
    if steps.is_some() || duration_ms.is_some() {
        let steps = steps
            .map(|n| n.to_string())
            .unwrap_or_else(|| "?".to_string());
        let duration_ms = duration_ms
            .map(|n| n.to_string())
            .unwrap_or_else(|| "?".to_string());
        lines.push(format!("  stats: steps={steps}, duration_ms={duration_ms}"));
    }
    lines.join("\n")
}

fn compact_subagent_tool_result_for_context(tool_name: &str, raw: &str) -> Option<String> {
    if !matches!(
        tool_name,
        "agent_open"
            | "agent_eval"
            | "agent_close"
            | "agent_result"
            | "agent_wait"
            | "tool_agent"
            | "wait"
    ) {
        return None;
    }

    let parsed: serde_json::Value = serde_json::from_str(raw).ok()?;
    let snapshots: Vec<&serde_json::Value> = match &parsed {
        serde_json::Value::Array(items) => items.iter().collect(),
        serde_json::Value::Object(_) => vec![&parsed],
        _ => return None,
    };

    let mut out = String::from("[sub-agent result summarized for parent context]\n");
    out.push_str(
        "Child results are self-reports; verify side effects with tools like read_file or list_dir before claiming success.\n",
    );
    out.push_str("Use `agent_eval` for a fresh projection or `handle_read` on `transcript_handle` for bounded transcript slices.\n");
    for (idx, snapshot) in snapshots.iter().enumerate() {
        if idx >= 8 {
            out.push_str(&format!(
                "- ... {} more sub-agent result(s) omitted from context summary\n",
                snapshots.len().saturating_sub(idx)
            ));
            break;
        }
        out.push_str(&summarize_subagent_snapshot(snapshot, idx + 1));
        out.push('\n');
    }
    Some(out.trim_end().to_string())
}

fn json_text<'a>(value: &'a Value, key: &str) -> Option<&'a str> {
    value
        .get(key)
        .and_then(Value::as_str)
        .map(str::trim)
        .filter(|s| !s.is_empty())
}

fn json_number_text(value: &Value, key: &str) -> Option<String> {
    value
        .get(key)
        .and_then(|value| {
            value
                .as_i64()
                .map(|n| n.to_string())
                .or_else(|| value.as_u64().map(|n| n.to_string()))
        })
        .or_else(|| {
            value
                .get(key)
                .and_then(Value::as_str)
                .map(str::trim)
                .filter(|s| !s.is_empty())
                .map(ToString::to_string)
        })
}

fn compact_run_tests_result_for_context(raw: &str) -> Option<String> {
    let parsed: Value = serde_json::from_str(raw).ok()?;
    let success = parsed.get("success")?.as_bool()?;
    let exit_code = json_number_text(&parsed, "exit_code").unwrap_or_else(|| "?".to_string());
    let command = json_text(&parsed, "command").unwrap_or("(unknown command)");
    let stdout = json_text(&parsed, "stdout");
    let stderr = json_text(&parsed, "stderr");
    let stream_limit = if success { 500 } else { 1_000 };

    let mut lines = vec![
        "[run_tests result summarized for context]".to_string(),
        format!(
            "status: {}, exit_code: {exit_code}",
            if success { "passed" } else { "failed" }
        ),
        format!("command: {}", summarize_text(command, 300)),
    ];
    if let Some(stderr) = stderr {
        lines.push(format!(
            "stderr: {}",
            summarize_text_head_tail(stderr, stream_limit)
        ));
    }
    if let Some(stdout) = stdout {
        lines.push(format!(
            "stdout: {}",
            summarize_text_head_tail(stdout, stream_limit)
        ));
    }
    Some(lines.join("\n"))
}

fn run_verifier_status_rank(status: Option<&str>) -> u8 {
    match status.unwrap_or_default() {
        "failed" | "timeout" => 0,
        "skipped" => 1,
        "passed" => 2,
        _ => 3,
    }
}

fn compact_run_verifiers_result_for_context(raw: &str) -> Option<String> {
    let parsed: Value = serde_json::from_str(raw).ok()?;
    let gates = parsed.get("gates")?.as_array()?;
    let summary = json_text(&parsed, "summary")
        .map(ToString::to_string)
        .unwrap_or_else(|| {
            let passed = json_number_text(&parsed, "passed").unwrap_or_else(|| "?".to_string());
            let failed = json_number_text(&parsed, "failed").unwrap_or_else(|| "?".to_string());
            let skipped = json_number_text(&parsed, "skipped").unwrap_or_else(|| "?".to_string());
            format!("{passed} passed, {failed} failed, {skipped} skipped")
        });

    let mut ordered: Vec<&Value> = gates.iter().collect();
    ordered.sort_by(|a, b| {
        run_verifier_status_rank(json_text(a, "status"))
            .cmp(&run_verifier_status_rank(json_text(b, "status")))
            .then_with(|| json_text(a, "name").cmp(&json_text(b, "name")))
    });

    let mut lines = vec![
        "[run_verifiers result summarized for context]".to_string(),
        format!("summary: {summary}"),
    ];
    let profile = json_text(&parsed, "profile");
    let level = json_text(&parsed, "level");
    if profile.is_some() || level.is_some() {
        lines.push(format!(
            "selection: profile={}, level={}",
            profile.unwrap_or("?"),
            level.unwrap_or("?")
        ));
    }

    for (idx, gate) in ordered.iter().enumerate() {
        if idx >= 12 {
            lines.push(format!(
                "- ... {} more gate(s) omitted from context summary",
                ordered.len().saturating_sub(idx)
            ));
            break;
        }

        let name = json_text(gate, "name").unwrap_or("gate");
        let ecosystem = json_text(gate, "ecosystem").unwrap_or("unknown");
        let status = json_text(gate, "status").unwrap_or("unknown");
        let exit = json_number_text(gate, "exit_code")
            .map(|code| format!(" exit={code}"))
            .unwrap_or_default();
        lines.push(format!("- {name} ({ecosystem}): {status}{exit}"));

        if status != "passed" {
            if let Some(command) = json_text(gate, "command") {
                lines.push(format!("  command: {}", summarize_text(command, 240)));
            }
            if let Some(detail) = json_text(gate, "skipped_reason")
                .or_else(|| json_text(gate, "stderr"))
                .or_else(|| json_text(gate, "stdout"))
            {
                lines.push(format!(
                    "  detail: {}",
                    summarize_text_head_tail(detail, 600)
                ));
            }
        }
    }

    Some(lines.join("\n"))
}

fn compact_task_gate_run_result_for_context(raw: &str) -> Option<String> {
    let parsed: Value = serde_json::from_str(raw).ok()?;
    let gate = parsed.get("gate")?;
    let gate_name = json_text(gate, "gate").unwrap_or("gate");
    let status = json_text(gate, "status").unwrap_or("unknown");
    let command = json_text(gate, "command").unwrap_or("(unknown command)");
    let summary = json_text(gate, "summary")
        .or_else(|| json_text(&parsed, "stderr_summary"))
        .or_else(|| json_text(&parsed, "stdout_summary"));
    let exit = json_number_text(gate, "exit_code")
        .map(|code| format!(", exit_code: {code}"))
        .unwrap_or_default();

    let mut lines = vec![
        "[task_gate_run result summarized for context]".to_string(),
        format!("gate: {gate_name}, status: {status}{exit}"),
        format!("command: {}", summarize_text(command, 300)),
    ];
    if let Some(summary) = summary {
        lines.push(format!("summary: {}", summarize_text(summary, 800)));
    }
    if let Some(log_path) = json_text(gate, "log_path") {
        lines.push(format!("log_path: {log_path}"));
    }
    Some(lines.join("\n"))
}

fn compact_structured_tool_result_for_context(tool_name: &str, raw: &str) -> Option<String> {
    match tool_name {
        "run_tests" => compact_run_tests_result_for_context(raw),
        "run_verifiers" => compact_run_verifiers_result_for_context(raw),
        "task_gate_run" => compact_task_gate_run_result_for_context(raw),
        _ => None,
    }
}

fn tool_result_context_limits_for_model(model: &str) -> ToolResultContextLimits {
    let is_large_context =
        context_window_for_model(model).is_some_and(|window| window >= LARGE_CONTEXT_WINDOW_TOKENS);

    if is_large_context {
        ToolResultContextLimits {
            hard_limit_chars: LARGE_CONTEXT_TOOL_RESULT_HARD_LIMIT_CHARS,
            noisy_soft_limit_chars: LARGE_CONTEXT_TOOL_RESULT_SOFT_LIMIT_CHARS,
            snippet_chars: LARGE_CONTEXT_TOOL_RESULT_SNIPPET_CHARS,
        }
    } else {
        ToolResultContextLimits {
            hard_limit_chars: TOOL_RESULT_CONTEXT_HARD_LIMIT_CHARS,
            noisy_soft_limit_chars: TOOL_RESULT_CONTEXT_SOFT_LIMIT_CHARS,
            snippet_chars: TOOL_RESULT_CONTEXT_SNIPPET_CHARS,
        }
    }
}

pub(crate) fn compact_tool_result_for_context(
    model: &str,
    tool_name: &str,
    output: &ToolResult,
) -> String {
    let raw = output.content.trim();
    if raw.is_empty() {
        return String::new();
    }

    if let Some(summary) = compact_subagent_tool_result_for_context(tool_name, raw) {
        return summary;
    }

    if let Some(summary) = compact_structured_tool_result_for_context(tool_name, raw) {
        return summary;
    }

    let limits = tool_result_context_limits_for_model(model);
    let raw_chars = raw.chars().count();
    let should_compact = raw_chars > limits.hard_limit_chars
        || (tool_result_is_noisy(tool_name) && raw_chars > limits.noisy_soft_limit_chars);
    if !should_compact {
        return raw.to_string();
    }

    let snippet = summarize_text_head_tail(raw, limits.snippet_chars);
    let omitted = raw_chars.saturating_sub(snippet.chars().count());
    let summary = tool_result_metadata_summary(output.metadata.as_ref());

    if let Some(summary) = summary {
        format!(
            "[{tool_name} output compacted to protect context]\nSummary: {summary}\nSnippet: {snippet}\n(Original: {raw_chars} chars, omitted: {omitted} chars.)"
        )
    } else {
        format!(
            "[{tool_name} output compacted to protect context]\nSnippet: {snippet}\n(Original: {raw_chars} chars, omitted: {omitted} chars.)"
        )
    }
}

pub(super) fn extract_compaction_summary_prompt(
    prompt: Option<SystemPrompt>,
) -> Option<SystemPrompt> {
    match prompt {
        Some(SystemPrompt::Blocks(blocks)) => {
            let summary_blocks: Vec<_> = blocks
                .into_iter()
                .filter(|block| block.text.contains(COMPACTION_SUMMARY_MARKER))
                .collect();
            if summary_blocks.is_empty() {
                None
            } else {
                Some(SystemPrompt::Blocks(summary_blocks))
            }
        }
        Some(SystemPrompt::Text(text)) => {
            if text.contains(COMPACTION_SUMMARY_MARKER) {
                Some(SystemPrompt::Text(text))
            } else {
                None
            }
        }
        None => None,
    }
}

#[allow(dead_code)] // exposed for future engine-side callers; current call path goes through compaction::estimate_input_tokens_conservative via token_estimate_cache.
fn estimate_text_tokens_conservative(text: &str) -> usize {
    text.chars().count().div_ceil(3)
}

#[allow(dead_code)] // see estimate_text_tokens_conservative above
fn estimate_system_tokens_conservative(system: Option<&SystemPrompt>) -> usize {
    match system {
        Some(SystemPrompt::Text(text)) => estimate_text_tokens_conservative(text),
        Some(SystemPrompt::Blocks(blocks)) => blocks
            .iter()
            .map(|block| estimate_text_tokens_conservative(&block.text))
            .sum(),
        None => 0,
    }
}

#[allow(dead_code)] // see estimate_text_tokens_conservative above
pub(super) fn estimate_input_tokens_conservative(
    messages: &[Message],
    system: Option<&SystemPrompt>,
) -> usize {
    let message_tokens = estimate_tokens(messages).saturating_mul(3).div_ceil(2);
    let system_tokens = estimate_system_tokens_conservative(system);
    let framing_overhead = messages.len().saturating_mul(12).saturating_add(48);
    message_tokens
        .saturating_add(system_tokens)
        .saturating_add(framing_overhead)
}

/// Context windows at or above this size reserve the full
/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
/// leaving room for V4-class interleaved thinking. Below it, the reservation
/// falls back to [`effective_max_output_tokens`] so a smaller self-hosted
/// window does not underflow to a negative budget.
const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;

/// Internal input-side token budget for a model: `window - reserved_output -
/// headroom`. Used by the preflight check, emergency recovery, and capacity
/// trimming to decide when to compact.
///
/// The reserved-output term is window-dependent:
///   * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
///     (262K). Preserves the "leave room for interleaved thinking" contract.
///   * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
///     -> [`effective_max_output_tokens`], i.e. what the API actually caps
///     output at. Reserving the full 262K here would compute
///     `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
///     *silently disables every preflight and emergency recovery path* — the
///     session then runs until the provider hard-rejects on context length.
pub(super) fn context_input_budget(model: &str) -> Option<usize> {
    let window_tokens = context_window_for_model(model)?;
    let window = usize::try_from(window_tokens).ok()?;
    let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
        TURN_MAX_OUTPUT_TOKENS
    } else {
        effective_max_output_tokens(model)
    };
    let output = usize::try_from(reserved_output).ok()?;
    window
        .checked_sub(output)
        .and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
}

pub(super) fn is_context_length_error_message(message: &str) -> bool {
    crate::error_taxonomy::classify_error_message(message) == ErrorCategory::InvalidInput
}