Files
codewhale/crates/tui/src/core/engine/context.rs
T
HUQIANTAO 3de07a99ed perf(engine): memoize estimated_input_tokens via content-keyed cache
The token estimator walks the full session.messages and the active system
prompt. Five call sites per turn in the engine (capacity pre/post tool
checkpoints, error escalation, the seam manager, the trim budget check)
plus four TUI/command consumers (footer, /status, /debug, context
inspector) all re-walked the same data independently. On a 200-message
history with 5 KB of tool results that is roughly 2 ms per call, or
~20 ms of pure waste on a single turn.

Introduce a process-local TokenEstimateCache keyed on
(session.messages_revision, system_prompt_fingerprint). Repeated calls
with the same inputs return the cached value without re-walking the
message list. The cache invalidates as soon as either input changes:
  * session.messages_revision is a monotonic counter bumped in
    Session::add_message, Session::replace_messages, the new
    Session::bump_messages_revision helper, and at every direct
    session.messages mutation site in core/engine.rs and
    core/engine/capacity_flow.rs.
  * system_prompt_fingerprint is a stable 64-bit hash of the
    SystemPrompt::Text or SystemPrompt::Blocks payload.

Also restructures layered_context_checkpoint to compute the estimated
token count before taking a long-lived &SeamManager borrow, and
re-routes the capacity pre/post tool checkpoints to compute the
observation into a local before calling
capacity_controller.observe_*. Both refactors are required to satisfy
the borrow checker once estimated_input_tokens requires &mut self.

Tests: 10 new unit tests cover the miss/hit path, revision bumps,
system-prompt changes, audit-ring capacity, and downward-revision
no-ops. The full 157-test engine suite still passes.
2026-06-03 21:01:37 -07:00

595 lines
22 KiB
Rust

//! Context budgeting and prompt-shaping helpers for the engine.
//!
//! These functions are shared by the streaming turn loop, capacity flow, and
//! engine session maintenance code. Keeping them here prevents the top-level
//! engine module from accumulating unrelated context-policy details.
use crate::compaction::estimate_tokens;
use crate::error_taxonomy::ErrorCategory;
use crate::models::{Message, SystemPrompt, context_window_for_model};
use crate::tools::spec::ToolResult;
use serde_json::Value;
/// Max output tokens requested for normal agent turns. Generous on purpose:
/// V4 thinking models can produce tens of thousands of reasoning tokens on
/// hard prompts before the visible reply, and DeepSeek V4 ships with a 1M
/// context window. v0.7.5 keeps this cap fixed instead of silently lowering
/// `max_tokens` near pressure; hard-cycle/preflight checks reserve this budget
/// plus safety headroom before sending the next request.
pub(super) const TURN_MAX_OUTPUT_TOKENS: u32 = 262_144;
/// Safe max output tokens sent in the API request. This must be low enough to
/// work with providers that have smaller context limits than the model's native
/// window (e.g., self-hosted vLLM/SGLang with `--max-model-len 131072`).
/// DeepSeek's API will still produce as many tokens as needed for thinking;
/// this cap just prevents HTTP 400 from providers with tight limits.
const API_MAX_OUTPUT_TOKENS: u32 = 65_536;
/// Compute the effective `max_tokens` to send in the API request for a given
/// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
/// limits (128K+ total). For non-V4 models with smaller context windows, caps
/// at half the context window.
///
/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
/// integer, this function returns that value directly. Use this for self-hosted
/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
/// model-table heuristic above would over-allocate. Example: vLLM serving
/// Qwen3.6 with `--max-model-len 65536` should set
/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
/// provider's hard limit.
pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
&& let Ok(n) = raw.trim().parse::<u32>()
&& n > 0
{
return n;
}
let window = context_window_for_model(model).unwrap_or(128_000);
if window >= 500_000 {
// V4-class models on large-context providers: use 64K which is safe
// for most deployments while still allowing substantial output.
API_MAX_OUTPUT_TOKENS
} else {
// Smaller models: cap at half the context window (leave room for input)
let capped = window / 2;
capped.min(API_MAX_OUTPUT_TOKENS)
}
}
/// Keep this many most recent messages when emergency trimming is required.
pub(super) const MIN_RECENT_MESSAGES_TO_KEEP: usize = 4;
/// Allow a few emergency recovery attempts before failing the turn.
pub(super) const MAX_CONTEXT_RECOVERY_ATTEMPTS: u8 = 2;
/// Reserve additional headroom to avoid hitting provider hard limits.
const CONTEXT_HEADROOM_TOKENS: usize = 1024;
/// Hard cap for any tool output inserted into model context.
const TOOL_RESULT_CONTEXT_HARD_LIMIT_CHARS: usize = 12_000;
/// Soft cap for known noisy tools inserted into model context.
const TOOL_RESULT_CONTEXT_SOFT_LIMIT_CHARS: usize = 2_000;
/// Snippet length kept when compacting tool output for model context.
const TOOL_RESULT_CONTEXT_SNIPPET_CHARS: usize = 900;
/// Hard cap for tool output inserted into a large-context model.
const LARGE_CONTEXT_TOOL_RESULT_HARD_LIMIT_CHARS: usize = 180_000;
/// Soft cap for known noisy tools inserted into a large-context model.
const LARGE_CONTEXT_TOOL_RESULT_SOFT_LIMIT_CHARS: usize = 60_000;
/// Snippet length kept when compacting large-context tool output.
const LARGE_CONTEXT_TOOL_RESULT_SNIPPET_CHARS: usize = 40_000;
/// Context window size at which tool output limits can be relaxed.
const LARGE_CONTEXT_WINDOW_TOKENS: u32 = 500_000;
/// Max chars to keep from metadata-provided output summaries.
const TOOL_RESULT_METADATA_SUMMARY_CHARS: usize = 320;
pub(super) const COMPACTION_SUMMARY_MARKER: &str = "Conversation Summary (Auto-Generated)";
#[derive(Debug, Clone, Copy)]
struct ToolResultContextLimits {
hard_limit_chars: usize,
noisy_soft_limit_chars: usize,
snippet_chars: usize,
}
pub(super) fn summarize_text(text: &str, limit: usize) -> String {
if text.chars().count() <= limit {
return text.to_string();
}
let take = limit.saturating_sub(3);
let mut out: String = text.chars().take(take).collect();
out.push_str("...");
out
}
fn summarize_text_head_tail(text: &str, limit: usize) -> String {
let total = text.chars().count();
if total <= limit {
return text.to_string();
}
if limit <= 20 {
return summarize_text(text, limit);
}
let marker = "\n\n[... output truncated for context ...]\n\n";
let marker_len = marker.chars().count();
if limit <= marker_len + 20 {
return summarize_text(text, limit);
}
let remaining = limit - marker_len;
let head_len = remaining.saturating_mul(2) / 3;
let tail_len = remaining.saturating_sub(head_len);
let head: String = text.chars().take(head_len).collect();
let tail_vec: Vec<char> = text.chars().rev().take(tail_len).collect();
let tail: String = tail_vec.into_iter().rev().collect();
format!("{head}{marker}{tail}")
}
fn tool_result_is_noisy(tool_name: &str) -> bool {
matches!(
tool_name,
"exec_shell"
| "exec_shell_wait"
| "exec_shell_interact"
| "exec_shell_cancel"
| "task_shell_start"
| "task_shell_wait"
| "run_tests"
| "run_verifiers"
| "task_gate_run"
| "multi_tool_use.parallel"
| "web_search"
)
}
fn tool_result_metadata_summary(metadata: Option<&serde_json::Value>) -> Option<String> {
let obj = metadata?.as_object()?;
for key in ["summary", "stdout_summary", "stderr_summary", "message"] {
if let Some(text) = obj.get(key).and_then(serde_json::Value::as_str) {
let trimmed = text.trim();
if !trimmed.is_empty() {
return Some(summarize_text(trimmed, TOOL_RESULT_METADATA_SUMMARY_CHARS));
}
}
}
None
}
fn summarize_subagent_status(status: &serde_json::Value) -> String {
if let Some(raw) = status.as_str() {
return raw.to_string();
}
if let Some(obj) = status.as_object()
&& let Some((kind, value)) = obj.iter().next()
{
if let Some(reason) = value.as_str().filter(|s| !s.trim().is_empty()) {
return format!("{kind}({})", summarize_text(reason.trim(), 120));
}
return kind.to_string();
}
status.to_string()
}
fn summarize_subagent_snapshot(snapshot: &serde_json::Value, index: usize) -> String {
if let Some(inner) = snapshot.get("snapshot") {
return summarize_subagent_snapshot(inner, index);
}
let Some(obj) = snapshot.as_object() else {
return format!(
"- item {index}: {}",
summarize_text(&snapshot.to_string(), 240)
);
};
let agent_id = obj
.get("agent_id")
.and_then(serde_json::Value::as_str)
.unwrap_or("unknown");
let agent_type = obj
.get("agent_type")
.and_then(serde_json::Value::as_str)
.unwrap_or("agent");
let status = obj
.get("status")
.map(summarize_subagent_status)
.unwrap_or_else(|| "unknown".to_string());
let objective = obj
.get("assignment")
.and_then(|assignment| assignment.get("objective"))
.and_then(serde_json::Value::as_str)
.map(str::trim)
.filter(|s| !s.is_empty())
.map(|s| summarize_text(s, 220));
let result = obj
.get("result")
.and_then(serde_json::Value::as_str)
.map(str::trim)
.filter(|s| !s.is_empty())
.map(|s| summarize_text(s, 1_600));
let steps = obj.get("steps_taken").and_then(serde_json::Value::as_u64);
let duration_ms = obj.get("duration_ms").and_then(serde_json::Value::as_u64);
let mut lines = vec![format!("- {agent_id} ({agent_type}) status={status}")];
if let Some(objective) = objective {
lines.push(format!(" objective: {objective}"));
}
match result {
Some(result) => lines.push(format!(" result: {result}")),
None => lines.push(" result: not available yet".to_string()),
}
if steps.is_some() || duration_ms.is_some() {
let steps = steps
.map(|n| n.to_string())
.unwrap_or_else(|| "?".to_string());
let duration_ms = duration_ms
.map(|n| n.to_string())
.unwrap_or_else(|| "?".to_string());
lines.push(format!(" stats: steps={steps}, duration_ms={duration_ms}"));
}
lines.join("\n")
}
fn compact_subagent_tool_result_for_context(tool_name: &str, raw: &str) -> Option<String> {
if !matches!(
tool_name,
"agent_open"
| "agent_eval"
| "agent_close"
| "agent_result"
| "agent_wait"
| "tool_agent"
| "wait"
) {
return None;
}
let parsed: serde_json::Value = serde_json::from_str(raw).ok()?;
let snapshots: Vec<&serde_json::Value> = match &parsed {
serde_json::Value::Array(items) => items.iter().collect(),
serde_json::Value::Object(_) => vec![&parsed],
_ => return None,
};
let mut out = String::from("[sub-agent result summarized for parent context]\n");
out.push_str(
"Child results are self-reports; verify side effects with tools like read_file or list_dir before claiming success.\n",
);
out.push_str("Use `agent_eval` for a fresh projection or `handle_read` on `transcript_handle` for bounded transcript slices.\n");
for (idx, snapshot) in snapshots.iter().enumerate() {
if idx >= 8 {
out.push_str(&format!(
"- ... {} more sub-agent result(s) omitted from context summary\n",
snapshots.len().saturating_sub(idx)
));
break;
}
out.push_str(&summarize_subagent_snapshot(snapshot, idx + 1));
out.push('\n');
}
Some(out.trim_end().to_string())
}
fn json_text<'a>(value: &'a Value, key: &str) -> Option<&'a str> {
value
.get(key)
.and_then(Value::as_str)
.map(str::trim)
.filter(|s| !s.is_empty())
}
fn json_number_text(value: &Value, key: &str) -> Option<String> {
value
.get(key)
.and_then(|value| {
value
.as_i64()
.map(|n| n.to_string())
.or_else(|| value.as_u64().map(|n| n.to_string()))
})
.or_else(|| {
value
.get(key)
.and_then(Value::as_str)
.map(str::trim)
.filter(|s| !s.is_empty())
.map(ToString::to_string)
})
}
fn compact_run_tests_result_for_context(raw: &str) -> Option<String> {
let parsed: Value = serde_json::from_str(raw).ok()?;
let success = parsed.get("success")?.as_bool()?;
let exit_code = json_number_text(&parsed, "exit_code").unwrap_or_else(|| "?".to_string());
let command = json_text(&parsed, "command").unwrap_or("(unknown command)");
let stdout = json_text(&parsed, "stdout");
let stderr = json_text(&parsed, "stderr");
let stream_limit = if success { 500 } else { 1_000 };
let mut lines = vec![
"[run_tests result summarized for context]".to_string(),
format!(
"status: {}, exit_code: {exit_code}",
if success { "passed" } else { "failed" }
),
format!("command: {}", summarize_text(command, 300)),
];
if let Some(stderr) = stderr {
lines.push(format!(
"stderr: {}",
summarize_text_head_tail(stderr, stream_limit)
));
}
if let Some(stdout) = stdout {
lines.push(format!(
"stdout: {}",
summarize_text_head_tail(stdout, stream_limit)
));
}
Some(lines.join("\n"))
}
fn run_verifier_status_rank(status: Option<&str>) -> u8 {
match status.unwrap_or_default() {
"failed" | "timeout" => 0,
"skipped" => 1,
"passed" => 2,
_ => 3,
}
}
fn compact_run_verifiers_result_for_context(raw: &str) -> Option<String> {
let parsed: Value = serde_json::from_str(raw).ok()?;
let gates = parsed.get("gates")?.as_array()?;
let summary = json_text(&parsed, "summary")
.map(ToString::to_string)
.unwrap_or_else(|| {
let passed = json_number_text(&parsed, "passed").unwrap_or_else(|| "?".to_string());
let failed = json_number_text(&parsed, "failed").unwrap_or_else(|| "?".to_string());
let skipped = json_number_text(&parsed, "skipped").unwrap_or_else(|| "?".to_string());
format!("{passed} passed, {failed} failed, {skipped} skipped")
});
let mut ordered: Vec<&Value> = gates.iter().collect();
ordered.sort_by(|a, b| {
run_verifier_status_rank(json_text(a, "status"))
.cmp(&run_verifier_status_rank(json_text(b, "status")))
.then_with(|| json_text(a, "name").cmp(&json_text(b, "name")))
});
let mut lines = vec![
"[run_verifiers result summarized for context]".to_string(),
format!("summary: {summary}"),
];
let profile = json_text(&parsed, "profile");
let level = json_text(&parsed, "level");
if profile.is_some() || level.is_some() {
lines.push(format!(
"selection: profile={}, level={}",
profile.unwrap_or("?"),
level.unwrap_or("?")
));
}
for (idx, gate) in ordered.iter().enumerate() {
if idx >= 12 {
lines.push(format!(
"- ... {} more gate(s) omitted from context summary",
ordered.len().saturating_sub(idx)
));
break;
}
let name = json_text(gate, "name").unwrap_or("gate");
let ecosystem = json_text(gate, "ecosystem").unwrap_or("unknown");
let status = json_text(gate, "status").unwrap_or("unknown");
let exit = json_number_text(gate, "exit_code")
.map(|code| format!(" exit={code}"))
.unwrap_or_default();
lines.push(format!("- {name} ({ecosystem}): {status}{exit}"));
if status != "passed" {
if let Some(command) = json_text(gate, "command") {
lines.push(format!(" command: {}", summarize_text(command, 240)));
}
if let Some(detail) = json_text(gate, "skipped_reason")
.or_else(|| json_text(gate, "stderr"))
.or_else(|| json_text(gate, "stdout"))
{
lines.push(format!(
" detail: {}",
summarize_text_head_tail(detail, 600)
));
}
}
}
Some(lines.join("\n"))
}
fn compact_task_gate_run_result_for_context(raw: &str) -> Option<String> {
let parsed: Value = serde_json::from_str(raw).ok()?;
let gate = parsed.get("gate")?;
let gate_name = json_text(gate, "gate").unwrap_or("gate");
let status = json_text(gate, "status").unwrap_or("unknown");
let command = json_text(gate, "command").unwrap_or("(unknown command)");
let summary = json_text(gate, "summary")
.or_else(|| json_text(&parsed, "stderr_summary"))
.or_else(|| json_text(&parsed, "stdout_summary"));
let exit = json_number_text(gate, "exit_code")
.map(|code| format!(", exit_code: {code}"))
.unwrap_or_default();
let mut lines = vec![
"[task_gate_run result summarized for context]".to_string(),
format!("gate: {gate_name}, status: {status}{exit}"),
format!("command: {}", summarize_text(command, 300)),
];
if let Some(summary) = summary {
lines.push(format!("summary: {}", summarize_text(summary, 800)));
}
if let Some(log_path) = json_text(gate, "log_path") {
lines.push(format!("log_path: {log_path}"));
}
Some(lines.join("\n"))
}
fn compact_structured_tool_result_for_context(tool_name: &str, raw: &str) -> Option<String> {
match tool_name {
"run_tests" => compact_run_tests_result_for_context(raw),
"run_verifiers" => compact_run_verifiers_result_for_context(raw),
"task_gate_run" => compact_task_gate_run_result_for_context(raw),
_ => None,
}
}
fn tool_result_context_limits_for_model(model: &str) -> ToolResultContextLimits {
let is_large_context =
context_window_for_model(model).is_some_and(|window| window >= LARGE_CONTEXT_WINDOW_TOKENS);
if is_large_context {
ToolResultContextLimits {
hard_limit_chars: LARGE_CONTEXT_TOOL_RESULT_HARD_LIMIT_CHARS,
noisy_soft_limit_chars: LARGE_CONTEXT_TOOL_RESULT_SOFT_LIMIT_CHARS,
snippet_chars: LARGE_CONTEXT_TOOL_RESULT_SNIPPET_CHARS,
}
} else {
ToolResultContextLimits {
hard_limit_chars: TOOL_RESULT_CONTEXT_HARD_LIMIT_CHARS,
noisy_soft_limit_chars: TOOL_RESULT_CONTEXT_SOFT_LIMIT_CHARS,
snippet_chars: TOOL_RESULT_CONTEXT_SNIPPET_CHARS,
}
}
}
pub(crate) fn compact_tool_result_for_context(
model: &str,
tool_name: &str,
output: &ToolResult,
) -> String {
let raw = output.content.trim();
if raw.is_empty() {
return String::new();
}
if let Some(summary) = compact_subagent_tool_result_for_context(tool_name, raw) {
return summary;
}
if let Some(summary) = compact_structured_tool_result_for_context(tool_name, raw) {
return summary;
}
let limits = tool_result_context_limits_for_model(model);
let raw_chars = raw.chars().count();
let should_compact = raw_chars > limits.hard_limit_chars
|| (tool_result_is_noisy(tool_name) && raw_chars > limits.noisy_soft_limit_chars);
if !should_compact {
return raw.to_string();
}
let snippet = summarize_text_head_tail(raw, limits.snippet_chars);
let omitted = raw_chars.saturating_sub(snippet.chars().count());
let summary = tool_result_metadata_summary(output.metadata.as_ref());
if let Some(summary) = summary {
format!(
"[{tool_name} output compacted to protect context]\nSummary: {summary}\nSnippet: {snippet}\n(Original: {raw_chars} chars, omitted: {omitted} chars.)"
)
} else {
format!(
"[{tool_name} output compacted to protect context]\nSnippet: {snippet}\n(Original: {raw_chars} chars, omitted: {omitted} chars.)"
)
}
}
pub(super) fn extract_compaction_summary_prompt(
prompt: Option<SystemPrompt>,
) -> Option<SystemPrompt> {
match prompt {
Some(SystemPrompt::Blocks(blocks)) => {
let summary_blocks: Vec<_> = blocks
.into_iter()
.filter(|block| block.text.contains(COMPACTION_SUMMARY_MARKER))
.collect();
if summary_blocks.is_empty() {
None
} else {
Some(SystemPrompt::Blocks(summary_blocks))
}
}
Some(SystemPrompt::Text(text)) => {
if text.contains(COMPACTION_SUMMARY_MARKER) {
Some(SystemPrompt::Text(text))
} else {
None
}
}
None => None,
}
}
#[allow(dead_code)] // exposed for future engine-side callers; current call path goes through compaction::estimate_input_tokens_conservative via token_estimate_cache.
fn estimate_text_tokens_conservative(text: &str) -> usize {
text.chars().count().div_ceil(3)
}
#[allow(dead_code)] // see estimate_text_tokens_conservative above
fn estimate_system_tokens_conservative(system: Option<&SystemPrompt>) -> usize {
match system {
Some(SystemPrompt::Text(text)) => estimate_text_tokens_conservative(text),
Some(SystemPrompt::Blocks(blocks)) => blocks
.iter()
.map(|block| estimate_text_tokens_conservative(&block.text))
.sum(),
None => 0,
}
}
#[allow(dead_code)] // see estimate_text_tokens_conservative above
pub(super) fn estimate_input_tokens_conservative(
messages: &[Message],
system: Option<&SystemPrompt>,
) -> usize {
let message_tokens = estimate_tokens(messages).saturating_mul(3).div_ceil(2);
let system_tokens = estimate_system_tokens_conservative(system);
let framing_overhead = messages.len().saturating_mul(12).saturating_add(48);
message_tokens
.saturating_add(system_tokens)
.saturating_add(framing_overhead)
}
/// Context windows at or above this size reserve the full
/// [`TURN_MAX_OUTPUT_TOKENS`] (262K) when computing the internal input budget,
/// leaving room for V4-class interleaved thinking. Below it, the reservation
/// falls back to [`effective_max_output_tokens`] so a smaller self-hosted
/// window does not underflow to a negative budget.
const INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD: u32 = 500_000;
/// Internal input-side token budget for a model: `window - reserved_output -
/// headroom`. Used by the preflight check, emergency recovery, and capacity
/// trimming to decide when to compact.
///
/// The reserved-output term is window-dependent:
/// * `window >= 500K` (V4-class large-context) -> [`TURN_MAX_OUTPUT_TOKENS`]
/// (262K). Preserves the "leave room for interleaved thinking" contract.
/// * `window < 500K` (smaller / self-hosted, e.g. a 256K vLLM Qwen window)
/// -> [`effective_max_output_tokens`], i.e. what the API actually caps
/// output at. Reserving the full 262K here would compute
/// `256K - 262K - 1K`, which underflows `checked_sub` to `None` and
/// *silently disables every preflight and emergency recovery path* — the
/// session then runs until the provider hard-rejects on context length.
pub(super) fn context_input_budget(model: &str) -> Option<usize> {
let window_tokens = context_window_for_model(model)?;
let window = usize::try_from(window_tokens).ok()?;
let reserved_output = if window_tokens >= INTERNAL_BUDGET_LARGE_WINDOW_THRESHOLD {
TURN_MAX_OUTPUT_TOKENS
} else {
effective_max_output_tokens(model)
};
let output = usize::try_from(reserved_output).ok()?;
window
.checked_sub(output)
.and_then(|v| v.checked_sub(CONTEXT_HEADROOM_TOKENS))
}
pub(super) fn is_context_length_error_message(message: &str) -> bool {
crate::error_taxonomy::classify_error_message(message) == ErrorCategory::InvalidInput
}