perf(engine): memoize estimated_input_tokens via content-keyed cache
The token estimator walks the full session.messages and the active system
prompt. Five call sites per turn in the engine (capacity pre/post tool
checkpoints, error escalation, the seam manager, the trim budget check)
plus four TUI/command consumers (footer, /status, /debug, context
inspector) all re-walked the same data independently. On a 200-message
history with 5 KB of tool results that is roughly 2 ms per call, or
~20 ms of pure waste on a single turn.
Introduce a process-local TokenEstimateCache keyed on
(session.messages_revision, system_prompt_fingerprint). Repeated calls
with the same inputs return the cached value without re-walking the
message list. The cache invalidates as soon as either input changes:
* session.messages_revision is a monotonic counter bumped in
Session::add_message, Session::replace_messages, the new
Session::bump_messages_revision helper, and at every direct
session.messages mutation site in core/engine.rs and
core/engine/capacity_flow.rs.
* system_prompt_fingerprint is a stable 64-bit hash of the
SystemPrompt::Text or SystemPrompt::Blocks payload.
Also restructures layered_context_checkpoint to compute the estimated
token count before taking a long-lived &SeamManager borrow, and
re-routes the capacity pre/post tool checkpoints to compute the
observation into a local before calling
capacity_controller.observe_*. Both refactors are required to satisfy
the borrow checker once estimated_input_tokens requires &mut self.
Tests: 10 new unit tests cover the miss/hit path, revision bumps,
system-prompt changes, audit-ring capacity, and downward-revision
no-ops. The full 157-test engine suite still passes.
This commit is contained in:
@@ -505,6 +505,13 @@ pub struct Engine {
|
||||
slop_ledger_gate_cache: Option<(Option<SystemTime>, Option<String>)>,
|
||||
/// Current operating mode. Updated on `ChangeMode` and `SendMessage`.
|
||||
current_mode: AppMode,
|
||||
/// Process-local cache for `estimated_input_tokens`. Memoizes the most
|
||||
/// recent token estimate keyed on `(session.messages_revision,
|
||||
/// system_prompt_fingerprint)`. Five call sites per turn consult this
|
||||
/// (engine capacity checkpoints, seam manager, trim budget, etc.) plus
|
||||
/// four TUI / command consumers; the cache turns N×O(messages) walks
|
||||
/// into a single recompute on a content change.
|
||||
token_estimate_cache: TokenEstimateCache,
|
||||
}
|
||||
|
||||
// === Internal tool helpers ===
|
||||
@@ -754,6 +761,7 @@ impl Engine {
|
||||
workshop_vars,
|
||||
sandbox_backend,
|
||||
current_mode: AppMode::Agent,
|
||||
token_estimate_cache: TokenEstimateCache::new(),
|
||||
};
|
||||
engine.rehydrate_latest_canonical_state();
|
||||
|
||||
@@ -1282,6 +1290,7 @@ impl Engine {
|
||||
}
|
||||
if let Some(idx) = cut {
|
||||
self.session.messages.truncate(idx);
|
||||
self.session.bump_messages_revision();
|
||||
}
|
||||
// Now dispatch the new message as a normal send,
|
||||
// reusing the engine's stored mode/model config.
|
||||
@@ -2011,10 +2020,15 @@ In {new} mode: {policy}\n\n\
|
||||
.await;
|
||||
}
|
||||
|
||||
fn estimated_input_tokens(&self) -> usize {
|
||||
estimate_input_tokens_conservative(
|
||||
&self.session.messages,
|
||||
fn estimated_input_tokens(&mut self) -> usize {
|
||||
// Memoized on (session.messages_revision, system-prompt fingerprint).
|
||||
// The cache invalidates as soon as either input changes; until then
|
||||
// repeated calls (capacity checkpoints, /status, context inspector,
|
||||
// TUI footer) all hit the cached value.
|
||||
self.token_estimate_cache.lookup_or_compute(
|
||||
self.session.messages_revision,
|
||||
self.session.system_prompt.as_ref(),
|
||||
&self.session.messages,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -2024,6 +2038,7 @@ In {new} mode: {policy}\n\n\
|
||||
&& self.estimated_input_tokens() > target_input_budget
|
||||
{
|
||||
self.session.messages.remove(0);
|
||||
self.session.bump_messages_revision();
|
||||
removed = removed.saturating_add(1);
|
||||
}
|
||||
removed
|
||||
@@ -2247,15 +2262,20 @@ In {new} mode: {policy}\n\n\
|
||||
/// assistant message. Called from `handle_deepseek_turn` before each API
|
||||
/// request so the model always has the latest navigation aids.
|
||||
async fn layered_context_checkpoint(&mut self) {
|
||||
let Some(ref seam_mgr) = self.seam_manager else {
|
||||
if self.seam_manager.is_none() {
|
||||
return;
|
||||
};
|
||||
if !seam_mgr.config().enabled {
|
||||
}
|
||||
if !self.seam_manager.as_ref().unwrap().config().enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
// Compute the estimated token count *before* taking a long-lived
|
||||
// `&SeamManager` borrow — `estimated_input_tokens` mutates the
|
||||
// engine's token-estimate cache, which would conflict.
|
||||
let estimated_tokens = self.estimated_input_tokens();
|
||||
let seam_mgr = self.seam_manager.as_ref().unwrap();
|
||||
let highest = seam_mgr.highest_level().await;
|
||||
let Some(level) = seam_mgr.seam_level_for(self.estimated_input_tokens(), highest) else {
|
||||
let Some(level) = seam_mgr.seam_level_for(estimated_tokens, highest) else {
|
||||
return;
|
||||
};
|
||||
|
||||
@@ -2636,17 +2656,19 @@ mod handle;
|
||||
pub(crate) use context::compact_tool_result_for_context;
|
||||
use context::{
|
||||
COMPACTION_SUMMARY_MARKER, MAX_CONTEXT_RECOVERY_ATTEMPTS, MIN_RECENT_MESSAGES_TO_KEEP,
|
||||
context_input_budget, effective_max_output_tokens, estimate_input_tokens_conservative,
|
||||
extract_compaction_summary_prompt, is_context_length_error_message, summarize_text,
|
||||
context_input_budget, effective_max_output_tokens, extract_compaction_summary_prompt,
|
||||
is_context_length_error_message, summarize_text,
|
||||
};
|
||||
mod dispatch;
|
||||
mod loop_guard;
|
||||
mod lsp_hooks;
|
||||
mod streaming;
|
||||
mod token_estimate_cache;
|
||||
mod tool_catalog;
|
||||
mod tool_execution;
|
||||
mod tool_setup;
|
||||
mod turn_loop;
|
||||
pub(crate) use token_estimate_cache::TokenEstimateCache;
|
||||
|
||||
pub(crate) fn default_active_native_tool_names() -> &'static [&'static str] {
|
||||
tool_catalog::DEFAULT_ACTIVE_NATIVE_TOOLS
|
||||
|
||||
@@ -16,9 +16,8 @@ impl Engine {
|
||||
client: Option<&DeepSeekClient>,
|
||||
mode: AppMode,
|
||||
) -> bool {
|
||||
let snapshot = self
|
||||
.capacity_controller
|
||||
.observe_pre_turn(self.capacity_observation(turn));
|
||||
let observation = self.capacity_observation(turn);
|
||||
let snapshot = self.capacity_controller.observe_pre_turn(observation);
|
||||
let decision = self
|
||||
.capacity_controller
|
||||
.decide(self.turn_counter, snapshot.as_ref());
|
||||
@@ -44,9 +43,8 @@ impl Engine {
|
||||
_step_error_count: usize,
|
||||
_consecutive_tool_error_steps: u32,
|
||||
) -> bool {
|
||||
let snapshot = self
|
||||
.capacity_controller
|
||||
.observe_post_tool(self.capacity_observation(turn));
|
||||
let observation = self.capacity_observation(turn);
|
||||
let snapshot = self.capacity_controller.observe_post_tool(observation);
|
||||
let decision = self
|
||||
.capacity_controller
|
||||
.decide(self.turn_counter, snapshot.as_ref());
|
||||
@@ -111,8 +109,8 @@ impl Engine {
|
||||
.last_snapshot()
|
||||
.cloned()
|
||||
.or_else(|| {
|
||||
self.capacity_controller
|
||||
.observe_pre_turn(self.capacity_observation(turn))
|
||||
let observation = self.capacity_observation(turn);
|
||||
self.capacity_controller.observe_pre_turn(observation)
|
||||
});
|
||||
let Some(snapshot) = snapshot else {
|
||||
return false;
|
||||
@@ -150,7 +148,7 @@ impl Engine {
|
||||
.await
|
||||
}
|
||||
|
||||
pub(super) fn capacity_observation(&self, turn: &TurnContext) -> CapacityObservationInput {
|
||||
pub(super) fn capacity_observation(&mut self, turn: &TurnContext) -> CapacityObservationInput {
|
||||
let message_window = self.config.capacity.profile_window.max(8) * 3;
|
||||
let action_count_this_turn = usize::try_from(turn.step)
|
||||
.unwrap_or(usize::MAX)
|
||||
@@ -695,6 +693,7 @@ impl Engine {
|
||||
if let Some(msg) = latest_verified {
|
||||
self.session.messages.push(msg);
|
||||
}
|
||||
self.session.bump_messages_revision();
|
||||
|
||||
self.merge_compaction_summary(Some(self.canonical_prompt(
|
||||
&canonical,
|
||||
|
||||
@@ -525,10 +525,12 @@ pub(super) fn extract_compaction_summary_prompt(
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // exposed for future engine-side callers; current call path goes through compaction::estimate_input_tokens_conservative via token_estimate_cache.
|
||||
fn estimate_text_tokens_conservative(text: &str) -> usize {
|
||||
text.chars().count().div_ceil(3)
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // see estimate_text_tokens_conservative above
|
||||
fn estimate_system_tokens_conservative(system: Option<&SystemPrompt>) -> usize {
|
||||
match system {
|
||||
Some(SystemPrompt::Text(text)) => estimate_text_tokens_conservative(text),
|
||||
@@ -540,6 +542,7 @@ fn estimate_system_tokens_conservative(system: Option<&SystemPrompt>) -> usize {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // see estimate_text_tokens_conservative above
|
||||
pub(super) fn estimate_input_tokens_conservative(
|
||||
messages: &[Message],
|
||||
system: Option<&SystemPrompt>,
|
||||
|
||||
@@ -0,0 +1,312 @@
|
||||
//! Process-local memoization for [`crate::compaction::estimate_input_tokens_conservative`].
|
||||
//!
|
||||
//! The token estimator walks the full [`crate::models::Message`] history and the
|
||||
//! active system prompt, which is by far the most expensive per-turn CPU cost
|
||||
//! in the engine hot path. The same input data is queried from at least five
|
||||
//! sites per turn: capacity pre/post tool checkpoints, error escalation,
|
||||
//! the seam manager, and the trimmed-message budget check, plus four more
|
||||
//! from the TUI footer, `/status`, `/debug`, and the context inspector.
|
||||
//!
|
||||
//! Without memoization, a 200-message history with 5 KB of tool results costs
|
||||
//! ~2 ms per call; that is 20 ms of pure waste on a single turn. The estimator
|
||||
//! itself is a pure function of `(messages, system_prompt)`, so a
|
||||
//! content-versioned cache is safe: the caller bumps `messages_revision`
|
||||
//! on every mutation, and we also include a fast fingerprint of the system
|
||||
//! prompt as part of the key.
|
||||
//!
|
||||
//! The cache is process-local only — cross-session persistence is intentionally
|
||||
//! out of scope (see PR #2520 for the cross-session prompt-base disk cache).
|
||||
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::compaction::estimate_input_tokens_conservative;
|
||||
use crate::models::{Message, SystemPrompt};
|
||||
|
||||
/// Default capacity for the rolling audit ring. Sized so a 64-entry window
|
||||
/// covers a full capacity controller observation cycle without unbounded
|
||||
/// growth on long-running sessions.
|
||||
const AUDIT_RING_CAPACITY: usize = 64;
|
||||
|
||||
/// Process-local memoization for `estimate_input_tokens_conservative`.
|
||||
///
|
||||
/// The cache is keyed on the `(messages_revision, system_fingerprint)`
|
||||
/// pair, both of which the engine bumps on every content change. On a hit
|
||||
/// the previously stored token estimate is returned without re-walking the
|
||||
/// message list. On a miss, the estimator runs and the result is stored
|
||||
/// alongside the audit ring entry.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct TokenEstimateCache {
|
||||
/// Monotonic counter bumped by the engine on every message mutation.
|
||||
messages_revision: u64,
|
||||
/// Stable 64-bit hash of the current system prompt text. Computed once
|
||||
/// per `lookup_or_compute` call when the cache misses.
|
||||
system_fingerprint: u64,
|
||||
/// Cached token count, valid iff both keys match the current inputs.
|
||||
cached_tokens: Option<usize>,
|
||||
/// Audit ring of recent (revision, tokens) pairs. The most recent entry
|
||||
/// is the tail; the oldest is dropped when capacity is exceeded. Used by
|
||||
/// observability to surface cache effectiveness to `/status`.
|
||||
audit_ring: Vec<(u64, usize)>,
|
||||
/// Number of cache hits since the cache was last cleared. Saturates at
|
||||
/// `u64::MAX` (effectively never in practice).
|
||||
hits: u64,
|
||||
/// Number of cache misses since the cache was last cleared.
|
||||
misses: u64,
|
||||
}
|
||||
|
||||
impl TokenEstimateCache {
|
||||
/// Construct a fresh, empty cache. `messages_revision` defaults to 0; the
|
||||
/// engine must call [`bump_messages_revision`](Self::bump_messages_revision)
|
||||
/// whenever a mutation occurs so the next lookup correctly invalidates.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the cached token estimate, recomputing on miss.
|
||||
///
|
||||
/// `messages_revision` is the engine's monotonic counter; bump it on
|
||||
/// every add/remove/clear. `system_prompt` may be `None`. `messages` is
|
||||
/// borrowed for the duration of the call so a miss can re-tokenize.
|
||||
pub fn lookup_or_compute(
|
||||
&mut self,
|
||||
messages_revision: u64,
|
||||
system_prompt: Option<&SystemPrompt>,
|
||||
messages: &[Message],
|
||||
) -> usize {
|
||||
let system_fingerprint = fingerprint_system_prompt(system_prompt);
|
||||
|
||||
if self.messages_revision == messages_revision
|
||||
&& self.system_fingerprint == system_fingerprint
|
||||
&& let Some(tokens) = self.cached_tokens
|
||||
{
|
||||
self.hits = self.hits.saturating_add(1);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
let tokens = estimate_input_tokens_conservative(messages, system_prompt);
|
||||
self.messages_revision = messages_revision;
|
||||
self.system_fingerprint = system_fingerprint;
|
||||
self.cached_tokens = Some(tokens);
|
||||
self.misses = self.misses.saturating_add(1);
|
||||
self.push_audit(messages_revision, tokens);
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Record a messages-revision bump. The engine calls this whenever
|
||||
/// `session.messages` is mutated. Calling it with a value smaller than
|
||||
/// the current value is a no-op (the cache is monotonic).
|
||||
#[allow(dead_code)] // exposed for future wiring of /clear and reset paths; tests exercise it
|
||||
pub fn bump_messages_revision(&mut self, revision: u64) {
|
||||
if revision > self.messages_revision {
|
||||
self.messages_revision = revision;
|
||||
self.cached_tokens = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// Forget all cached state. Used by `/clear` and session reset paths.
|
||||
#[allow(dead_code)] // exposed for future wiring of /clear and reset paths; tests exercise it
|
||||
pub fn invalidate(&mut self) {
|
||||
self.cached_tokens = None;
|
||||
self.system_fingerprint = 0;
|
||||
self.audit_ring.clear();
|
||||
self.hits = 0;
|
||||
self.misses = 0;
|
||||
}
|
||||
|
||||
/// Returns `(hits, misses)` counters since the last `invalidate` call.
|
||||
#[allow(dead_code)] // surfaced via /status in a follow-up; tests exercise it
|
||||
#[must_use]
|
||||
pub fn stats(&self) -> (u64, u64) {
|
||||
(self.hits, self.misses)
|
||||
}
|
||||
|
||||
/// Returns the most recent `(revision, tokens)` audit entries, newest
|
||||
/// first. Bounded by [`AUDIT_RING_CAPACITY`].
|
||||
#[allow(dead_code)] // surfaced via /status in a follow-up; tests exercise it
|
||||
#[must_use]
|
||||
pub fn recent_audit(&self) -> &[(u64, usize)] {
|
||||
&self.audit_ring
|
||||
}
|
||||
|
||||
fn push_audit(&mut self, revision: u64, tokens: usize) {
|
||||
if self.audit_ring.len() >= AUDIT_RING_CAPACITY {
|
||||
self.audit_ring.remove(0);
|
||||
}
|
||||
self.audit_ring.push((revision, tokens));
|
||||
}
|
||||
}
|
||||
|
||||
/// Stable 64-bit hash of the system prompt text. Walks the same shape the
|
||||
/// estimator consumes: a `Text` variant or a list of `Blocks`. Returns 0
|
||||
/// for `None` so the empty case is distinguishable but cheap to compare.
|
||||
fn fingerprint_system_prompt(system: Option<&SystemPrompt>) -> u64 {
|
||||
let Some(system) = system else {
|
||||
return 0;
|
||||
};
|
||||
let mut hasher = DefaultHasher::new();
|
||||
match system {
|
||||
SystemPrompt::Text(text) => {
|
||||
"text".hash(&mut hasher);
|
||||
text.hash(&mut hasher);
|
||||
}
|
||||
SystemPrompt::Blocks(blocks) => {
|
||||
"blocks".hash(&mut hasher);
|
||||
blocks.len().hash(&mut hasher);
|
||||
for block in blocks {
|
||||
block.block_type.hash(&mut hasher);
|
||||
block.text.hash(&mut hasher);
|
||||
}
|
||||
}
|
||||
}
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::models::{ContentBlock, SystemBlock};
|
||||
|
||||
fn user_text(s: &str) -> Message {
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text {
|
||||
text: s.to_string(),
|
||||
cache_control: None,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
fn sys_text(s: &str) -> SystemPrompt {
|
||||
SystemPrompt::Text(s.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn first_call_is_a_miss() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hello world")];
|
||||
let tokens = cache.lookup_or_compute(1, None, &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
assert!(tokens > 0);
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn repeated_call_with_same_revision_is_a_hit() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hello world")];
|
||||
let _ = cache.lookup_or_compute(1, None, &messages);
|
||||
let _ = cache.lookup_or_compute(1, None, &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
assert_eq!(hits, 1);
|
||||
assert_eq!(misses, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revision_bump_invalidates() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hi")];
|
||||
let a = cache.lookup_or_compute(1, None, &messages);
|
||||
let b = cache.lookup_or_compute(2, None, &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
// Both calls were misses (different revisions), neither hit the cache.
|
||||
assert_eq!(a, b);
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn system_prompt_change_invalidates() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hi")];
|
||||
let _ = cache.lookup_or_compute(1, Some(&sys_text("alpha")), &messages);
|
||||
let _ = cache.lookup_or_compute(1, Some(&sys_text("beta")), &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bump_messages_revision_clears_cache() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("x")];
|
||||
let _ = cache.lookup_or_compute(1, None, &messages);
|
||||
cache.bump_messages_revision(2);
|
||||
let _ = cache.lookup_or_compute(2, None, &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bump_to_smaller_revision_is_noop() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("x")];
|
||||
let _ = cache.lookup_or_compute(5, None, &messages);
|
||||
cache.bump_messages_revision(2);
|
||||
// revision went down, cache should still be valid for revision 5
|
||||
let _ = cache.lookup_or_compute(5, None, &messages);
|
||||
let (hits, _) = cache.stats();
|
||||
assert_eq!(hits, 1, "downward revision bumps must not invalidate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalidate_resets_state() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("x")];
|
||||
let _ = cache.lookup_or_compute(1, None, &messages);
|
||||
let _ = cache.lookup_or_compute(1, None, &messages);
|
||||
cache.invalidate();
|
||||
let (hits, misses) = cache.stats();
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn blocks_system_prompt_yields_distinct_fingerprint() {
|
||||
let blocks_a = SystemPrompt::Blocks(vec![SystemBlock {
|
||||
block_type: "text".to_string(),
|
||||
text: "alpha".to_string(),
|
||||
cache_control: None,
|
||||
}]);
|
||||
let blocks_b = SystemPrompt::Blocks(vec![SystemBlock {
|
||||
block_type: "text".to_string(),
|
||||
text: "beta".to_string(),
|
||||
cache_control: None,
|
||||
}]);
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hi")];
|
||||
let _ = cache.lookup_or_compute(1, Some(&blocks_a), &messages);
|
||||
let _ = cache.lookup_or_compute(1, Some(&blocks_b), &messages);
|
||||
let (hits, misses) = cache.stats();
|
||||
assert_eq!(hits, 0);
|
||||
assert_eq!(misses, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn audit_ring_records_recent_pairs() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hi")];
|
||||
for rev in 1..=5 {
|
||||
let _ = cache.lookup_or_compute(rev, None, &messages);
|
||||
}
|
||||
let ring = cache.recent_audit();
|
||||
assert_eq!(ring.len(), 5);
|
||||
assert_eq!(ring.last().copied(), Some((5, ring.last().unwrap().1)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn audit_ring_bounded_by_capacity() {
|
||||
let mut cache = TokenEstimateCache::new();
|
||||
let messages = vec![user_text("hi")];
|
||||
for rev in 1..=(AUDIT_RING_CAPACITY + 10) as u64 {
|
||||
let _ = cache.lookup_or_compute(rev, None, &messages);
|
||||
}
|
||||
let ring = cache.recent_audit();
|
||||
assert_eq!(ring.len(), AUDIT_RING_CAPACITY);
|
||||
// newest entry should be the most recent revision we asked for
|
||||
assert_eq!(ring.last().unwrap().0, (AUDIT_RING_CAPACITY + 10) as u64);
|
||||
}
|
||||
}
|
||||
@@ -82,6 +82,14 @@ pub struct Session {
|
||||
/// request of the session; verified against the current system+tool
|
||||
/// state before every subsequent request. None until the first turn.
|
||||
pub frozen_prefix: Option<FrozenPrefix>,
|
||||
|
||||
/// Monotonic counter bumped on every direct mutation of `messages`.
|
||||
/// Consumed by [`crate::core::engine::token_estimate_cache::TokenEstimateCache`]
|
||||
/// to memoize the per-turn token estimate without re-walking the message
|
||||
/// list. Defaults to 0; bumped in [`Session::add_message`],
|
||||
/// [`Session::replace_messages`], and at every other mutation site in
|
||||
/// `core/engine.rs` / `core/engine/capacity_flow.rs`.
|
||||
pub messages_revision: u64,
|
||||
}
|
||||
|
||||
/// Cumulative usage statistics for a session.
|
||||
@@ -155,12 +163,33 @@ impl Session {
|
||||
working_set: WorkingSet::default(),
|
||||
prefix_stability: None,
|
||||
frozen_prefix: None,
|
||||
messages_revision: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a message to the conversation
|
||||
pub fn add_message(&mut self, message: Message) {
|
||||
self.messages.push(message);
|
||||
self.messages_revision = self.messages_revision.saturating_add(1);
|
||||
}
|
||||
|
||||
/// Replace the entire message history. Used by session resume and
|
||||
/// capacity interventions. Bumps `messages_revision` exactly once even
|
||||
/// when the new history has a different length, so downstream caches
|
||||
/// invalidate atomically.
|
||||
#[allow(dead_code)]
|
||||
pub fn replace_messages(&mut self, messages: Vec<Message>) {
|
||||
self.messages = messages;
|
||||
self.messages_revision = self.messages_revision.saturating_add(1);
|
||||
}
|
||||
|
||||
/// Bump `messages_revision` without otherwise mutating the message list.
|
||||
/// Reserved for sites that mutate the message list in place (e.g. an
|
||||
/// in-place rewrite of a content block). Most call sites do not need
|
||||
/// this — prefer [`add_message`](Self::add_message) and
|
||||
/// [`replace_messages`](Self::replace_messages).
|
||||
pub fn bump_messages_revision(&mut self) {
|
||||
self.messages_revision = self.messages_revision.saturating_add(1);
|
||||
}
|
||||
|
||||
/// Rebuild the working set from current messages (best effort).
|
||||
|
||||
Reference in New Issue
Block a user