Add an in-process LRU response cache for deterministic chat requests (#2501).

Non-streaming, tool-free requests with temperature 0.0 (and top_p unset or 1.0)
are looked up in the cache before the request URL is built in
crates/tui/src/client/chat.rs, and the parsed response is stored after a
successful parse. Keys are SHA-256 digests over length-prefixed provider, base
URL, path suffix, hashed API key, and the serialized wire body. Cache hits are
returned with `usage` reset to `Usage::default()`.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. The
wrapping and indentation of context lines, and the generic arguments on
`Result`, `OnceLock`, `Mutex` and `Option`, were reconstructed from usage.
Confirm them against the tree before applying.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94638d2d..ee8ccb36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   to Hugging Face's settings-generated MCP configuration and intentionally does
   not include Hub search, direct Hugging Face HTTP requests, or upload behavior
   (#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
+- Added an in-process response cache for deterministic non-streaming,
+  tool-free chat requests. The cache is keyed by provider, base URL, path
+  suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
+  local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
+  the response-cache proposal and canonical-body key update.
 - Added `/sidebar` so users can toggle, show, hide, and optionally persist the
   TUI sidebar from the command line instead of relying on copy-hostile sidebar
   state during long transcript work (#2766, #2788). Thanks @mo-vic for the
diff --git a/Cargo.lock b/Cargo.lock
index c0f6d0fa..8004b967 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -981,6 +981,7 @@ dependencies = [
  "ignore",
  "image",
  "libc",
+ "lru",
  "multimap",
  "objc2",
  "objc2-foundation",
diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md
index 94638d2d..ee8ccb36 100644
--- a/crates/tui/CHANGELOG.md
+++ b/crates/tui/CHANGELOG.md
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   to Hugging Face's settings-generated MCP configuration and intentionally does
   not include Hub search, direct Hugging Face HTTP requests, or upload behavior
   (#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
+- Added an in-process response cache for deterministic non-streaming,
+  tool-free chat requests. The cache is keyed by provider, base URL, path
+  suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
+  local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
+  the response-cache proposal and canonical-body key update.
 - Added `/sidebar` so users can toggle, show, hide, and optionally persist the
   TUI sidebar from the command line instead of relying on copy-hostile sidebar
   state during long transcript work (#2766, #2788). Thanks @mo-vic for the
diff --git a/crates/tui/Cargo.toml b/crates/tui/Cargo.toml
index 52a05004..9fca03ca 100644
--- a/crates/tui/Cargo.toml
+++ b/crates/tui/Cargo.toml
@@ -74,6 +74,7 @@ tiny_http = "0.12"
 zeroize = "1.8.2"
 ignore = "0.4"
 image = { version = "0.25", default-features = false, features = ["png"] }
+lru = "0.16"
 parking_lot = "0.12"
 pdf-extract = "0.7"
 tar = "0.4"
diff --git a/crates/tui/src/client/chat.rs b/crates/tui/src/client/chat.rs
index 6acc77e6..a338170d 100644
--- a/crates/tui/src/client/chat.rs
+++ b/crates/tui/src/client/chat.rs
@@ -75,6 +75,7 @@ impl DeepSeekClient {
         &self,
         request: &MessageRequest,
     ) -> Result<MessageResponse> {
+        let cacheable = crate::llm_response_cache::request_is_cacheable(request);
         let messages = build_chat_messages_for_request_and_provider(request, self.api_provider);
         let model = wire_model_for_provider(self.api_provider, &request.model);
         let mut body = json!({
@@ -121,6 +122,24 @@ impl DeepSeekClient {
             self.api_provider,
         );
 
+        let response_cache_key = if cacheable {
+            let wire_body =
+                serde_json::to_vec(&body).context("Failed to serialize Chat API cache key")?;
+            let key = crate::llm_response_cache::ResponseCache::make_key(
+                self.api_provider.as_str(),
+                &self.base_url,
+                self.path_suffix.as_deref(),
+                &self.api_key,
+                &wire_body,
+            );
+            if let Some(cached) = crate::llm_response_cache::response_cache().get(&key) {
+                return Ok(cached);
+            }
+            Some(key)
+        } else {
+            None
+        };
+
         let url = api_url_with_suffix(
             &self.base_url,
             "chat/completions",
@@ -158,7 +177,11 @@ impl DeepSeekClient {
         let response_text = response.text().await.unwrap_or_default();
         let value: Value =
             serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
-        parse_chat_message(&value)
+        let parsed = parse_chat_message(&value)?;
+        if let Some(key) = response_cache_key {
+            crate::llm_response_cache::response_cache().put(key, parsed.clone());
+        }
+        Ok(parsed)
     }
 }
 
diff --git a/crates/tui/src/llm_response_cache.rs b/crates/tui/src/llm_response_cache.rs
new file mode 100644
index 00000000..8141f194
--- /dev/null
+++ b/crates/tui/src/llm_response_cache.rs
@@ -0,0 +1,239 @@
+//! Small in-process cache for deterministic non-streaming chat responses.
+
+use std::num::NonZeroUsize;
+use std::sync::{Mutex, OnceLock};
+
+use lru::LruCache;
+use sha2::{Digest, Sha256};
+
+use crate::models::{MessageRequest, MessageResponse, Usage};
+
+const DEFAULT_CAPACITY: usize = 256;
+
+static RESPONSE_CACHE: OnceLock<ResponseCache> = OnceLock::new();
+
+pub(crate) fn response_cache() -> &'static ResponseCache {
+    RESPONSE_CACHE.get_or_init(ResponseCache::new)
+}
+
+pub(crate) fn request_is_cacheable(request: &MessageRequest) -> bool {
+    request.stream != Some(true)
+        && request.tools.as_ref().is_none_or(Vec::is_empty)
+        && request.tool_choice.is_none()
+        && request.temperature == Some(0.0)
+        && request.top_p.is_none_or(|top_p| top_p == 1.0)
+}
+
+pub(crate) struct ResponseCache {
+    inner: Mutex<LruCache<[u8; 32], MessageResponse>>,
+}
+
+impl ResponseCache {
+    fn new() -> Self {
+        Self::with_capacity(NonZeroUsize::new(DEFAULT_CAPACITY).expect("non-zero capacity"))
+    }
+
+    fn with_capacity(capacity: NonZeroUsize) -> Self {
+        Self {
+            inner: Mutex::new(LruCache::new(capacity)),
+        }
+    }
+
+    pub(crate) fn make_key(
+        provider: &str,
+        base_url: &str,
+        path_suffix: Option<&str>,
+        api_key: &str,
+        wire_body: &[u8],
+    ) -> [u8; 32] {
+        let mut hasher = Sha256::new();
+        update_field(&mut hasher, provider.as_bytes());
+        update_field(&mut hasher, base_url.as_bytes());
+        update_field(&mut hasher, path_suffix.unwrap_or("").as_bytes());
+        update_field(&mut hasher, &Sha256::digest(api_key.as_bytes()));
+        update_field(&mut hasher, wire_body);
+        hasher.finalize().into()
+    }
+
+    pub(crate) fn get(&self, key: &[u8; 32]) -> Option<MessageResponse> {
+        let mut cache = self.inner.lock().ok()?;
+        cache.get(key).cloned().map(|mut response| {
+            response.usage = Usage::default();
+            response
+        })
+    }
+
+    pub(crate) fn put(&self, key: [u8; 32], value: MessageResponse) {
+        if let Ok(mut cache) = self.inner.lock() {
+            cache.put(key, value);
+        }
+    }
+}
+
+fn update_field(hasher: &mut Sha256, bytes: &[u8]) {
+    hasher.update((bytes.len() as u64).to_le_bytes());
+    hasher.update(bytes);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn response_with_usage(id: &str) -> MessageResponse {
+        MessageResponse {
+            id: id.to_string(),
+            r#type: "message".to_string(),
+            role: "assistant".to_string(),
+            content: Vec::new(),
+            model: "test-model".to_string(),
+            stop_reason: Some("end_turn".to_string()),
+            stop_sequence: None,
+            container: None,
+            usage: Usage {
+                input_tokens: 42,
+                output_tokens: 7,
+                prompt_cache_hit_tokens: Some(3),
+                prompt_cache_miss_tokens: Some(39),
+                reasoning_tokens: Some(5),
+                reasoning_replay_tokens: Some(2),
+                server_tool_use: None,
+            },
+        }
+    }
+
+    fn request() -> MessageRequest {
+        MessageRequest {
+            model: "test-model".to_string(),
+            messages: Vec::new(),
+            max_tokens: 16,
+            system: None,
+            tools: None,
+            tool_choice: None,
+            metadata: None,
+            thinking: None,
+            reasoning_effort: None,
+            stream: None,
+            temperature: Some(0.0),
+            top_p: None,
+        }
+    }
+
+    #[test]
+    fn cache_key_separates_provider_route_account_and_wire_body() {
+        let base = ResponseCache::make_key(
+            "deepseek",
+            "https://api.example.com/v1",
+            None,
+            "key-a",
+            br#"{"model":"m","messages":[]}"#,
+        );
+
+        assert_ne!(
+            base,
+            ResponseCache::make_key(
+                "openai",
+                "https://api.example.com/v1",
+                None,
+                "key-a",
+                br#"{"model":"m","messages":[]}"#
+            )
+        );
+        assert_ne!(
+            base,
+            ResponseCache::make_key(
+                "deepseek",
+                "https://proxy.example.com/v1",
+                None,
+                "key-a",
+                br#"{"model":"m","messages":[]}"#
+            )
+        );
+        assert_ne!(
+            base,
+            ResponseCache::make_key(
+                "deepseek",
+                "https://api.example.com/v1",
+                Some("responses"),
+                "key-a",
+                br#"{"model":"m","messages":[]}"#
+            )
+        );
+        assert_ne!(
+            base,
+            ResponseCache::make_key(
+                "deepseek",
+                "https://api.example.com/v1",
+                None,
+                "key-b",
+                br#"{"model":"m","messages":[]}"#
+            )
+        );
+        assert_ne!(
+            base,
+            ResponseCache::make_key(
+                "deepseek",
+                "https://api.example.com/v1",
+                None,
+                "key-a",
+                br#"{"model":"m","messages":[],"reasoning_effort":"high"}"#
+            )
+        );
+    }
+
+    #[test]
+    fn cache_hit_zeroes_usage_to_avoid_fake_spend() {
+        let cache = ResponseCache::with_capacity(NonZeroUsize::new(2).unwrap());
+        let key =
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"{}");
+
+        cache.put(key, response_with_usage("cached"));
+
+        let hit = cache.get(&key).expect("cache hit");
+        assert_eq!(hit.id, "cached");
+        assert_eq!(hit.usage, Usage::default());
+    }
+
+    #[test]
+    fn capacity_evicts_oldest_entry() {
+        let cache = ResponseCache::with_capacity(NonZeroUsize::new(2).unwrap());
+        let key1 =
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"one");
+        let key2 =
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"two");
+        let key3 =
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"three");
+
+        cache.put(key1, response_with_usage("one"));
+        cache.put(key2, response_with_usage("two"));
+        cache.put(key3, response_with_usage("three"));
+
+        assert!(cache.get(&key1).is_none());
+        assert!(cache.get(&key2).is_some());
+        assert!(cache.get(&key3).is_some());
+    }
+
+    #[test]
+    fn cacheability_requires_deterministic_tool_free_non_streaming_request() {
+        let mut req = request();
+        assert!(request_is_cacheable(&req));
+
+        req.temperature = None;
+        assert!(!request_is_cacheable(&req));
+
+        req = request();
+        req.temperature = Some(0.2);
+        assert!(!request_is_cacheable(&req));
+
+        req = request();
+        req.stream = Some(true);
+        assert!(!request_is_cacheable(&req));
+
+        req = request();
+        req.top_p = Some(0.5);
+        assert!(!request_is_cacheable(&req));
+
+        req = request();
+        req.tool_choice = Some(serde_json::json!("auto"));
+        assert!(!request_is_cacheable(&req));
+    }
+}
diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs
index d5d2dfae..cd283682 100644
--- a/crates/tui/src/main.rs
+++ b/crates/tui/src/main.rs
@@ -39,6 +39,7 @@ mod features;
 mod handoff;
 mod hooks;
 mod llm_client;
+mod llm_response_cache;
 mod localization;
 mod logging;
 mod lsp;