feat(client): harvest deterministic response cache

Harvested from PR #2501 by @HUQIANTAO. Cache only explicitly deterministic, non-streaming, tool-free requests; key entries by provider, route, account fingerprint, and final wire body; and zero usage on hits so that local spend counters are not double-counted. Co-authored-by: HUQIANTAO <58421104+HUQIANTAO@users.noreply.github.com>
2026-06-05 10:18:12 -07:00
parent f0827627a6
commit 7fc074cc36
7 changed files with 276 additions and 1 deletions
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  to Hugging Face's settings-generated MCP configuration and intentionally does
  not include Hub search, direct Hugging Face HTTP requests, or upload behavior
  (#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
+- Added an in-process response cache for deterministic non-streaming,
+  tool-free chat requests. The cache is keyed by provider, base URL, path
+  suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
+  local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
+  the response-cache proposal and canonical-body key update.
 - Added `/sidebar` so users can toggle, show, hide, and optionally persist the
  TUI sidebar from the command line instead of relying on copy-hostile sidebar
  state during long transcript work (#2766, #2788). Thanks @mo-vic for the
@@ -981,6 +981,7 @@ dependencies = [
 "ignore",
 "image",
 "libc",
+ "lru",
 "multimap",
 "objc2",
 "objc2-foundation",
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  to Hugging Face's settings-generated MCP configuration and intentionally does
  not include Hub search, direct Hugging Face HTTP requests, or upload behavior
  (#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
+- Added an in-process response cache for deterministic non-streaming,
+  tool-free chat requests. The cache is keyed by provider, base URL, path
+  suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
+  local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
+  the response-cache proposal and canonical-body key update.
 - Added `/sidebar` so users can toggle, show, hide, and optionally persist the
  TUI sidebar from the command line instead of relying on copy-hostile sidebar
  state during long transcript work (#2766, #2788). Thanks @mo-vic for the
@@ -74,6 +74,7 @@ tiny_http = "0.12"
 zeroize = "1.8.2"
 ignore = "0.4"
 image = { version = "0.25", default-features = false, features = ["png"] }
+lru = "0.16"
 parking_lot = "0.12"
 pdf-extract = "0.7"
 tar = "0.4"
@@ -75,6 +75,7 @@ impl DeepSeekClient {
        &self,
        request: &MessageRequest,
    ) -> Result<MessageResponse> {
+        let cacheable = crate::llm_response_cache::request_is_cacheable(request);
        let messages = build_chat_messages_for_request_and_provider(request, self.api_provider);
        let model = wire_model_for_provider(self.api_provider, &request.model);
        let mut body = json!({
@@ -121,6 +122,24 @@ impl DeepSeekClient {
            self.api_provider,
        );

+        let response_cache_key = if cacheable {
+            let wire_body =
+                serde_json::to_vec(&body).context("Failed to serialize Chat API cache key")?;
+            let key = crate::llm_response_cache::ResponseCache::make_key(
+                self.api_provider.as_str(),
+                &self.base_url,
+                self.path_suffix.as_deref(),
+                &self.api_key,
+                &wire_body,
+            );
+            if let Some(cached) = crate::llm_response_cache::response_cache().get(&key) {
+                return Ok(cached);
+            }
+            Some(key)
+        } else {
+            None
+        };
+
        let url = api_url_with_suffix(
            &self.base_url,
            "chat/completions",
@@ -158,7 +177,11 @@ impl DeepSeekClient {
        let response_text = response.text().await.unwrap_or_default();
        let value: Value =
            serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
-        parse_chat_message(&value)
+        let parsed = parse_chat_message(&value)?;
+        if let Some(key) = response_cache_key {
+            crate::llm_response_cache::response_cache().put(key, parsed.clone());
+        }
+        Ok(parsed)
    }
 }

@@ -0,0 +1,239 @@
+//! Small in-process cache for deterministic non-streaming chat responses.
+
+use std::num::NonZeroUsize;
+use std::sync::{Mutex, OnceLock};
+
+use lru::LruCache;
+use sha2::{Digest, Sha256};
+
+use crate::models::{MessageRequest, MessageResponse, Usage};
+
+const DEFAULT_CAPACITY: usize = 256;
+
+/// Returns the process-wide response cache, creating it on first use.
+pub(crate) fn response_cache() -> &'static ResponseCache {
+    // Function-local static: this accessor is the only path to the shared instance.
+    static RESPONSE_CACHE: OnceLock<ResponseCache> = OnceLock::new();
+    RESPONSE_CACHE.get_or_init(ResponseCache::new)
+}
+
+/// Reports whether `request` may be served from, and stored in, the response cache.
+///
+/// A request qualifies only when all of the following hold:
+/// - `stream` is not `Some(true)`;
+/// - `tools` is absent or empty, and `tool_choice` is absent;
+/// - `temperature` is explicitly `Some(0.0)` (an unset temperature does not qualify);
+/// - `top_p` is absent or exactly `1.0`.
+pub(crate) fn request_is_cacheable(request: &MessageRequest) -> bool {
+    let streaming = matches!(request.stream, Some(true));
+    let has_tools = request
+        .tools
+        .as_ref()
+        .is_some_and(|tools| !tools.is_empty());
+    let forces_tool_choice = request.tool_choice.is_some();
+    let zero_temperature = matches!(request.temperature, Some(value) if value == 0.0);
+    let unrestricted_top_p = match request.top_p {
+        Some(value) => value == 1.0,
+        None => true,
+    };
+
+    !streaming && !has_tools && !forces_tool_choice && zero_temperature && unrestricted_top_p
+}
+
+/// LRU cache of chat responses, keyed by a 32-byte digest.
+///
+/// The map is wrapped in a `Mutex` so entries can be read and written through `&self`.
+pub(crate) struct ResponseCache {
+    // Keys are the digests returned by `make_key`; values are stored exactly as passed to `put`.
+    inner: Mutex<LruCache<[u8; 32], MessageResponse>>,
+}
+
+impl ResponseCache {
+    /// Creates a cache holding at most `DEFAULT_CAPACITY` responses.
+    fn new() -> Self {
+        let capacity = NonZeroUsize::new(DEFAULT_CAPACITY).expect("non-zero capacity");
+        Self::with_capacity(capacity)
+    }
+
+    /// Creates an empty cache holding at most `capacity` responses.
+    fn with_capacity(capacity: NonZeroUsize) -> Self {
+        let entries = LruCache::new(capacity);
+        Self {
+            inner: Mutex::new(entries),
+        }
+    }
+
+    /// Derives the 32-byte cache key for a request.
+    ///
+    /// The key is the SHA-256 of five length-prefixed fields, in this order: `provider`,
+    /// `base_url`, `path_suffix` (an absent suffix hashes like an empty one), the SHA-256 digest
+    /// of `api_key`, and `wire_body`.
+    pub(crate) fn make_key(
+        provider: &str,
+        base_url: &str,
+        path_suffix: Option<&str>,
+        api_key: &str,
+        wire_body: &[u8],
+    ) -> [u8; 32] {
+        // Only a digest of the API key enters the key material, never the raw string.
+        let api_key_digest = Sha256::digest(api_key.as_bytes());
+        let account_fingerprint: &[u8] = &api_key_digest;
+        let route_suffix = path_suffix.unwrap_or("");
+
+        let fields: [&[u8]; 5] = [
+            provider.as_bytes(),
+            base_url.as_bytes(),
+            route_suffix.as_bytes(),
+            account_fingerprint,
+            wire_body,
+        ];
+
+        let mut hasher = Sha256::new();
+        for field in fields {
+            update_field(&mut hasher, field);
+        }
+        hasher.finalize().into()
+    }
+
+    /// Looks up `key` and returns a copy of the stored response with its usage reset.
+    ///
+    /// Returns `None` when the key is absent or the lock cannot be acquired. The stored entry
+    /// keeps its original usage; only the returned clone is changed to `Usage::default()`.
+    pub(crate) fn get(&self, key: &[u8; 32]) -> Option<MessageResponse> {
+        let Ok(mut entries) = self.inner.lock() else {
+            return None;
+        };
+        let mut hit = entries.get(key)?.clone();
+        hit.usage = Usage::default();
+        Some(hit)
+    }
+
+    /// Stores `value` under `key`; does nothing when the lock cannot be acquired.
+    pub(crate) fn put(&self, key: [u8; 32], value: MessageResponse) {
+        let Ok(mut entries) = self.inner.lock() else {
+            return;
+        };
+        entries.put(key, value);
+    }
+}
+
+/// Feeds one length-prefixed field into `hasher`.
+///
+/// The byte length is written first as a little-endian `u64`, followed by the bytes themselves,
+/// so the boundary between consecutive fields is part of the hashed input.
+fn update_field(hasher: &mut Sha256, bytes: &[u8]) {
+    let length_prefix = u64::to_le_bytes(bytes.len() as u64);
+    hasher.update(length_prefix);
+    hasher.update(bytes);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Base URL used by the key-separation baseline.
+    const API_URL: &str = "https://api.example.com/v1";
+    /// Wire body used by the key-separation baseline.
+    const BASE_BODY: &[u8] = br#"{"model":"m","messages":[]}"#;
+    /// Wire body that differs from `BASE_BODY` only by an extra field.
+    const EFFORT_BODY: &[u8] = br#"{"model":"m","messages":[],"reasoning_effort":"high"}"#;
+
+    /// Builds a response with non-zero usage so that zeroing on a hit is observable.
+    fn response_with_usage(id: &str) -> MessageResponse {
+        let usage = Usage {
+            input_tokens: 42,
+            output_tokens: 7,
+            prompt_cache_hit_tokens: Some(3),
+            prompt_cache_miss_tokens: Some(39),
+            reasoning_tokens: Some(5),
+            reasoning_replay_tokens: Some(2),
+            server_tool_use: None,
+        };
+
+        MessageResponse {
+            id: String::from(id),
+            r#type: String::from("message"),
+            role: String::from("assistant"),
+            content: Vec::new(),
+            model: String::from("test-model"),
+            stop_reason: Some(String::from("end_turn")),
+            stop_sequence: None,
+            container: None,
+            usage,
+        }
+    }
+
+    /// Builds the minimal request that `request_is_cacheable` accepts.
+    fn request() -> MessageRequest {
+        MessageRequest {
+            model: String::from("test-model"),
+            messages: Vec::new(),
+            max_tokens: 16,
+            system: None,
+            tools: None,
+            tool_choice: None,
+            metadata: None,
+            thinking: None,
+            reasoning_effort: None,
+            stream: None,
+            temperature: Some(0.0),
+            top_p: None,
+        }
+    }
+
+    #[test]
+    fn cache_key_separates_provider_route_account_and_wire_body() {
+        let baseline = ResponseCache::make_key("deepseek", API_URL, None, "key-a", BASE_BODY);
+
+        // Each row changes exactly one key component relative to the baseline.
+        let variants: [(&str, &str, Option<&str>, &str, &[u8]); 5] = [
+            ("openai", API_URL, None, "key-a", BASE_BODY),
+            (
+                "deepseek",
+                "https://proxy.example.com/v1",
+                None,
+                "key-a",
+                BASE_BODY,
+            ),
+            ("deepseek", API_URL, Some("responses"), "key-a", BASE_BODY),
+            ("deepseek", API_URL, None, "key-b", BASE_BODY),
+            ("deepseek", API_URL, None, "key-a", EFFORT_BODY),
+        ];
+
+        for (provider, base_url, path_suffix, api_key, wire_body) in variants {
+            assert_ne!(
+                baseline,
+                ResponseCache::make_key(provider, base_url, path_suffix, api_key, wire_body)
+            );
+        }
+    }
+
+    #[test]
+    fn cache_hit_zeroes_usage_to_avoid_fake_spend() {
+        let capacity = NonZeroUsize::new(2).unwrap();
+        let cache = ResponseCache::with_capacity(capacity);
+        let key =
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"{}");
+
+        cache.put(key, response_with_usage("cached"));
+
+        let hit = cache.get(&key).expect("cache hit");
+        assert_eq!(hit.id, "cached");
+        assert_eq!(hit.usage, Usage::default());
+    }
+
+    #[test]
+    fn capacity_evicts_oldest_entry() {
+        let capacity = NonZeroUsize::new(2).unwrap();
+        let cache = ResponseCache::with_capacity(capacity);
+        let key_for = |body: &[u8]| {
+            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", body)
+        };
+        let key1 = key_for(b"one");
+        let key2 = key_for(b"two");
+        let key3 = key_for(b"three");
+
+        // Three inserts into a two-slot cache: the first insert must be evicted.
+        for (key, id) in [(key1, "one"), (key2, "two"), (key3, "three")] {
+            cache.put(key, response_with_usage(id));
+        }
+
+        assert!(cache.get(&key1).is_none());
+        assert!(cache.get(&key2).is_some());
+        assert!(cache.get(&key3).is_some());
+    }
+
+    #[test]
+    fn cacheability_requires_deterministic_tool_free_non_streaming_request() {
+        assert!(request_is_cacheable(&request()));
+
+        // Each request departs from the cacheable baseline in exactly one field.
+        let rejected = [
+            MessageRequest {
+                temperature: None,
+                ..request()
+            },
+            MessageRequest {
+                temperature: Some(0.2),
+                ..request()
+            },
+            MessageRequest {
+                stream: Some(true),
+                ..request()
+            },
+            MessageRequest {
+                top_p: Some(0.5),
+                ..request()
+            },
+            MessageRequest {
+                tool_choice: Some(serde_json::json!("auto")),
+                ..request()
+            },
+        ];
+
+        for req in &rejected {
+            assert!(!request_is_cacheable(req));
+        }
+    }
+}
@@ -39,6 +39,7 @@ mod features;
 mod handoff;
 mod hooks;
 mod llm_client;
+mod llm_response_cache;
 mod localization;
 mod logging;
 mod lsp;