feat(client): harvest deterministic response cache

Harvested from PR #2501 by @HUQIANTAO.

Cache only explicitly deterministic, non-streaming, tool-free requests; key entries by provider, route, account fingerprint, and final wire body; and zero usage on hits so local spend counters are not double-counted.

Co-authored-by: HUQIANTAO <58421104+HUQIANTAO@users.noreply.github.com>
This commit is contained in:
Hunter B
2026-06-05 10:18:12 -07:00
parent f0827627a6
commit 7fc074cc36
7 changed files with 276 additions and 1 deletions
+5
View File
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
to Hugging Face's settings-generated MCP configuration and intentionally does
not include Hub search, direct Hugging Face HTTP requests, or upload behavior
(#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
- Added an in-process response cache for deterministic, non-streaming,
tool-free chat requests. The cache is keyed by provider, base URL, path
suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
the response-cache proposal and canonical-body key update.
- Added `/sidebar` so users can toggle, show, hide, and optionally persist the
TUI sidebar from the command line instead of relying on copy-hostile sidebar
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
Generated
+1
View File
@@ -981,6 +981,7 @@ dependencies = [
"ignore",
"image",
"libc",
"lru",
"multimap",
"objc2",
"objc2-foundation",
+5
View File
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
to Hugging Face's settings-generated MCP configuration and intentionally does
not include Hub search, direct Hugging Face HTTP requests, or upload behavior
(#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
- Added an in-process response cache for deterministic, non-streaming,
tool-free chat requests. The cache is keyed by provider, base URL, path
suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
the response-cache proposal and canonical-body key update.
- Added `/sidebar` so users can toggle, show, hide, and optionally persist the
TUI sidebar from the command line instead of relying on copy-hostile sidebar
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
+1
View File
@@ -74,6 +74,7 @@ tiny_http = "0.12"
zeroize = "1.8.2"
ignore = "0.4"
image = { version = "0.25", default-features = false, features = ["png"] }
lru = "0.16"
parking_lot = "0.12"
pdf-extract = "0.7"
tar = "0.4"
+24 -1
View File
@@ -75,6 +75,7 @@ impl DeepSeekClient {
&self,
request: &MessageRequest,
) -> Result<MessageResponse> {
let cacheable = crate::llm_response_cache::request_is_cacheable(request);
let messages = build_chat_messages_for_request_and_provider(request, self.api_provider);
let model = wire_model_for_provider(self.api_provider, &request.model);
let mut body = json!({
@@ -121,6 +122,24 @@ impl DeepSeekClient {
self.api_provider,
);
let response_cache_key = if cacheable {
let wire_body =
serde_json::to_vec(&body).context("Failed to serialize Chat API cache key")?;
let key = crate::llm_response_cache::ResponseCache::make_key(
self.api_provider.as_str(),
&self.base_url,
self.path_suffix.as_deref(),
&self.api_key,
&wire_body,
);
if let Some(cached) = crate::llm_response_cache::response_cache().get(&key) {
return Ok(cached);
}
Some(key)
} else {
None
};
let url = api_url_with_suffix(
&self.base_url,
"chat/completions",
@@ -158,7 +177,11 @@ impl DeepSeekClient {
let response_text = response.text().await.unwrap_or_default();
let value: Value =
serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
parse_chat_message(&value)
let parsed = parse_chat_message(&value)?;
if let Some(key) = response_cache_key {
crate::llm_response_cache::response_cache().put(key, parsed.clone());
}
Ok(parsed)
}
}
+239
View File
@@ -0,0 +1,239 @@
//! Small in-process cache for deterministic non-streaming chat responses.
use std::num::NonZeroUsize;
use std::sync::{Mutex, OnceLock};
use lru::LruCache;
use sha2::{Digest, Sha256};
use crate::models::{MessageRequest, MessageResponse, Usage};
/// Entry limit used by `ResponseCache::new` for the process-wide cache.
const DEFAULT_CAPACITY: usize = 256;
/// Storage slot for the process-wide cache; filled on first use by `response_cache()`.
static RESPONSE_CACHE: OnceLock<ResponseCache> = OnceLock::new();
pub(crate) fn response_cache() -> &'static ResponseCache {
RESPONSE_CACHE.get_or_init(ResponseCache::new)
}
/// Decides whether a request's response may be served from / stored in the cache.
///
/// Returns `true` only when every one of the following holds:
/// - `stream` is not `Some(true)`;
/// - `tools` is absent or an empty list, and `tool_choice` is unset;
/// - `temperature` is explicitly `Some(0.0)` (an unset temperature is rejected);
/// - `top_p` is unset or exactly `1.0`.
pub(crate) fn request_is_cacheable(request: &MessageRequest) -> bool {
    // Streaming requests are never cached.
    if request.stream == Some(true) {
        return false;
    }

    // Any tool involvement (a non-empty tool list or an explicit choice) opts out.
    let declares_tools = request
        .tools
        .as_ref()
        .is_some_and(|tools| !tools.is_empty());
    if declares_tools || request.tool_choice.is_some() {
        return false;
    }

    // Sampling must be pinned: temperature explicitly zero, top_p left at 1.0 or unset.
    let zero_temperature = request.temperature == Some(0.0);
    let unrestricted_top_p = request.top_p.is_none_or(|top_p| top_p == 1.0);
    zero_temperature && unrestricted_top_p
}
/// Capacity-bounded store of chat responses, keyed by 32-byte digests
/// (see `ResponseCache::make_key`).
pub(crate) struct ResponseCache {
// The LRU map sits behind a mutex so `get` and `put` can operate through `&self`.
inner: Mutex<LruCache<[u8; 32], MessageResponse>>,
}
impl ResponseCache {
    /// Builds a cache limited to `DEFAULT_CAPACITY` entries.
    fn new() -> Self {
        let capacity = NonZeroUsize::new(DEFAULT_CAPACITY).expect("non-zero capacity");
        Self::with_capacity(capacity)
    }

    /// Builds an empty cache that holds at most `capacity` responses.
    fn with_capacity(capacity: NonZeroUsize) -> Self {
        let entries = LruCache::new(capacity);
        Self {
            inner: Mutex::new(entries),
        }
    }

    /// Derives the 32-byte cache key for one outgoing request.
    ///
    /// The key is a SHA-256 over five length-prefixed fields, in this order:
    /// `provider`, `base_url`, `path_suffix` (a missing suffix hashes as the
    /// empty string), the SHA-256 digest of `api_key`, and `wire_body`.
    /// Only the digest of the API key is mixed in, never the raw key bytes.
    pub(crate) fn make_key(
        provider: &str,
        base_url: &str,
        path_suffix: Option<&str>,
        api_key: &str,
        wire_body: &[u8],
    ) -> [u8; 32] {
        let api_key_digest = Sha256::digest(api_key.as_bytes());
        // Field order is part of the key format; keep it stable.
        let fields: [&[u8]; 5] = [
            provider.as_bytes(),
            base_url.as_bytes(),
            path_suffix.unwrap_or("").as_bytes(),
            &api_key_digest,
            wire_body,
        ];

        let mut hasher = Sha256::new();
        for field in fields {
            update_field(&mut hasher, field);
        }
        hasher.finalize().into()
    }

    /// Looks up `key` and returns a copy of the stored response.
    ///
    /// The returned copy has `usage` replaced with `Usage::default()`; the
    /// stored entry itself is left untouched. Returns `None` when the key is
    /// absent or when the internal lock is poisoned.
    pub(crate) fn get(&self, key: &[u8; 32]) -> Option<MessageResponse> {
        let Ok(mut entries) = self.inner.lock() else {
            return None;
        };
        let mut hit = entries.get(key)?.clone();
        hit.usage = Usage::default();
        Some(hit)
    }

    /// Stores `value` under `key`. A poisoned internal lock turns this into a no-op.
    pub(crate) fn put(&self, key: [u8; 32], value: MessageResponse) {
        let Ok(mut entries) = self.inner.lock() else {
            return;
        };
        entries.put(key, value);
    }
}
/// Feeds one key field into `hasher`, preceded by its byte length encoded as
/// a little-endian `u64`.
///
/// The length prefix keeps field boundaries unambiguous: without it, the
/// fields `("ab", "c")` and `("a", "bc")` would feed identical bytes.
fn update_field(hasher: &mut Sha256, bytes: &[u8]) {
    let length_prefix = (bytes.len() as u64).to_le_bytes();
    hasher.update(length_prefix);
    hasher.update(bytes);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: a response identified by `id` that carries non-zero usage counters.
    fn response_with_usage(id: &str) -> MessageResponse {
        let usage = Usage {
            input_tokens: 42,
            output_tokens: 7,
            prompt_cache_hit_tokens: Some(3),
            prompt_cache_miss_tokens: Some(39),
            reasoning_tokens: Some(5),
            reasoning_replay_tokens: Some(2),
            server_tool_use: None,
        };
        MessageResponse {
            id: String::from(id),
            r#type: String::from("message"),
            role: String::from("assistant"),
            content: Vec::new(),
            model: String::from("test-model"),
            stop_reason: Some(String::from("end_turn")),
            stop_sequence: None,
            container: None,
            usage,
        }
    }

    /// Fixture: the minimal request that `request_is_cacheable` accepts.
    fn request() -> MessageRequest {
        MessageRequest {
            model: String::from("test-model"),
            messages: Vec::new(),
            max_tokens: 16,
            system: None,
            tools: None,
            tool_choice: None,
            metadata: None,
            thinking: None,
            reasoning_effort: None,
            stream: None,
            temperature: Some(0.0),
            top_p: None,
        }
    }

    /// Key for a fixed provider/route/account where only the wire body varies.
    fn key_for_body(body: &[u8]) -> [u8; 32] {
        ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", body)
    }

    #[test]
    fn cache_key_separates_provider_route_account_and_wire_body() {
        const URL: &str = "https://api.example.com/v1";
        const BODY: &[u8] = br#"{"model":"m","messages":[]}"#;

        let base = ResponseCache::make_key("deepseek", URL, None, "key-a", BODY);

        // Each variant differs from `base` in exactly one key component.
        let variants = [
            // provider
            ResponseCache::make_key("openai", URL, None, "key-a", BODY),
            // base URL
            ResponseCache::make_key(
                "deepseek",
                "https://proxy.example.com/v1",
                None,
                "key-a",
                BODY,
            ),
            // path suffix
            ResponseCache::make_key("deepseek", URL, Some("responses"), "key-a", BODY),
            // API key
            ResponseCache::make_key("deepseek", URL, None, "key-b", BODY),
            // wire body
            ResponseCache::make_key(
                "deepseek",
                URL,
                None,
                "key-a",
                br#"{"model":"m","messages":[],"reasoning_effort":"high"}"#,
            ),
        ];

        for variant in variants {
            assert_ne!(base, variant);
        }
    }

    #[test]
    fn cache_hit_zeroes_usage_to_avoid_fake_spend() {
        let cache = ResponseCache::with_capacity(NonZeroUsize::new(2).unwrap());
        let key = key_for_body(b"{}");

        cache.put(key, response_with_usage("cached"));

        let hit = cache.get(&key).expect("cache hit");
        assert_eq!(hit.id, "cached");
        assert_eq!(hit.usage, Usage::default());
    }

    #[test]
    fn capacity_evicts_oldest_entry() {
        let cache = ResponseCache::with_capacity(NonZeroUsize::new(2).unwrap());
        let labels = ["one", "two", "three"];
        let keys = labels.map(|label| key_for_body(label.as_bytes()));

        // Insert three entries into a two-slot cache, oldest first.
        for (key, label) in keys.iter().zip(labels) {
            cache.put(*key, response_with_usage(label));
        }

        let [oldest, middle, newest] = keys;
        assert!(cache.get(&oldest).is_none());
        assert!(cache.get(&middle).is_some());
        assert!(cache.get(&newest).is_some());
    }

    #[test]
    fn cacheability_requires_deterministic_tool_free_non_streaming_request() {
        assert!(request_is_cacheable(&request()));

        // Each entry breaks exactly one requirement of the baseline request.
        let rejected = [
            MessageRequest {
                temperature: None,
                ..request()
            },
            MessageRequest {
                temperature: Some(0.2),
                ..request()
            },
            MessageRequest {
                stream: Some(true),
                ..request()
            },
            MessageRequest {
                top_p: Some(0.5),
                ..request()
            },
            MessageRequest {
                tool_choice: Some(serde_json::json!("auto")),
                ..request()
            },
        ];

        for req in &rejected {
            assert!(!request_is_cacheable(req));
        }
    }
}
+1
View File
@@ -39,6 +39,7 @@ mod features;
mod handoff;
mod hooks;
mod llm_client;
mod llm_response_cache;
mod localization;
mod logging;
mod lsp;