feat(client): harvest deterministic response cache
Harvested from PR #2501 by @HUQIANTAO. Cache only explicit deterministic non-streaming tool-free requests, key entries by provider, route, account fingerprint, and final wire body, and zero usage on hits so local spend counters are not double-counted. Co-authored-by: HUQIANTAO <58421104+HUQIANTAO@users.noreply.github.com>
This commit is contained in:
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
to Hugging Face's settings-generated MCP configuration and intentionally does
|
||||
not include Hub search, direct Hugging Face HTTP requests, or upload behavior
|
||||
(#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
|
||||
- Added an in-process response cache for deterministic non-streaming,
|
||||
tool-free chat requests. The cache is keyed by provider, base URL, path
|
||||
suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
|
||||
local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
|
||||
the response-cache proposal and canonical-body key update.
|
||||
- Added `/sidebar` so users can toggle, show, hide, and optionally persist the
|
||||
TUI sidebar from the command line instead of relying on copy-hostile sidebar
|
||||
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
|
||||
|
||||
Generated
+1
@@ -981,6 +981,7 @@ dependencies = [
|
||||
"ignore",
|
||||
"image",
|
||||
"libc",
|
||||
"lru",
|
||||
"multimap",
|
||||
"objc2",
|
||||
"objc2-foundation",
|
||||
|
||||
@@ -62,6 +62,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
to Hugging Face's settings-generated MCP configuration and intentionally does
|
||||
not include Hub search, direct Hugging Face HTTP requests, or upload behavior
|
||||
(#2709, #2782). Thanks @idling11 for the original Hugging Face MCP draft.
|
||||
- Added an in-process response cache for deterministic non-streaming,
|
||||
tool-free chat requests. The cache is keyed by provider, base URL, path
|
||||
suffix, API-key fingerprint, and final wire body, and zeroes usage on hits so
|
||||
local spend counters are not double-counted (#2501). Thanks @HUQIANTAO for
|
||||
the response-cache proposal and canonical-body key update.
|
||||
- Added `/sidebar` so users can toggle, show, hide, and optionally persist the
|
||||
TUI sidebar from the command line instead of relying on copy-hostile sidebar
|
||||
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
|
||||
|
||||
@@ -74,6 +74,7 @@ tiny_http = "0.12"
|
||||
zeroize = "1.8.2"
|
||||
ignore = "0.4"
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
lru = "0.16"
|
||||
parking_lot = "0.12"
|
||||
pdf-extract = "0.7"
|
||||
tar = "0.4"
|
||||
|
||||
@@ -75,6 +75,7 @@ impl DeepSeekClient {
|
||||
&self,
|
||||
request: &MessageRequest,
|
||||
) -> Result<MessageResponse> {
|
||||
let cacheable = crate::llm_response_cache::request_is_cacheable(request);
|
||||
let messages = build_chat_messages_for_request_and_provider(request, self.api_provider);
|
||||
let model = wire_model_for_provider(self.api_provider, &request.model);
|
||||
let mut body = json!({
|
||||
@@ -121,6 +122,24 @@ impl DeepSeekClient {
|
||||
self.api_provider,
|
||||
);
|
||||
|
||||
let response_cache_key = if cacheable {
|
||||
let wire_body =
|
||||
serde_json::to_vec(&body).context("Failed to serialize Chat API cache key")?;
|
||||
let key = crate::llm_response_cache::ResponseCache::make_key(
|
||||
self.api_provider.as_str(),
|
||||
&self.base_url,
|
||||
self.path_suffix.as_deref(),
|
||||
&self.api_key,
|
||||
&wire_body,
|
||||
);
|
||||
if let Some(cached) = crate::llm_response_cache::response_cache().get(&key) {
|
||||
return Ok(cached);
|
||||
}
|
||||
Some(key)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let url = api_url_with_suffix(
|
||||
&self.base_url,
|
||||
"chat/completions",
|
||||
@@ -158,7 +177,11 @@ impl DeepSeekClient {
|
||||
let response_text = response.text().await.unwrap_or_default();
|
||||
let value: Value =
|
||||
serde_json::from_str(&response_text).context("Failed to parse Chat API JSON")?;
|
||||
parse_chat_message(&value)
|
||||
let parsed = parse_chat_message(&value)?;
|
||||
if let Some(key) = response_cache_key {
|
||||
crate::llm_response_cache::response_cache().put(key, parsed.clone());
|
||||
}
|
||||
Ok(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,239 @@
|
||||
//! Small in-process cache for deterministic non-streaming chat responses.
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
|
||||
use lru::LruCache;
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
use crate::models::{MessageRequest, MessageResponse, Usage};
|
||||
|
||||
const DEFAULT_CAPACITY: usize = 256;
|
||||
|
||||
static RESPONSE_CACHE: OnceLock<ResponseCache> = OnceLock::new();
|
||||
|
||||
pub(crate) fn response_cache() -> &'static ResponseCache {
|
||||
RESPONSE_CACHE.get_or_init(ResponseCache::new)
|
||||
}
|
||||
|
||||
pub(crate) fn request_is_cacheable(request: &MessageRequest) -> bool {
|
||||
request.stream != Some(true)
|
||||
&& request.tools.as_ref().is_none_or(Vec::is_empty)
|
||||
&& request.tool_choice.is_none()
|
||||
&& request.temperature == Some(0.0)
|
||||
&& request.top_p.is_none_or(|top_p| top_p == 1.0)
|
||||
}
|
||||
|
||||
pub(crate) struct ResponseCache {
|
||||
inner: Mutex<LruCache<[u8; 32], MessageResponse>>,
|
||||
}
|
||||
|
||||
impl ResponseCache {
|
||||
fn new() -> Self {
|
||||
Self::with_capacity(NonZeroUsize::new(DEFAULT_CAPACITY).expect("non-zero capacity"))
|
||||
}
|
||||
|
||||
fn with_capacity(capacity: NonZeroUsize) -> Self {
|
||||
Self {
|
||||
inner: Mutex::new(LruCache::new(capacity)),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn make_key(
|
||||
provider: &str,
|
||||
base_url: &str,
|
||||
path_suffix: Option<&str>,
|
||||
api_key: &str,
|
||||
wire_body: &[u8],
|
||||
) -> [u8; 32] {
|
||||
let mut hasher = Sha256::new();
|
||||
update_field(&mut hasher, provider.as_bytes());
|
||||
update_field(&mut hasher, base_url.as_bytes());
|
||||
update_field(&mut hasher, path_suffix.unwrap_or("").as_bytes());
|
||||
update_field(&mut hasher, &Sha256::digest(api_key.as_bytes()));
|
||||
update_field(&mut hasher, wire_body);
|
||||
hasher.finalize().into()
|
||||
}
|
||||
|
||||
pub(crate) fn get(&self, key: &[u8; 32]) -> Option<MessageResponse> {
|
||||
let mut cache = self.inner.lock().ok()?;
|
||||
cache.get(key).cloned().map(|mut response| {
|
||||
response.usage = Usage::default();
|
||||
response
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn put(&self, key: [u8; 32], value: MessageResponse) {
|
||||
if let Ok(mut cache) = self.inner.lock() {
|
||||
cache.put(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn update_field(hasher: &mut Sha256, bytes: &[u8]) {
|
||||
hasher.update((bytes.len() as u64).to_le_bytes());
|
||||
hasher.update(bytes);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response with non-zero usage counters and the given `id`.
    fn sample_response(id: &str) -> MessageResponse {
        let usage = Usage {
            input_tokens: 42,
            output_tokens: 7,
            prompt_cache_hit_tokens: Some(3),
            prompt_cache_miss_tokens: Some(39),
            reasoning_tokens: Some(5),
            reasoning_replay_tokens: Some(2),
            server_tool_use: None,
        };

        MessageResponse {
            id: id.to_string(),
            r#type: "message".to_string(),
            role: "assistant".to_string(),
            content: Vec::new(),
            model: "test-model".to_string(),
            stop_reason: Some("end_turn".to_string()),
            stop_sequence: None,
            container: None,
            usage,
        }
    }

    /// Builds a minimal request that `request_is_cacheable` accepts.
    fn cacheable_request() -> MessageRequest {
        MessageRequest {
            model: "test-model".to_string(),
            messages: Vec::new(),
            max_tokens: 16,
            system: None,
            tools: None,
            tool_choice: None,
            metadata: None,
            thinking: None,
            reasoning_effort: None,
            stream: None,
            temperature: Some(0.0),
            top_p: None,
        }
    }

    /// Applies `mutate` to a fresh cacheable request and reports the verdict.
    fn cacheable_after(mutate: impl FnOnce(&mut MessageRequest)) -> bool {
        let mut req = cacheable_request();
        mutate(&mut req);
        request_is_cacheable(&req)
    }

    #[test]
    fn cache_key_separates_provider_route_account_and_wire_body() {
        const URL: &str = "https://api.example.com/v1";
        const BODY: &[u8] = br#"{"model":"m","messages":[]}"#;

        let base = ResponseCache::make_key("deepseek", URL, None, "key-a", BODY);

        // Each comparison below changes exactly one component of the key.
        let other_provider = ResponseCache::make_key("openai", URL, None, "key-a", BODY);
        assert_ne!(base, other_provider);

        let other_base_url = ResponseCache::make_key(
            "deepseek",
            "https://proxy.example.com/v1",
            None,
            "key-a",
            BODY,
        );
        assert_ne!(base, other_base_url);

        let other_suffix =
            ResponseCache::make_key("deepseek", URL, Some("responses"), "key-a", BODY);
        assert_ne!(base, other_suffix);

        let other_account = ResponseCache::make_key("deepseek", URL, None, "key-b", BODY);
        assert_ne!(base, other_account);

        let other_body = ResponseCache::make_key(
            "deepseek",
            URL,
            None,
            "key-a",
            br#"{"model":"m","messages":[],"reasoning_effort":"high"}"#,
        );
        assert_ne!(base, other_body);
    }

    #[test]
    fn cache_hit_zeroes_usage_to_avoid_fake_spend() {
        let capacity = NonZeroUsize::new(2).unwrap();
        let cache = ResponseCache::with_capacity(capacity);
        let key =
            ResponseCache::make_key("deepseek", "https://api.example.com", None, "key", b"{}");

        cache.put(key, sample_response("cached"));

        let hit = cache.get(&key).expect("cache hit");
        assert_eq!(hit.id, "cached");
        assert_eq!(hit.usage, Usage::default());
    }

    #[test]
    fn capacity_evicts_oldest_entry() {
        let capacity = NonZeroUsize::new(2).unwrap();
        let cache = ResponseCache::with_capacity(capacity);

        // Three inserts into a two-slot cache; the label doubles as wire body and id.
        let labels = ["one", "two", "three"];
        let keys = labels.map(|label| {
            ResponseCache::make_key(
                "deepseek",
                "https://api.example.com",
                None,
                "key",
                label.as_bytes(),
            )
        });
        for (label, key) in labels.iter().zip(keys) {
            cache.put(key, sample_response(label));
        }

        assert!(cache.get(&keys[0]).is_none());
        assert!(cache.get(&keys[1]).is_some());
        assert!(cache.get(&keys[2]).is_some());
    }

    #[test]
    fn cacheability_requires_deterministic_tool_free_non_streaming_request() {
        // The untouched baseline is accepted.
        assert!(cacheable_after(|_| {}));

        // Every single-field deviation below must be rejected.
        assert!(!cacheable_after(|req| req.temperature = None));
        assert!(!cacheable_after(|req| req.temperature = Some(0.2)));
        assert!(!cacheable_after(|req| req.stream = Some(true)));
        assert!(!cacheable_after(|req| req.top_p = Some(0.5)));
        assert!(!cacheable_after(|req| {
            req.tool_choice = Some(serde_json::json!("auto"));
        }));
    }
}
|
||||
@@ -39,6 +39,7 @@ mod features;
|
||||
mod handoff;
|
||||
mod hooks;
|
||||
mod llm_client;
|
||||
mod llm_response_cache;
|
||||
mod localization;
|
||||
mod logging;
|
||||
mod lsp;
|
||||
|
||||
Reference in New Issue
Block a user