fix(client): stabilize reasoning_content replay for prompt cache (#1297)

* fix(client): stabilize reasoning_content replay for prompt cache

- stop gating assistant reasoning_content on whether a later user turn
  exists; the field now depends only on the stored message itself
- preserve historical message bytes across turns so DeepSeek's prefix
  cache stays warm on every text-reply follow-up
- add a byte-stability regression test and update the prior-non-tool
  reasoning test to assert the new contract

* style(client): rustfmt single-line let binding

- collapse a two-line `let mut has_reasoning = ...` into a single line
  so `cargo fmt --all -- --check` passes
This commit is contained in:
Duducoco
2026-05-10 01:34:36 +08:00
committed by GitHub
parent ebae6a07f6
commit e10e53d396
2 changed files with 80 additions and 23 deletions
+68 -4
View File
@@ -1290,7 +1290,12 @@ mod tests {
}
#[test]
fn chat_messages_omit_prior_non_tool_reasoning_after_new_user_turn() {
fn chat_messages_keep_prior_non_tool_reasoning_after_new_user_turn() {
// The serialized JSON for a stored assistant message MUST be a pure
// function of that message — never of what comes after it. DeepSeek's
// prompt cache hashes the leading bytes of every request; flipping
// `reasoning_content` on/off across turns rewrites historical bytes
// and busts the prefix cache from that message onwards. (#583)
let messages = vec![
Message {
role: "user".to_string(),
@@ -1330,9 +1335,68 @@ mod tests {
assistant.get("content").and_then(Value::as_str),
Some("Final answer")
);
assert!(
assistant.get("reasoning_content").is_none(),
"non-tool reasoning from previous turns should not be replayed"
assert_eq!(
assistant.get("reasoning_content").and_then(Value::as_str),
Some("Internal explanation plan"),
"reasoning_content must be preserved across follow-up user turns to keep DeepSeek's prefix cache warm"
);
}
#[test]
fn chat_messages_assistant_json_is_byte_stable_across_follow_up_user_turn() {
    // Prefix-cache regression guard: serializing the same stored assistant
    // message must yield identical JSON whether or not a later user message
    // follows it in the history.

    // Builds a user message holding a single plain-text block.
    let user_text = |text: &str| Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: text.to_string(),
            cache_control: None,
        }],
    };

    // Full conversation as of turn N+1; turn N is its first two entries.
    let history = vec![
        user_text("Explain it"),
        Message {
            role: "assistant".to_string(),
            content: vec![
                ContentBlock::Thinking {
                    thinking: "I should explain step by step.".to_string(),
                },
                ContentBlock::Text {
                    text: "Here is the explanation.".to_string(),
                    cache_control: None,
                },
            ],
        },
        user_text("Next question"),
    ];

    let turn_n = build_chat_messages(None, &history[..2], "deepseek-v4-pro");
    let turn_n_plus_1 = build_chat_messages(None, &history[..], "deepseek-v4-pro");

    // Pick the first entry whose "role" is "assistant" out of each request.
    let assistant_n = turn_n
        .iter()
        .find(|entry| entry.get("role").and_then(Value::as_str) == Some("assistant"))
        .expect("assistant present in turn N");
    let assistant_n1 = turn_n_plus_1
        .iter()
        .find(|entry| entry.get("role").and_then(Value::as_str) == Some("assistant"))
        .expect("assistant present in turn N+1");

    assert_eq!(
        assistant_n, assistant_n1,
        "assistant message JSON must be byte-identical across turns or DeepSeek's prefix cache breaks"
    );
}
+12 -19
View File
@@ -1008,9 +1008,6 @@ fn build_chat_messages_with_reasoning(
let mut tool_call_infos = Vec::new();
let mut tool_results: Vec<(String, String, String)> = Vec::new();
let mut turn_meta_budget: Option<TurnMetaBudget> = None;
let later_user_turn = messages[message_index + 1..]
.iter()
.any(message_starts_user_turn);
for block in &message.content {
match block {
@@ -1075,14 +1072,18 @@ fn build_chat_messages_with_reasoning(
let mut reasoning_content = thinking_parts.join("\n");
let has_text = !content.trim().is_empty();
let has_tool_calls = !tool_calls.is_empty();
// DeepSeek thinking-mode tool calls must replay `reasoning_content`
// on subsequent requests. Non-tool assistant reasoning can be
// omitted once a later real user text message starts a new turn.
let include_reasoning_for_turn =
include_reasoning && (has_tool_calls || !later_user_turn);
let mut has_reasoning =
include_reasoning_for_turn && !reasoning_content.trim().is_empty();
if include_reasoning_for_turn && has_tool_calls && !has_reasoning {
// Reasoning replay must be a function of the stored message ONLY,
// never of later history. DeepSeek's prefix cache hashes the raw
// bytes of every message; flipping `reasoning_content` on/off
// depending on whether a follow-up user turn exists rewrites a
// historical message between turns and busts the cache from that
// point onwards. Always emit `reasoning_content` when the model
// requires replay AND the stored message carries thinking text.
// Tool-call messages with empty thinking still need a placeholder
// (DeepSeek 400s without it), but text-only assistant messages
// simply omit the field when there's nothing to replay.
let mut has_reasoning = include_reasoning && !reasoning_content.trim().is_empty();
if include_reasoning && has_tool_calls && !has_reasoning {
logging::warn(
"Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
);
@@ -1295,14 +1296,6 @@ fn build_chat_messages_with_reasoning(
out
}
/// Reports whether `message` has the `"user"` role and carries at least one
/// text block whose text is non-empty after trimming whitespace.
fn message_starts_user_turn(message: &Message) -> bool {
    // Only messages with the "user" role can qualify.
    if message.role != "user" {
        return false;
    }
    // Non-text blocks and whitespace-only text blocks never match.
    message
        .content
        .iter()
        .any(|block| matches!(block, ContentBlock::Text { text, .. } if !text.trim().is_empty()))
}
pub(super) fn tool_to_chat(tool: &Tool) -> Value {
let mut value = json!({
"type": "function",