fix(client): stabilize reasoning_content replay for prompt cache (#1297)
* fix(client): stabilize reasoning_content replay for prompt cache
  - stop gating assistant `reasoning_content` on whether a later user turn exists; the field now depends only on the stored message itself
  - preserve historical message bytes across turns so DeepSeek's prefix cache stays warm on every text-reply follow-up
  - add a byte-stability regression test and update the prior-non-tool reasoning test to assert the new contract
* style(client): rustfmt single-line let binding
  - collapse a two-line `let mut has_reasoning = ...` into a single line so `cargo fmt --all -- --check` passes
This commit is contained in:
@@ -1290,7 +1290,12 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chat_messages_omit_prior_non_tool_reasoning_after_new_user_turn() {
|
||||
fn chat_messages_keep_prior_non_tool_reasoning_after_new_user_turn() {
|
||||
// The serialized JSON for a stored assistant message MUST be a pure
|
||||
// function of that message — never of what comes after it. DeepSeek's
|
||||
// prompt cache hashes the leading bytes of every request; flipping
|
||||
// `reasoning_content` on/off across turns rewrites historical bytes
|
||||
// and busts the prefix cache from that message onwards. (#583)
|
||||
let messages = vec![
|
||||
Message {
|
||||
role: "user".to_string(),
|
||||
@@ -1330,9 +1335,68 @@ mod tests {
|
||||
assistant.get("content").and_then(Value::as_str),
|
||||
Some("Final answer")
|
||||
);
|
||||
assert!(
|
||||
assistant.get("reasoning_content").is_none(),
|
||||
"non-tool reasoning from previous turns should not be replayed"
|
||||
assert_eq!(
|
||||
assistant.get("reasoning_content").and_then(Value::as_str),
|
||||
Some("Internal explanation plan"),
|
||||
"reasoning_content must be preserved across follow-up user turns to keep DeepSeek's prefix cache warm"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn chat_messages_assistant_json_is_byte_stable_across_follow_up_user_turn() {
    // Prefix-cache regression guard: serializing the same stored assistant
    // message must yield identical JSON whether or not a newer user message
    // has been appended to the history after it.
    const MODEL: &str = "deepseek-v4-pro";

    // Builds a user message carrying a single plain-text block.
    let user_text = |text: &str| Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: text.to_string(),
            cache_control: None,
        }],
    };

    // Full conversation as of turn N+1; the first two entries are turn N.
    let history = vec![
        user_text("Explain it"),
        Message {
            role: "assistant".to_string(),
            content: vec![
                ContentBlock::Thinking {
                    thinking: "I should explain step by step.".to_string(),
                },
                ContentBlock::Text {
                    text: "Here is the explanation.".to_string(),
                    cache_control: None,
                },
            ],
        },
        user_text("Next question"),
    ];

    let before_follow_up = build_chat_messages(None, &history[..2], MODEL);
    let after_follow_up = build_chat_messages(None, &history[..], MODEL);

    // Pick the assistant entry out of each rendered request.
    let rendered_before = before_follow_up
        .iter()
        .find(|entry| matches!(entry.get("role").and_then(Value::as_str), Some("assistant")))
        .expect("assistant present in turn N");
    let rendered_after = after_follow_up
        .iter()
        .find(|entry| matches!(entry.get("role").and_then(Value::as_str), Some("assistant")))
        .expect("assistant present in turn N+1");

    assert_eq!(
        rendered_before, rendered_after,
        "assistant message JSON must be byte-identical across turns or DeepSeek's prefix cache breaks"
    );
}
|
||||
|
||||
|
||||
@@ -1008,9 +1008,6 @@ fn build_chat_messages_with_reasoning(
|
||||
let mut tool_call_infos = Vec::new();
|
||||
let mut tool_results: Vec<(String, String, String)> = Vec::new();
|
||||
let mut turn_meta_budget: Option<TurnMetaBudget> = None;
|
||||
let later_user_turn = messages[message_index + 1..]
|
||||
.iter()
|
||||
.any(message_starts_user_turn);
|
||||
|
||||
for block in &message.content {
|
||||
match block {
|
||||
@@ -1075,14 +1072,18 @@ fn build_chat_messages_with_reasoning(
|
||||
let mut reasoning_content = thinking_parts.join("\n");
|
||||
let has_text = !content.trim().is_empty();
|
||||
let has_tool_calls = !tool_calls.is_empty();
|
||||
// DeepSeek thinking-mode tool calls must replay `reasoning_content`
|
||||
// on subsequent requests. Non-tool assistant reasoning can be
|
||||
// omitted once a later real user text message starts a new turn.
|
||||
let include_reasoning_for_turn =
|
||||
include_reasoning && (has_tool_calls || !later_user_turn);
|
||||
let mut has_reasoning =
|
||||
include_reasoning_for_turn && !reasoning_content.trim().is_empty();
|
||||
if include_reasoning_for_turn && has_tool_calls && !has_reasoning {
|
||||
// Reasoning replay must be a function of the stored message ONLY,
|
||||
// never of later history. DeepSeek's prefix cache hashes the raw
|
||||
// bytes of every message; flipping `reasoning_content` on/off
|
||||
// depending on whether a follow-up user turn exists rewrites a
|
||||
// historical message between turns and busts the cache from that
|
||||
// point onwards. Always emit `reasoning_content` when the model
|
||||
// requires replay AND the stored message carries thinking text.
|
||||
// Tool-call messages with empty thinking still need a placeholder
|
||||
// (DeepSeek 400s without it), but text-only assistant messages
|
||||
// simply omit the field when there's nothing to replay.
|
||||
let mut has_reasoning = include_reasoning && !reasoning_content.trim().is_empty();
|
||||
if include_reasoning && has_tool_calls && !has_reasoning {
|
||||
logging::warn(
|
||||
"Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
|
||||
);
|
||||
@@ -1295,14 +1296,6 @@ fn build_chat_messages_with_reasoning(
|
||||
out
|
||||
}
|
||||
|
||||
fn message_starts_user_turn(message: &Message) -> bool {
|
||||
message.role == "user"
|
||||
&& message.content.iter().any(|block| match block {
|
||||
ContentBlock::Text { text, .. } => !text.trim().is_empty(),
|
||||
_ => false,
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) fn tool_to_chat(tool: &Tool) -> Value {
|
||||
let mut value = json!({
|
||||
"type": "function",
|
||||
|
||||
Reference in New Issue
Block a user