fix(client): stabilize reasoning_content replay for prompt cache (#1297)

* fix(client): stabilize reasoning_content replay for prompt cache
  - stop gating assistant reasoning_content on whether a later user turn exists; the field now depends only on the stored message itself
  - preserve historical message bytes across turns so DeepSeek's prefix cache stays warm on every text-reply follow-up
  - add a byte-stability regression test and update the prior-non-tool reasoning test to assert the new contract
* style(client): rustfmt single-line let binding
  - collapse a two-line `let mut has_reasoning = ...` into a single line so `cargo fmt --all -- --check` passes
2026-05-10 01:34:36 +08:00
parent ebae6a07f6
commit e10e53d396
2 changed files with 80 additions and 23 deletions
@@ -1290,7 +1290,12 @@ mod tests {
    }

    #[test]
-    fn chat_messages_omit_prior_non_tool_reasoning_after_new_user_turn() {
+    fn chat_messages_keep_prior_non_tool_reasoning_after_new_user_turn() {
+        // The serialized JSON for a stored assistant message MUST be a pure
+        // function of that message — never of what comes after it. DeepSeek's
+        // prompt cache hashes the leading bytes of every request; flipping
+        // `reasoning_content` on/off across turns rewrites historical bytes
+        // and busts the prefix cache from that message onwards. (#583)
        let messages = vec![
            Message {
                role: "user".to_string(),
@@ -1330,9 +1335,68 @@ mod tests {
            assistant.get("content").and_then(Value::as_str),
            Some("Final answer")
        );
-        assert!(
-            assistant.get("reasoning_content").is_none(),
-            "non-tool reasoning from previous turns should not be replayed"
+        assert_eq!(
+            assistant.get("reasoning_content").and_then(Value::as_str),
+            Some("Internal explanation plan"),
+            "reasoning_content must be preserved across follow-up user turns to keep DeepSeek's prefix cache warm"
+        );
+    }
+
+    #[test]
+    fn chat_messages_assistant_json_is_byte_stable_across_follow_up_user_turn() {
+        // Direct prefix-cache regression: the assistant message built on turn N
+        // must serialize to the same text on turn N+1, after a new user message
+        // is appended. Text is compared: `Value` equality can hide key-order changes.
+        let assistant = Message {
+            role: "assistant".to_string(),
+            content: vec![
+                ContentBlock::Thinking {
+                    thinking: "I should explain step by step.".to_string(),
+                },
+                ContentBlock::Text {
+                    text: "Here is the explanation.".to_string(),
+                    cache_control: None,
+                },
+            ],
+        };
+        let user_initial = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "Explain it".to_string(),
+                cache_control: None,
+            }],
+        };
+        let user_follow_up = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "Next question".to_string(),
+                cache_control: None,
+            }],
+        };
+
+        let turn_n = build_chat_messages(
+            None,
+            &[user_initial.clone(), assistant.clone()],
+            "deepseek-v4-pro",
+        );
+        let turn_n_plus_1 = build_chat_messages(
+            None,
+            &[user_initial, assistant, user_follow_up],
+            "deepseek-v4-pro",
+        );
+
+        let assistant_n = turn_n
+            .iter()
+            .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant present in turn N");
+        let assistant_n1 = turn_n_plus_1
+            .iter()
+            .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant present in turn N+1");
+
+        assert_eq!(
+            assistant_n.to_string(), assistant_n1.to_string(),
+            "assistant message JSON must be byte-identical across turns or DeepSeek's prefix cache breaks"
        );
    }

@@ -1008,9 +1008,6 @@ fn build_chat_messages_with_reasoning(
        let mut tool_call_infos = Vec::new();
        let mut tool_results: Vec<(String, String, String)> = Vec::new();
        let mut turn_meta_budget: Option<TurnMetaBudget> = None;
-        let later_user_turn = messages[message_index + 1..]
-            .iter()
-            .any(message_starts_user_turn);

        for block in &message.content {
            match block {
@@ -1075,14 +1072,18 @@ fn build_chat_messages_with_reasoning(
            let mut reasoning_content = thinking_parts.join("\n");
            let has_text = !content.trim().is_empty();
            let has_tool_calls = !tool_calls.is_empty();
-            // DeepSeek thinking-mode tool calls must replay `reasoning_content`
-            // on subsequent requests. Non-tool assistant reasoning can be
-            // omitted once a later real user text message starts a new turn.
-            let include_reasoning_for_turn =
-                include_reasoning && (has_tool_calls || !later_user_turn);
-            let mut has_reasoning =
-                include_reasoning_for_turn && !reasoning_content.trim().is_empty();
-            if include_reasoning_for_turn && has_tool_calls && !has_reasoning {
+            // Reasoning replay must be a function of the stored message ONLY,
+            // never of later history. DeepSeek's prefix cache hashes the raw
+            // bytes of every message; flipping `reasoning_content` on/off
+            // depending on whether a follow-up user turn exists rewrites a
+            // historical message between turns and busts the cache from that
+            // point onwards. Always emit `reasoning_content` when the model
+            // requires replay AND the stored message carries thinking text.
+            // Tool-call messages with empty thinking still need a placeholder
+            // (DeepSeek 400s without it), but text-only assistant messages
+            // simply omit the field when there's nothing to replay.
+            let mut has_reasoning = include_reasoning && !reasoning_content.trim().is_empty();
+            if include_reasoning && has_tool_calls && !has_reasoning {
                logging::warn(
                    "Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
                );
@@ -1295,14 +1296,6 @@ fn build_chat_messages_with_reasoning(
    out
 }

-fn message_starts_user_turn(message: &Message) -> bool {
-    message.role == "user"
-        && message.content.iter().any(|block| match block {
-            ContentBlock::Text { text, .. } => !text.trim().is_empty(),
-            _ => false,
-        })
-}
-
 pub(super) fn tool_to_chat(tool: &Tool) -> Value {
    let mut value = json!({
        "type": "function",