From e10e53d3966972f2d0eafec115310cba67f9a24e Mon Sep 17 00:00:00 2001
From: Duducoco <69681789+Duducoco@users.noreply.github.com>
Date: Sun, 10 May 2026 01:34:36 +0800
Subject: [PATCH] fix(client): stabilize reasoning_content replay for prompt
 cache (#1297)

* fix(client): stabilize reasoning_content replay for prompt cache

- stop gating assistant reasoning_content on whether a later user turn
  exists; the field now depends only on the stored message itself
- preserve historical message bytes across turns so DeepSeek's prefix
  cache stays warm on every text-reply follow-up
- add a byte-stability regression test and update the prior-non-tool
  reasoning test to assert the new contract

* style(client): rustfmt single-line let binding

- collapse a two-line `let mut has_reasoning = ...` into a single line
  so `cargo fmt --all -- --check` passes
---
 crates/tui/src/client.rs      | 72 +++++++++++++++++++++++++++++++++--
 crates/tui/src/client/chat.rs | 31 ++++++---------
 2 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index 88a696e1..54571591 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -1290,7 +1290,12 @@ mod tests {
     }
 
     #[test]
-    fn chat_messages_omit_prior_non_tool_reasoning_after_new_user_turn() {
+    fn chat_messages_keep_prior_non_tool_reasoning_after_new_user_turn() {
+        // The serialized JSON for a stored assistant message MUST be a pure
+        // function of that message — never of what comes after it. DeepSeek's
+        // prompt cache hashes the leading bytes of every request; flipping
+        // `reasoning_content` on/off across turns rewrites historical bytes
+        // and busts the prefix cache from that message onwards. (#583)
         let messages = vec![
             Message {
                 role: "user".to_string(),
@@ -1330,9 +1335,68 @@ mod tests {
             assistant.get("content").and_then(Value::as_str),
             Some("Final answer")
         );
-        assert!(
-            assistant.get("reasoning_content").is_none(),
-            "non-tool reasoning from previous turns should not be replayed"
+        assert_eq!(
+            assistant.get("reasoning_content").and_then(Value::as_str),
+            Some("Internal explanation plan"),
+            "reasoning_content must be preserved across follow-up user turns to keep DeepSeek's prefix cache warm"
+        );
+    }
+
+    #[test]
+    fn chat_messages_assistant_json_is_byte_stable_across_follow_up_user_turn() {
+        // Prefix-cache regression: the assistant message serialized on turn N
+        // must match, byte for byte, its serialization on turn N+1 (after a new
+        // user message is appended); compare JSON text, not parsed `Value`s.
+        let assistant = Message {
+            role: "assistant".to_string(),
+            content: vec![
+                ContentBlock::Thinking {
+                    thinking: "I should explain step by step.".to_string(),
+                },
+                ContentBlock::Text {
+                    text: "Here is the explanation.".to_string(),
+                    cache_control: None,
+                },
+            ],
+        };
+        let user_initial = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "Explain it".to_string(),
+                cache_control: None,
+            }],
+        };
+        let user_follow_up = Message {
+            role: "user".to_string(),
+            content: vec![ContentBlock::Text {
+                text: "Next question".to_string(),
+                cache_control: None,
+            }],
+        };
+
+        let turn_n = build_chat_messages(
+            None,
+            &[user_initial.clone(), assistant.clone()],
+            "deepseek-v4-pro",
+        );
+        let turn_n_plus_1 = build_chat_messages(
+            None,
+            &[user_initial, assistant, user_follow_up],
+            "deepseek-v4-pro",
+        );
+
+        let assistant_n = turn_n
+            .iter()
+            .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant present in turn N");
+        let assistant_n1 = turn_n_plus_1
+            .iter()
+            .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant"))
+            .expect("assistant present in turn N+1");
+
+        assert_eq!(
+            assistant_n.to_string(), assistant_n1.to_string(),
+            "assistant message JSON must be byte-identical across turns or DeepSeek's prefix cache breaks"
         );
     }
 
diff --git a/crates/tui/src/client/chat.rs b/crates/tui/src/client/chat.rs
index 2775f6e9..62c2b63b 100644
--- a/crates/tui/src/client/chat.rs
+++ b/crates/tui/src/client/chat.rs
@@ -1008,9 +1008,6 @@ fn build_chat_messages_with_reasoning(
         let mut tool_call_infos = Vec::new();
         let mut tool_results: Vec<(String, String, String)> = Vec::new();
         let mut turn_meta_budget: Option<TurnMetaBudget> = None;
-        let later_user_turn = messages[message_index + 1..]
-            .iter()
-            .any(message_starts_user_turn);
 
         for block in &message.content {
             match block {
@@ -1075,14 +1072,18 @@ fn build_chat_messages_with_reasoning(
             let mut reasoning_content = thinking_parts.join("\n");
             let has_text = !content.trim().is_empty();
             let has_tool_calls = !tool_calls.is_empty();
-            // DeepSeek thinking-mode tool calls must replay `reasoning_content`
-            // on subsequent requests. Non-tool assistant reasoning can be
-            // omitted once a later real user text message starts a new turn.
-            let include_reasoning_for_turn =
-                include_reasoning && (has_tool_calls || !later_user_turn);
-            let mut has_reasoning =
-                include_reasoning_for_turn && !reasoning_content.trim().is_empty();
-            if include_reasoning_for_turn && has_tool_calls && !has_reasoning {
+            // Reasoning replay must be a function of the stored message ONLY,
+            // never of later history. DeepSeek's prefix cache hashes the raw
+            // bytes of every message; flipping `reasoning_content` on/off
+            // depending on whether a follow-up user turn exists rewrites a
+            // historical message between turns and busts the cache from that
+            // point onwards. Always emit `reasoning_content` when the model
+            // requires replay AND the stored message carries thinking text.
+            // Tool-call messages with empty thinking still need a placeholder
+            // (DeepSeek 400s without it), but text-only assistant messages
+            // simply omit the field when there's nothing to replay.
+            let mut has_reasoning = include_reasoning && !reasoning_content.trim().is_empty();
+            if include_reasoning && has_tool_calls && !has_reasoning {
                 logging::warn(
                     "Substituting placeholder reasoning_content for DeepSeek tool-call assistant message",
                 );
@@ -1295,14 +1296,6 @@ fn build_chat_messages_with_reasoning(
     out
 }
 
-fn message_starts_user_turn(message: &Message) -> bool {
-    message.role == "user"
-        && message.content.iter().any(|block| match block {
-            ContentBlock::Text { text, .. } => !text.trim().is_empty(),
-            _ => false,
-        })
-}
-
 pub(super) fn tool_to_chat(tool: &Tool) -> Value {
     let mut value = json!({
         "type": "function",