From e10e53d3966972f2d0eafec115310cba67f9a24e Mon Sep 17 00:00:00 2001 From: Duducoco <69681789+Duducoco@users.noreply.github.com> Date: Sun, 10 May 2026 01:34:36 +0800 Subject: [PATCH] fix(client): stabilize reasoning_content replay for prompt cache (#1297) * fix(client): stabilize reasoning_content replay for prompt cache - stop gating assistant reasoning_content on whether a later user turn exists; the field now depends only on the stored message itself - preserve historical message bytes across turns so DeepSeek's prefix cache stays warm on every text-reply follow-up - add a byte-stability regression test and update the prior-non-tool reasoning test to assert the new contract * style(client): rustfmt single-line let binding - collapse a two-line `let mut has_reasoning = ...` into a single line so `cargo fmt --all -- --check` passes --- crates/tui/src/client.rs | 72 +++++++++++++++++++++++++++++++++-- crates/tui/src/client/chat.rs | 31 ++++++--------- 2 files changed, 80 insertions(+), 23 deletions(-) diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index 88a696e1..54571591 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -1290,7 +1290,12 @@ mod tests { } #[test] - fn chat_messages_omit_prior_non_tool_reasoning_after_new_user_turn() { + fn chat_messages_keep_prior_non_tool_reasoning_after_new_user_turn() { + // The serialized JSON for a stored assistant message MUST be a pure + // function of that message — never of what comes after it. DeepSeek's + // prompt cache hashes the leading bytes of every request; flipping + // `reasoning_content` on/off across turns rewrites historical bytes + // and busts the prefix cache from that message onwards. (#583) let messages = vec![ Message { role: "user".to_string(), @@ -1330,9 +1335,68 @@ mod tests { assistant.get("content").and_then(Value::as_str), Some("Final answer") ); - assert!( - assistant.get("reasoning_content").is_none(), - "non-tool reasoning from previous turns should not be replayed" + assert_eq!( + assistant.get("reasoning_content").and_then(Value::as_str), + Some("Internal explanation plan"), + "reasoning_content must be preserved across follow-up user turns to keep DeepSeek's prefix cache warm" + ); + } + + #[test] + fn chat_messages_assistant_json_is_byte_stable_across_follow_up_user_turn() { + // Direct prefix-cache regression: the JSON for the assistant message + // built on turn N must equal the JSON for the same assistant message + // built on turn N+1, after a new user message has been appended. + let assistant = Message { + role: "assistant".to_string(), + content: vec![ + ContentBlock::Thinking { + thinking: "I should explain step by step.".to_string(), + }, + ContentBlock::Text { + text: "Here is the explanation.".to_string(), + cache_control: None, + }, + ], + }; + let user_initial = Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Explain it".to_string(), + cache_control: None, + }], + }; + let user_follow_up = Message { + role: "user".to_string(), + content: vec![ContentBlock::Text { + text: "Next question".to_string(), + cache_control: None, + }], + }; + + let turn_n = build_chat_messages( + None, + &[user_initial.clone(), assistant.clone()], + "deepseek-v4-pro", + ); + let turn_n_plus_1 = build_chat_messages( + None, + &[user_initial, assistant, user_follow_up], + "deepseek-v4-pro", + ); + + let assistant_n = turn_n + .iter() + .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant")) + .expect("assistant present in turn N"); + let assistant_n1 = turn_n_plus_1 + .iter() + .find(|v| v.get("role").and_then(Value::as_str) == Some("assistant")) + .expect("assistant present in turn N+1"); + + assert_eq!( + assistant_n, assistant_n1, + "assistant message JSON must be byte-identical across turns or DeepSeek's prefix cache breaks" ); } diff --git a/crates/tui/src/client/chat.rs b/crates/tui/src/client/chat.rs index 2775f6e9..62c2b63b 100644 --- a/crates/tui/src/client/chat.rs +++ b/crates/tui/src/client/chat.rs @@ -1008,9 +1008,6 @@ fn build_chat_messages_with_reasoning( let mut tool_call_infos = Vec::new(); let mut tool_results: Vec<(String, String, String)> = Vec::new(); let mut turn_meta_budget: Option = None; - let later_user_turn = messages[message_index + 1..] - .iter() - .any(message_starts_user_turn); for block in &message.content { match block { @@ -1075,14 +1072,18 @@ fn build_chat_messages_with_reasoning( let mut reasoning_content = thinking_parts.join("\n"); let has_text = !content.trim().is_empty(); let has_tool_calls = !tool_calls.is_empty(); - // DeepSeek thinking-mode tool calls must replay `reasoning_content` - // on subsequent requests. Non-tool assistant reasoning can be - // omitted once a later real user text message starts a new turn. - let include_reasoning_for_turn = - include_reasoning && (has_tool_calls || !later_user_turn); - let mut has_reasoning = - include_reasoning_for_turn && !reasoning_content.trim().is_empty(); - if include_reasoning_for_turn && has_tool_calls && !has_reasoning { + // Reasoning replay must be a function of the stored message ONLY, + // never of later history. DeepSeek's prefix cache hashes the raw + // bytes of every message; flipping `reasoning_content` on/off + // depending on whether a follow-up user turn exists rewrites a + // historical message between turns and busts the cache from that + // point onwards. Always emit `reasoning_content` when the model + // requires replay AND the stored message carries thinking text. + // Tool-call messages with empty thinking still need a placeholder + // (DeepSeek 400s without it), but text-only assistant messages + // simply omit the field when there's nothing to replay. + let mut has_reasoning = include_reasoning && !reasoning_content.trim().is_empty(); + if include_reasoning && has_tool_calls && !has_reasoning { logging::warn( "Substituting placeholder reasoning_content for DeepSeek tool-call assistant message", ); @@ -1295,14 +1296,6 @@ fn build_chat_messages_with_reasoning( out } -fn message_starts_user_turn(message: &Message) -> bool { - message.role == "user" - && message.content.iter().any(|block| match block { - ContentBlock::Text { text, .. } => !text.trim().is_empty(), - _ => false, - }) -} - pub(super) fn tool_to_chat(tool: &Tool) -> Value { let mut value = json!({ "type": "function",