From 6f70a2832e60413edbb4d08e50bfe7d3fa204f9d Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Tue, 12 May 2026 01:27:20 -0500
Subject: [PATCH] fix(file_mention): preserve UTF-8 codepoint boundary when
 truncating mention contents
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #1441. When `@`-mentioning a file larger than the 128 KB
`MAX_MENTION_FILE_BYTES` ceiling, the truncator clipped the buffer
to exactly the cap — which on CJK / emoji content frequently
landed mid-codepoint and left a stray U+FFFD replacement char at
the cut point.

The fix uses `str::from_utf8(...).error_len()` to distinguish the
two ways a truncated UTF-8 buffer can fail:

  - `error_len() == None` means the failure is an incomplete tail
    sequence — exactly the boundary case we want to handle. Round
    `buffer.truncate()` down to `valid_up_to()` so the trailing
    bytes are dropped cleanly.
  - `error_len() == Some(_)` means the file genuinely contains
    invalid UTF-8 bytes (not at the truncation boundary). Leave
    the buffer intact so the subsequent `from_utf8(&buffer)` call
    surfaces the canonical "file is not UTF-8" error rather than
    silently dropping the invalid bytes.

Collapsed the if-let-then-if pattern to `if let Err(e) = ... &&
e.error_len().is_none()` to satisfy the workspace's
`collapsible_if` clippy gate.

Harvested from PR #1495 by @CrepuscularIRIS

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                       | 12 +++++++
 crates/tui/src/tui/file_mention.rs | 50 +++++++++++++++++++++++++-----
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5f4e348..63d19920 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,18 @@ real world uses."
 
 ### Fixed
 
+- **`@`-mention truncation no longer splits multi-byte UTF-8
+  sequences** (#1441, harvested from PR #1495 by
+  **@CrepuscularIRIS / autoghclaw**). When `@`-mentioning a file
+  larger than 128 KB the composer truncated the buffer at exactly
+  `MAX_MENTION_FILE_BYTES`, which on CJK / emoji content landed
+  mid-codepoint and produced a stray U+FFFD at the cut point. The
+  truncator now uses `str::from_utf8(...).error_len()` to detect
+  the incomplete-tail case and rounds down to the last valid
+  codepoint boundary before decoding. Genuinely invalid UTF-8
+  files still surface the "file is not UTF-8" error (the rounding
+  is only applied when the error is an incomplete tail, not a
+  real decoding failure mid-buffer).
 - **vLLM provider: `reasoning_effort = "off"` now actually
   disables thinking on Qwen3 / DeepSeek-R1 servers, cutting
   TTFT from ~13s to ~270ms** (harvested from PR #1480 by
diff --git a/crates/tui/src/tui/file_mention.rs b/crates/tui/src/tui/file_mention.rs
index a713e04e..2215bfb4 100644
--- a/crates/tui/src/tui/file_mention.rs
+++ b/crates/tui/src/tui/file_mention.rs
@@ -761,6 +761,17 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> {
     let truncated = buffer.len() as u64 > MAX_MENTION_FILE_BYTES;
     if truncated {
         buffer.truncate(MAX_MENTION_FILE_BYTES as usize);
+        // Round down to the nearest valid UTF-8 character boundary so a
+        // multi-byte sequence (CJK, emoji, etc.) is never split at the cut point.
+        // Only adjust when error_len() is None — that means truncation landed
+        // mid-sequence (incomplete tail).  A Some(_) error_len means the file
+        // genuinely contains invalid UTF-8 bytes; leave the buffer intact so
+        // the from_utf8 call below returns the correct "file is not UTF-8" error.
+        if let Err(e) = std::str::from_utf8(&buffer)
+            && e.error_len().is_none()
+        {
+            buffer.truncate(e.valid_up_to());
+        }
     }
     if buffer.contains(&0) {
         return Err(std::io::Error::new(
@@ -768,13 +779,9 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> {
             "file appears to be binary",
         ));
     }
-    let text = if truncated {
-        String::from_utf8_lossy(&buffer).to_string()
-    } else {
-        std::str::from_utf8(&buffer)
-            .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))?
-            .to_string()
-    };
+    let text = std::str::from_utf8(&buffer)
+        .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))?
+        .to_string();
     Ok((text, truncated))
 }
 
@@ -1002,4 +1009,33 @@ mod tests {
         let decoded: ContextReference = serde_json::from_str(&encoded).expect("deserialize");
         assert_eq!(&decoded, reference);
     }
+
+    /// Regression test for #1441: truncating at MAX_MENTION_FILE_BYTES must not
+    /// split a multi-byte UTF-8 sequence, which previously produced U+FFFD
+    /// replacement characters in the TUI output.
+    #[test]
+    fn read_text_prefix_truncation_respects_utf8_char_boundary() {
+        use std::io::Write;
+
+        // Build a file that is MAX_MENTION_FILE_BYTES - 1 ASCII bytes followed
+        // by a 3-byte CJK character (U+4E2D, '中'). The naive truncate at
+        // MAX_MENTION_FILE_BYTES cuts after the first byte of '中', producing
+        // an invalid sequence.
+        let tmp = TempDir::new().expect("tempdir");
+        let path = tmp.path().join("cjk.txt");
+        let mut f = std::fs::File::create(&path).expect("create");
+        let padding = vec![b'a'; MAX_MENTION_FILE_BYTES as usize - 1];
+        f.write_all(&padding).expect("write padding");
+        f.write_all("中".as_bytes()).expect("write CJK");
+
+        let (text, truncated) = read_text_prefix(&path).expect("should succeed");
+        assert!(
+            truncated,
+            "file exceeds limit so should be marked truncated"
+        );
+        assert!(
+            !text.contains('\u{FFFD}'),
+            "truncated text must not contain replacement characters; got: {text:?}",
+        );
+    }
 }