From 6f70a2832e60413edbb4d08e50bfe7d3fa204f9d Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Tue, 12 May 2026 01:27:20 -0500 Subject: [PATCH] fix(file_mention): preserve UTF-8 codepoint boundary when truncating mention contents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #1441. When `@`-mentioning a file larger than the 128 KB `MAX_MENTION_FILE_BYTES` ceiling, the truncator clipped the buffer to exactly the cap — which on CJK / emoji content frequently landed mid-codepoint and left a stray U+FFFD replacement char at the cut point. The fix uses `str::from_utf8(...).error_len()` to distinguish the two ways a truncated UTF-8 buffer can fail: - `error_len() == None` means the failure is an incomplete tail sequence — exactly the boundary case we want to handle. Round `buffer.truncate()` down to `valid_up_to()` so the trailing bytes are dropped cleanly. - `error_len() == Some(_)` means the file genuinely contains invalid UTF-8 bytes (not at the truncation boundary). Leave the buffer intact so the subsequent `from_utf8(&buffer)` call surfaces the canonical "file is not UTF-8" error rather than silently dropping the invalid bytes. Collapsed the if-let-then-if pattern to `if let Err(e) = ... && e.error_len().is_none()` to satisfy the workspace's `collapsible_if` clippy gate. Harvested from PR #1495 by @CrepuscularIRIS Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 12 +++++++ crates/tui/src/tui/file_mention.rs | 50 +++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5f4e348..63d19920 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,18 @@ real world uses." ### Fixed +- **`@`-mention truncation no longer splits multi-byte UTF-8 + sequences** (#1441, harvested from PR #1495 by + **@CrepuscularIRIS / autoghclaw**). When `@`-mentioning a file + larger than 128 KB the composer truncated the buffer at exactly + `MAX_MENTION_FILE_BYTES`, which on CJK / emoji content landed + mid-codepoint and produced a stray U+FFFD at the cut point. The + truncator now uses `str::from_utf8(...).error_len()` to detect + the incomplete-tail case and rounds down to the last valid + codepoint boundary before decoding. Genuinely invalid UTF-8 + files still surface the "file is not UTF-8" error (the rounding + is only applied when the error is an incomplete tail, not a + real decoding failure mid-buffer). - **vLLM provider: `reasoning_effort = "off"` now actually disables thinking on Qwen3 / DeepSeek-R1 servers, cutting TTFT from ~13s to ~270ms** (harvested from PR #1480 by diff --git a/crates/tui/src/tui/file_mention.rs b/crates/tui/src/tui/file_mention.rs index a713e04e..2215bfb4 100644 --- a/crates/tui/src/tui/file_mention.rs +++ b/crates/tui/src/tui/file_mention.rs @@ -761,6 +761,17 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> { let truncated = buffer.len() as u64 > MAX_MENTION_FILE_BYTES; if truncated { buffer.truncate(MAX_MENTION_FILE_BYTES as usize); + // Round down to the nearest valid UTF-8 character boundary so a + // multi-byte sequence (CJK, emoji, etc.) is never split at the cut point. + // Only adjust when error_len() is None — that means truncation landed + // mid-sequence (incomplete tail). A Some(_) error_len means the file + // genuinely contains invalid UTF-8 bytes; leave the buffer intact so + // the from_utf8 call below returns the correct "file is not UTF-8" error. + if let Err(e) = std::str::from_utf8(&buffer) + && e.error_len().is_none() + { + buffer.truncate(e.valid_up_to()); + } } if buffer.contains(&0) { return Err(std::io::Error::new( @@ -768,13 +779,9 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> { "file appears to be binary", )); } - let text = if truncated { - String::from_utf8_lossy(&buffer).to_string() - } else { - std::str::from_utf8(&buffer) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))? - .to_string() - }; + let text = std::str::from_utf8(&buffer) + .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))? + .to_string(); Ok((text, truncated)) } @@ -1002,4 +1009,33 @@ mod tests { let decoded: ContextReference = serde_json::from_str(&encoded).expect("deserialize"); assert_eq!(&decoded, reference); } + + /// Regression test for #1441: truncating at MAX_MENTION_FILE_BYTES must not + /// split a multi-byte UTF-8 sequence, which previously produced U+FFFD + /// replacement characters in the TUI output. + #[test] + fn read_text_prefix_truncation_respects_utf8_char_boundary() { + use std::io::Write; + + // Build a file that is MAX_MENTION_FILE_BYTES - 1 ASCII bytes followed + // by a 3-byte CJK character (U+4E2D, '中'). The naive truncate at + // MAX_MENTION_FILE_BYTES cuts after the first byte of '中', producing + // an invalid sequence. + let tmp = TempDir::new().expect("tempdir"); + let path = tmp.path().join("cjk.txt"); + let mut f = std::fs::File::create(&path).expect("create"); + let padding = vec![b'a'; MAX_MENTION_FILE_BYTES as usize - 1]; + f.write_all(&padding).expect("write padding"); + f.write_all("中".as_bytes()).expect("write CJK"); + + let (text, truncated) = read_text_prefix(&path).expect("should succeed"); + assert!( + truncated, + "file exceeds limit so should be marked truncated" + ); + assert!( + !text.contains('\u{FFFD}'), + "truncated text must not contain replacement characters; got: {text:?}", + ); + } }