fix(file_mention): preserve UTF-8 codepoint boundary when truncating mention contents
Closes #1441. When `@`-mentioning a file larger than the 128 KB `MAX_MENTION_FILE_BYTES` ceiling, the truncator clipped the buffer to exactly the cap — which on CJK / emoji content frequently landed mid-codepoint and left a stray U+FFFD replacement char at the cut point. The fix uses `str::from_utf8(...).error_len()` to distinguish the two ways a truncated UTF-8 buffer can fail: - `error_len() == None` means the failure is an incomplete tail sequence — exactly the boundary case we want to handle. Round `buffer.truncate()` down to `valid_up_to()` so the trailing bytes are dropped cleanly. - `error_len() == Some(_)` means the file genuinely contains invalid UTF-8 bytes (not at the truncation boundary). Leave the buffer intact so the subsequent `from_utf8(&buffer)` call surfaces the canonical "file is not UTF-8" error rather than silently dropping the invalid bytes. Collapsed the if-let-then-if pattern to `if let Err(e) = ... && e.error_len().is_none()` to satisfy the workspace's `collapsible_if` clippy gate. Harvested from PR #1495 by @CrepuscularIRIS Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,18 @@ real world uses."
|
||||
|
||||
### Fixed
|
||||
|
||||
- **`@`-mention truncation no longer splits multi-byte UTF-8
|
||||
sequences** (#1441, harvested from PR #1495 by
|
||||
**@CrepuscularIRIS / autoghclaw**). When `@`-mentioning a file
|
||||
larger than 128 KB the composer truncated the buffer at exactly
|
||||
`MAX_MENTION_FILE_BYTES`, which on CJK / emoji content landed
|
||||
mid-codepoint and produced a stray U+FFFD at the cut point. The
|
||||
truncator now uses `str::from_utf8(...).error_len()` to detect
|
||||
the incomplete-tail case and rounds down to the last valid
|
||||
codepoint boundary before decoding. Genuinely invalid UTF-8
|
||||
files still surface the "file is not UTF-8" error (the rounding
|
||||
is only applied when the error is an incomplete tail, not a
|
||||
real decoding failure mid-buffer).
|
||||
- **vLLM provider: `reasoning_effort = "off"` now actually
|
||||
disables thinking on Qwen3 / DeepSeek-R1 servers, cutting
|
||||
TTFT from ~13s to ~270ms** (harvested from PR #1480 by
|
||||
|
||||
@@ -761,6 +761,17 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> {
|
||||
let truncated = buffer.len() as u64 > MAX_MENTION_FILE_BYTES;
|
||||
if truncated {
|
||||
buffer.truncate(MAX_MENTION_FILE_BYTES as usize);
|
||||
// Round down to the nearest valid UTF-8 character boundary so a
|
||||
// multi-byte sequence (CJK, emoji, etc.) is never split at the cut point.
|
||||
// Only adjust when error_len() is None — that means truncation landed
|
||||
// mid-sequence (incomplete tail). A Some(_) error_len means the file
|
||||
// genuinely contains invalid UTF-8 bytes; leave the buffer intact so
|
||||
// the from_utf8 call below returns the correct "file is not UTF-8" error.
|
||||
if let Err(e) = std::str::from_utf8(&buffer)
|
||||
&& e.error_len().is_none()
|
||||
{
|
||||
buffer.truncate(e.valid_up_to());
|
||||
}
|
||||
}
|
||||
if buffer.contains(&0) {
|
||||
return Err(std::io::Error::new(
|
||||
@@ -768,13 +779,9 @@ fn read_text_prefix(path: &Path) -> std::io::Result<(String, bool)> {
|
||||
"file appears to be binary",
|
||||
));
|
||||
}
|
||||
let text = if truncated {
|
||||
String::from_utf8_lossy(&buffer).to_string()
|
||||
} else {
|
||||
std::str::from_utf8(&buffer)
|
||||
.map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))?
|
||||
.to_string()
|
||||
};
|
||||
let text = std::str::from_utf8(&buffer)
|
||||
.map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file is not UTF-8"))?
|
||||
.to_string();
|
||||
Ok((text, truncated))
|
||||
}
|
||||
|
||||
@@ -1002,4 +1009,33 @@ mod tests {
|
||||
let decoded: ContextReference = serde_json::from_str(&encoded).expect("deserialize");
|
||||
assert_eq!(&decoded, reference);
|
||||
}
|
||||
|
||||
/// Regression test for #1441: truncating at MAX_MENTION_FILE_BYTES must not
|
||||
/// split a multi-byte UTF-8 sequence, which previously produced U+FFFD
|
||||
/// replacement characters in the TUI output.
|
||||
#[test]
|
||||
fn read_text_prefix_truncation_respects_utf8_char_boundary() {
|
||||
use std::io::Write;
|
||||
|
||||
// Build a file that is MAX_MENTION_FILE_BYTES - 1 ASCII bytes followed
|
||||
// by a 3-byte CJK character (U+4E2D, '中'). The naive truncate at
|
||||
// MAX_MENTION_FILE_BYTES cuts after the first byte of '中', producing
|
||||
// an invalid sequence.
|
||||
let tmp = TempDir::new().expect("tempdir");
|
||||
let path = tmp.path().join("cjk.txt");
|
||||
let mut f = std::fs::File::create(&path).expect("create");
|
||||
let padding = vec![b'a'; MAX_MENTION_FILE_BYTES as usize - 1];
|
||||
f.write_all(&padding).expect("write padding");
|
||||
f.write_all("中".as_bytes()).expect("write CJK");
|
||||
|
||||
let (text, truncated) = read_text_prefix(&path).expect("should succeed");
|
||||
assert!(
|
||||
truncated,
|
||||
"file exceeds limit so should be marked truncated"
|
||||
);
|
||||
assert!(
|
||||
!text.contains('\u{FFFD}'),
|
||||
"truncated text must not contain replacement characters; got: {text:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user