From d5f4d893527efb49ed962c15605037ccf8d12814 Mon Sep 17 00:00:00 2001 From: zhouyf <88364845+elowen53@users.noreply.github.com> Date: Wed, 6 May 2026 17:28:30 +0800 Subject: [PATCH] fix(web_run): decode percent-encoded UTF-8 bytes correctly (#840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit percent_decode in web_run.rs builds the result via `out.push(b as char)`, mapping each decoded byte to its Latin-1 codepoint. For percent-encoded multi-byte UTF-8 sequences (e.g. %E4%B8%AA = 个) this produces visible mojibake: bytes E4 B8 AA become three Latin-1 chars `ä¸ª` and the final String is the UTF-8 encoding of those codepoints, not the original character. The sister module web_search.rs::percent_decode (line 490) already uses the correct pattern: collect bytes into Vec, then finalize with String::from_utf8_lossy. Align web_run.rs with that implementation. Affects DuckDuckGo redirect URLs containing non-ASCII paths, where normalize_search_url -> percent_decode previously corrupted the decoded URL before it was shown to the model and to the user. Add a regression test covering percent-encoded CJK input, raw UTF-8 input, and mixed ASCII+UTF-8. 
Co-authored-by: Elowen --- crates/tui/src/tools/web_run.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/crates/tui/src/tools/web_run.rs b/crates/tui/src/tools/web_run.rs index a07c107a..c0b2e6a6 100644 --- a/crates/tui/src/tools/web_run.rs +++ b/crates/tui/src/tools/web_run.rs @@ -1608,7 +1608,7 @@ fn extract_query_param(url: &str, key: &str) -> Option { } fn percent_decode(input: &str) -> String { - let mut out = String::new(); + let mut out = Vec::with_capacity(input.len()); let bytes = input.as_bytes(); let mut idx = 0; while idx < bytes.len() { @@ -1617,14 +1617,14 @@ fn percent_decode(input: &str) -> String { && let Ok(hex) = std::str::from_utf8(&bytes[idx + 1..idx + 3]) && let Ok(val) = u8::from_str_radix(hex, 16) { - out.push(val as char); + out.push(val); idx += 3; continue; } - out.push(bytes[idx] as char); + out.push(bytes[idx]); idx += 1; } - out + String::from_utf8_lossy(&out).into_owned() } fn url_encode(input: &str) -> String { @@ -1711,6 +1711,22 @@ mod tests { assert_eq!(results[0].snippet.as_deref(), Some("A useful snippet.")); } + #[test] + fn percent_decode_handles_utf8_multibyte_sequences() { + // Percent-encoded CJK: %E4%B8%AA%E4%BA%BA = 个人 (each glyph is 3 UTF-8 bytes). + assert_eq!(percent_decode("Hello %E4%B8%AA%E4%BA%BA"), "Hello 个人"); + assert_eq!(percent_decode("%E7%B4%A0%E6%9D%90"), "素材"); + // Percent-encoded UTF-8 inside a URL path (DuckDuckGo `uddg=` redirect shape). + assert_eq!( + percent_decode("https://example.com/%E9%A1%B5%E9%9D%A2"), + "https://example.com/页面" + ); + // Raw UTF-8 in the input passes through unchanged. + assert_eq!(percent_decode("查询 keyword"), "查询 keyword"); + // ASCII-only inputs preserve existing behavior; `+` stays literal. + assert_eq!(percent_decode("foo+bar%20baz"), "foo+bar baz"); + } + #[test] fn scoped_ref_prefix_is_session_specific() { reset_web_run_state();