fix(web_run): decode percent-encoded UTF-8 bytes correctly (#840)

percent_decode in web_run.rs builds the result via `out.push(b as char)`,
mapping each decoded byte to its Latin-1 codepoint. For percent-encoded
multi-byte UTF-8 sequences (e.g. %E4%B8%AA = 个) this produces visible
mojibake: bytes E4 B8 AA become three Latin-1 chars `ä¸ª` and the final
String is the UTF-8 encoding of those codepoints, not the original
character.

The sister module web_search.rs::percent_decode (line 490) already uses
the correct pattern: collect bytes into Vec<u8>, then finalize with
String::from_utf8_lossy. Align web_run.rs with that implementation.

Affects DuckDuckGo redirect URLs containing non-ASCII paths, where
normalize_search_url -> percent_decode previously corrupted the decoded
URL before it was shown to the model and to the user.

Add a regression test covering percent-encoded CJK input, raw UTF-8
input, and mixed ASCII+UTF-8.

Co-authored-by: Elowen <xrnc@outlook.com>
This commit is contained in:
zhouyf
2026-05-06 17:28:30 +08:00
committed by GitHub
parent 0a17f144c0
commit d5f4d89352
+20 -4
View File
@@ -1608,7 +1608,7 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
}
fn percent_decode(input: &str) -> String {
let mut out = String::new();
let mut out = Vec::with_capacity(input.len());
let bytes = input.as_bytes();
let mut idx = 0;
while idx < bytes.len() {
@@ -1617,14 +1617,14 @@ fn percent_decode(input: &str) -> String {
&& let Ok(hex) = std::str::from_utf8(&bytes[idx + 1..idx + 3])
&& let Ok(val) = u8::from_str_radix(hex, 16)
{
out.push(val as char);
out.push(val);
idx += 3;
continue;
}
out.push(bytes[idx] as char);
out.push(bytes[idx]);
idx += 1;
}
out
String::from_utf8_lossy(&out).into_owned()
}
fn url_encode(input: &str) -> String {
@@ -1711,6 +1711,22 @@ mod tests {
assert_eq!(results[0].snippet.as_deref(), Some("A useful snippet."));
}
#[test]
fn percent_decode_handles_utf8_multibyte_sequences() {
    // Regression table: each entry is (input, expected decoded output).
    let cases = [
        // Percent-encoded CJK; every glyph here is encoded as three UTF-8 bytes
        // (e.g. %E4%B8%AA%E4%BA%BA decodes to 个人).
        ("Hello %E4%B8%AA%E4%BA%BA", "Hello 个人"),
        ("%E7%B4%A0%E6%9D%90", "素材"),
        // Percent-encoded UTF-8 inside a URL path (DuckDuckGo `uddg=` redirect shape).
        (
            "https://example.com/%E9%A1%B5%E9%9D%A2",
            "https://example.com/页面",
        ),
        // Input that already contains raw UTF-8 must come back unchanged.
        ("查询 keyword", "查询 keyword"),
        // ASCII-only input keeps its existing behavior: `%20` becomes a space,
        // while `+` is left as a literal plus sign.
        ("foo+bar%20baz", "foo+bar baz"),
    ];

    for (encoded, expected) in cases {
        assert_eq!(percent_decode(encoded), expected);
    }
}
#[test]
fn scoped_ref_prefix_is_session_specific() {
reset_web_run_state();