fix(web_run): decode percent-encoded UTF-8 bytes correctly (#840)
percent_decode in web_run.rs builds the result via `out.push(b as char)`, mapping each decoded byte to its Latin-1 codepoint. For percent-encoded multi-byte UTF-8 sequences (e.g. %E4%B8%AA = 个) this produces visible mojibake: bytes E4 B8 AA become the three Latin-1 chars `ä¸ª`, and the final String is the UTF-8 encoding of those three codepoints, not the original character. The sister module web_search.rs::percent_decode (line 490) already uses the correct pattern: collect bytes into Vec<u8>, then finalize with String::from_utf8_lossy. Align web_run.rs with that implementation. Affects DuckDuckGo redirect URLs containing non-ASCII paths, where normalize_search_url -> percent_decode previously corrupted the decoded URL before it was shown to the model and to the user. Add a regression test covering percent-encoded CJK input, raw UTF-8 input, and mixed ASCII+UTF-8. Co-authored-by: Elowen <xrnc@outlook.com>
This commit is contained in:
@@ -1608,7 +1608,7 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
|
||||
}
|
||||
|
||||
fn percent_decode(input: &str) -> String {
|
||||
let mut out = String::new();
|
||||
let mut out = Vec::with_capacity(input.len());
|
||||
let bytes = input.as_bytes();
|
||||
let mut idx = 0;
|
||||
while idx < bytes.len() {
|
||||
@@ -1617,14 +1617,14 @@ fn percent_decode(input: &str) -> String {
|
||||
&& let Ok(hex) = std::str::from_utf8(&bytes[idx + 1..idx + 3])
|
||||
&& let Ok(val) = u8::from_str_radix(hex, 16)
|
||||
{
|
||||
out.push(val as char);
|
||||
out.push(val);
|
||||
idx += 3;
|
||||
continue;
|
||||
}
|
||||
out.push(bytes[idx] as char);
|
||||
out.push(bytes[idx]);
|
||||
idx += 1;
|
||||
}
|
||||
out
|
||||
String::from_utf8_lossy(&out).into_owned()
|
||||
}
|
||||
|
||||
fn url_encode(input: &str) -> String {
|
||||
@@ -1711,6 +1711,22 @@ mod tests {
|
||||
assert_eq!(results[0].snippet.as_deref(), Some("A useful snippet."));
|
||||
}
|
||||
|
||||
/// Regression test for `percent_decode`: percent-encoded multi-byte UTF-8
/// sequences must decode to the original characters, raw UTF-8 must pass
/// through unchanged, and ASCII-only input must decode `%20` while leaving `+` literal.
#[test]
|
||||
fn percent_decode_handles_utf8_multibyte_sequences() {
|
||||
// Percent-encoded CJK: %E4%B8%AA%E4%BA%BA = 个人 (each glyph is 3 UTF-8 bytes).
|
||||
assert_eq!(percent_decode("Hello %E4%B8%AA%E4%BA%BA"), "Hello 个人");
|
||||
assert_eq!(percent_decode("%E7%B4%A0%E6%9D%90"), "素材");
|
||||
// Percent-encoded UTF-8 inside a URL path (DuckDuckGo `uddg=` redirect shape).
|
||||
assert_eq!(
|
||||
percent_decode("https://example.com/%E9%A1%B5%E9%9D%A2"),
|
||||
"https://example.com/页面"
|
||||
);
|
||||
// Raw UTF-8 in the input passes through unchanged.
|
||||
assert_eq!(percent_decode("查询 keyword"), "查询 keyword");
|
||||
// ASCII-only inputs preserve existing behavior; `+` stays literal.
|
||||
assert_eq!(percent_decode("foo+bar%20baz"), "foo+bar baz");
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scoped_ref_prefix_is_session_specific() {
|
||||
reset_web_run_state();
|
||||
|
||||
Reference in New Issue
Block a user