fix(web_search): decode HTML entities in Bing result URLs

Bing wraps every SERP result URL in a `/ck/a?...&u=<base64>` click-tracking
redirect, and in the raw HTML the separators are `&amp;` entities.
normalize_bing_url parsed the href without decoding entities first, so
extract_query_param looked for `u` while the actual key was `amp;u`. The
base64 redirect target was never recovered: every result collapsed to a
`bing.com` root domain, is_likely_spam_results rejected the whole batch,
and Bing — the default backend — returned zero results.

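Decode HTML entities before extracting the redirect target. Add a
regression test that passes an entity-encoded `/ck/a` href through
normalize_bing_url and checks that the real target URL is recovered.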
This commit is contained in:
hexin
2026-05-27 11:07:18 +08:00
committed by Hunter Bown
parent a800f7469e
commit d31caf40c5
+21 -2
View File
@@ -901,6 +901,14 @@ fn normalize_url(href: &str) -> String {
}
fn normalize_bing_url(href: &str) -> String {
// Bing wraps every SERP result URL in a `/ck/a?...&u=<base64>` click-tracking
// redirect, and in the raw HTML the separators are `&amp;` entities. Without
// decoding entities first, `extract_query_param` looks for `u` but the actual
// key is `amp;u`, so the real URL is never recovered: every result collapses to
// a `bing.com` root domain, which the spam heuristic then rejects — yielding
// zero results for the default Bing backend. Decode entities before parsing.
let href = decode_html_entities(href);
let href = href.as_str();
if let Some(encoded) = extract_query_param(href, "u") {
let decoded = percent_decode(&encoded);
let token = decoded.strip_prefix("a1").unwrap_or(&decoded);
@@ -1027,11 +1035,22 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
mod tests {
use super::{
ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, decode_html_entities,
extract_search_query, is_likely_spam_results, optional_search_max_results, root_domain,
sanitize_error_body, truncate_error_body,
extract_search_query, is_likely_spam_results, normalize_bing_url,
optional_search_max_results, root_domain, sanitize_error_body, truncate_error_body,
};
use serde_json::json;
// Regression test for entity-encoded Bing click-tracking links. In raw SERP
// HTML the `/ck/a` href separates its query parameters with `&amp;`, so
// normalize_bing_url has to decode HTML entities before it looks up the `u=`
// base64 payload. If it does not, the real target is never recovered, the
// result's root domain collapses to bing.com, and the spam check drops it
// (→ 0 results for the default Bing backend).
#[test]
fn bing_ckurl_with_html_entities_decodes_real_url() {
    let raw_href = "https://www.bing.com/ck/a?!&amp;&amp;p=abc&amp;u=a1aHR0cHM6Ly9ydXN0LWxhbmcub3JnLw&amp;ntb=1";
    let expected_target = "https://rust-lang.org/";
    let resolved = normalize_bing_url(raw_href);
    assert_eq!(resolved, expected_target);
}
fn entry(url: &str) -> WebSearchEntry {
WebSearchEntry {
title: "x".into(),