From 29a42ba31ab5e922ad52a66ecf35242ac46fa783 Mon Sep 17 00:00:00 2001 From: LinQ Date: Mon, 11 May 2026 00:14:55 +0100 Subject: [PATCH] fix(web_search): drop spam-stuffed SERPs in Bing/DDG fallback (#964) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #964 reports that `web_search` returns garbage results — every query in their reproduction case returned eight entries from a single low- quality forum domain (`*.forumgratuit.org`) regardless of input. The root cause is upstream: when Bing's scraping endpoint serves a stuffed page (often when our request looks too bot-like or the query falls into a degraded bucket), the parser happily extracts the b_algo entries and the model receives the junk as authoritative search results. Adds a `is_likely_spam_results` heuristic that runs after both Bing and DDG parsers. When 60% or more of the parsed entries share the same registrable root domain (with at least three entries to avoid false positives on legitimate two-link answers), the batch is discarded. The existing "no results" handling then surfaces a clean error message to the model instead of routing it toward spam. `root_domain` strips subdomains so `astralia.forumgratuit.org` and `russia.forumgratuit.org` collapse to `forumgratuit.org` for the purpose of dominance counting; eTLD+1 is approximated by keeping the last two labels, which is close enough for the threshold check. Five new tests cover the threshold (3-of-5 trips, 2-of-5 doesn't), short-batch passthrough, normal diverse SERPs (Wikipedia + SO + Reddit) staying through, and the precise spam reproduction from #964. Refs #964 Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tui/src/tools/web_search.rs | 163 ++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 1 deletion(-) diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index 889a26f6..4d3d2f58 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -382,6 +382,12 @@ fn parse_duckduckgo_results(html: &str, max_results: usize) -> Vec Vec { }); } + if is_likely_spam_results(&results) { + // Bing's scraping endpoint occasionally serves a stuffed page + // where the same low-quality domain owns most of the b_algo + // entries — #964 reported eight in a row from + // `astralia.forumgratuit.org` for unrelated queries. Treat the + // batch as "no results" so the caller surfaces a clean failure + // message instead of routing the model toward junk. + return Vec::new(); + } results } +/// Heuristic spam detector for scraped SERP HTML (#964). +/// +/// Returns `true` when one root domain owns at least 60% of the result +/// set and there are at least three results. A real-world top-five page +/// from Google/Bing/DDG mixes domains; a result page dominated by one +/// host is almost always SEO spam or a bot-detection-stuffed substitute. +fn is_likely_spam_results(results: &[WebSearchEntry]) -> bool { + if results.len() < 3 { + return false; + } + let mut counts: std::collections::HashMap = std::collections::HashMap::new(); + for r in results { + if let Some(host) = root_domain(&r.url) { + *counts.entry(host).or_insert(0) += 1; + } + } + let Some(&max) = counts.values().max() else { + return false; + }; + // 60% threshold: 3-of-5, 4-of-6, 5-of-8 all trip; 2-of-5 doesn't. + max * 5 >= results.len() * 3 +} + +/// Extract the registrable root domain (eTLD+1 approximation) from a URL +/// so spam detection groups `astralia.forumgratuit.org` with +/// `russia.forumgratuit.org`. Returns lowercase host minus the leftmost +/// label, or the bare host when there are only two labels. +fn root_domain(url: &str) -> Option { + let after_scheme = url.split_once("://").map(|(_, r)| r).unwrap_or(url); + let host = after_scheme.split(['/', '?', '#']).next()?; + let host = host.split('@').next_back()?; + let host = host.split(':').next()?.to_ascii_lowercase(); + if host.is_empty() { + return None; + } + let labels: Vec<&str> = host.split('.').filter(|s| !s.is_empty()).collect(); + if labels.len() <= 2 { + return Some(host); + } + Some(labels[labels.len().saturating_sub(2)..].join(".")) +} + fn normalize_url(href: &str) -> String { if let Some(uddg) = extract_query_param(href, "uddg") { let decoded = percent_decode(&uddg); @@ -564,9 +621,113 @@ fn extract_query_param(url: &str, key: &str) -> Option { #[cfg(test)] mod tests { - use super::{decode_html_entities, extract_search_query, optional_search_max_results}; + use super::{ + WebSearchEntry, decode_html_entities, extract_search_query, is_likely_spam_results, + optional_search_max_results, root_domain, + }; use serde_json::json; + fn entry(url: &str) -> WebSearchEntry { + WebSearchEntry { + title: "x".into(), + url: url.into(), + snippet: None, + } + } + + #[test] + fn root_domain_strips_subdomain_keeps_two_labels() { + assert_eq!( + root_domain("https://astralia.forumgratuit.org/path/page").as_deref(), + Some("forumgratuit.org"), + ); + assert_eq!( + root_domain("http://www.example.com/").as_deref(), + Some("example.com"), + ); + assert_eq!( + root_domain("https://example.com").as_deref(), + Some("example.com") + ); + } + + #[test] + fn root_domain_handles_port_and_userinfo() { + assert_eq!( + root_domain("http://user:pass@blog.example.com:8080/x").as_deref(), + Some("example.com"), + ); + } + + #[test] + fn root_domain_returns_none_for_garbage() { + assert!( + root_domain("not-a-url").as_deref().is_some(), + "bare token is treated as host" + ); + assert!(root_domain("https:///path").is_none()); + } + + #[test] + fn spam_detector_flags_single_domain_dominance() { + // #964 reproduction: 5/5 results from the same low-quality host. + let r = vec![ + entry("https://astralia.forumgratuit.org/page1"), + entry("https://russia.forumgratuit.org/page2"), + entry("https://other.forumgratuit.org/page3"), + entry("https://hello.forumgratuit.org/page4"), + entry("https://world.forumgratuit.org/page5"), + ]; + assert!(is_likely_spam_results(&r)); + } + + #[test] + fn spam_detector_passes_diverse_serp() { + // A normal SERP mixes domains; nothing flagged. + let r = vec![ + entry("https://example.com/a"), + entry("https://wikipedia.org/b"), + entry("https://stackoverflow.com/c"), + entry("https://reddit.com/d"), + entry("https://example.com/e"), + ]; + assert!(!is_likely_spam_results(&r)); + } + + #[test] + fn spam_detector_passes_short_result_set() { + // Two results from the same domain isn't enough signal — false + // positives on legitimate two-link answers (docs + homepage) + // would hurt more than letting them through. + let r = vec![ + entry("https://example.com/a"), + entry("https://example.com/b"), + ]; + assert!(!is_likely_spam_results(&r)); + } + + #[test] + fn spam_detector_threshold_is_sixty_percent() { + // 3-of-5 same domain trips the 60% threshold. + let r3of5 = vec![ + entry("https://spam.example.com/a"), + entry("https://spam.example.com/b"), + entry("https://spam.example.com/c"), + entry("https://other.com/d"), + entry("https://third.com/e"), + ]; + assert!(is_likely_spam_results(&r3of5)); + // 2-of-5 does NOT trip the threshold. + let r2of5 = vec![ + entry("https://spam.example.com/a"), + entry("https://spam.example.com/b"), + entry("https://other.com/c"), + entry("https://third.com/d"), + entry("https://fourth.com/e"), + ]; + assert!(!is_likely_spam_results(&r2of5)); + } + #[test] fn decode_html_entities_handles_named_entities() { assert_eq!(decode_html_entities("&"), "&");