fix(web_search): drop spam-stuffed SERPs in Bing/DDG fallback (#964)

#964 reports that `web_search` returns garbage results — every query in their reproduction case returned eight entries from a single low-quality forum domain (`*.forumgratuit.org`) regardless of input. The root cause is upstream: when Bing's scraping endpoint serves a stuffed page (often when our request looks too bot-like or the query falls into a degraded bucket), the parser happily extracts the b_algo entries and the model receives the junk as authoritative search results. Adds an `is_likely_spam_results` heuristic that runs at the end of both the Bing and DDG parsers. When 60% or more of the parsed entries share the same registrable root domain (with at least three entries to avoid false positives on legitimate two-link answers), the batch is discarded. The existing "no results" handling then surfaces a clean error message to the model instead of routing it toward spam. `root_domain` strips subdomains so `astralia.forumgratuit.org` and `russia.forumgratuit.org` collapse to `forumgratuit.org` for the purpose of dominance counting; eTLD+1 is approximated by keeping the last two labels, which is close enough for the threshold check. Seven new tests cover `root_domain` parsing (subdomain stripping, port and userinfo, malformed input), the threshold (3-of-5 trips, 2-of-5 doesn't), short-batch passthrough, normal diverse SERPs (Wikipedia + SO + Reddit) staying through, and the single-domain spam reproduction from #964. Refs #964 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 00:14:55 +01:00
parent f56f73e371
commit 29a42ba31a
1 changed files with 162 additions and 1 deletions
@@ -382,6 +382,12 @@ fn parse_duckduckgo_results(html: &str, max_results: usize) -> Vec<WebSearchEntr
        });
    }

+    if is_likely_spam_results(&results) {
+        // Same defence as the Bing path (#964): a DDG fallback page can
+        // also serve a single-domain stuffed result set when the upstream
+        // is degraded. Drop rather than mislead the model.
+        return Vec::new();
+    }
    results
 }

@@ -420,9 +426,60 @@ fn parse_bing_results(html: &str, max_results: usize) -> Vec<WebSearchEntry> {
        });
    }

+    if is_likely_spam_results(&results) {
+        // Bing's scraping endpoint occasionally serves a stuffed page
+        // where the same low-quality domain owns most of the b_algo
+        // entries — #964 reported eight in a row from
+        // `astralia.forumgratuit.org` for unrelated queries. Treat the
+        // batch as "no results" so the caller surfaces a clean failure
+        // message instead of routing the model toward junk.
+        return Vec::new();
+    }
    results
 }

+/// Decide whether a scraped SERP looks like single-domain spam (#964).
+///
+/// A batch is flagged when it holds three or more entries and the most
+/// frequent root domain (as computed by `root_domain`) accounts for at
+/// least 60% of all entries. Entries whose URL yields no root domain are
+/// left out of the tally but still count toward the batch size, which
+/// biases the check toward *not* flagging. The rationale is that a normal
+/// result page mixes domains, while one dominated by a single site is
+/// usually SEO spam or a bot-detection substitute page.
+fn is_likely_spam_results(results: &[WebSearchEntry]) -> bool {
+    use std::collections::HashMap;
+
+    let total = results.len();
+    if total < 3 {
+        // Too few entries to tell dominance apart from coincidence.
+        return false;
+    }
+
+    let mut tally: HashMap<String, usize> = HashMap::new();
+    for domain in results.iter().filter_map(|item| root_domain(&item.url)) {
+        *tally.entry(domain).or_default() += 1;
+    }
+
+    // Integer form of `top / total >= 0.6`, avoiding floats:
+    // 3-of-5, 4-of-6 and 5-of-8 trip; 2-of-5 does not.
+    match tally.into_values().max() {
+        Some(top) => top * 5 >= total * 3,
+        None => false,
+    }
+}
+
+/// Approximate the registrable root domain (eTLD+1) of a URL so spam
+/// detection groups `astralia.forumgratuit.org` with
+/// `russia.forumgratuit.org`.
+///
+/// The scheme, userinfo, port, path, query and fragment are dropped and
+/// the host is lowercased. Empty labels are ignored, so a trailing dot
+/// does not produce a distinct key. Normally the last two labels are
+/// kept; three are kept when the host ends in a common second-level
+/// suffix under a two-letter TLD (`co.uk`, `com.au`, ...), so unrelated
+/// sites such as `bbc.co.uk` and `theguardian.co.uk` are not merged into
+/// `co.uk`. IPv4 addresses and bracketed IPv6 literals are returned whole
+/// rather than truncated. This is a heuristic, not a Public Suffix List
+/// lookup.
+///
+/// Returns `None` when no host can be extracted (empty authority, a host
+/// made only of dots, or an unterminated `[` literal).
+fn root_domain(url: &str) -> Option<String> {
+    // Second-level labels that commonly act as registry suffixes under
+    // country-code TLDs. Deliberately short: false negatives only cost a
+    // slightly coarser grouping.
+    const SECOND_LEVEL_SUFFIXES: [&str; 7] = ["ac", "co", "com", "edu", "gov", "net", "org"];
+
+    let after_scheme = url.split_once("://").map_or(url, |(_, rest)| rest);
+    let authority = after_scheme.split(['/', '?', '#']).next()?;
+    // Everything up to the last `@` is userinfo.
+    let host_port = authority.rsplit('@').next()?;
+
+    // Bracketed IPv6 literal: the colons inside are not a port separator.
+    if let Some(rest) = host_port.strip_prefix('[') {
+        let (addr, _) = rest.split_once(']')?;
+        if addr.is_empty() {
+            return None;
+        }
+        return Some(format!("[{}]", addr.to_ascii_lowercase()));
+    }
+
+    let host = host_port.split(':').next()?.to_ascii_lowercase();
+    let labels: Vec<&str> = host.split('.').filter(|label| !label.is_empty()).collect();
+    // `split_last` yields `None` for an empty or dot-only host.
+    let (&tld, rest) = labels.split_last()?;
+
+    // An IPv4 address has no registrable domain; keep it intact so that
+    // different addresses are not merged by their last two octets.
+    let normalized = labels.join(".");
+    if normalized.parse::<std::net::Ipv4Addr>().is_ok() {
+        return Some(normalized);
+    }
+
+    let is_country_code = tld.len() == 2 && tld.bytes().all(|b| b.is_ascii_lowercase());
+    let has_suffix = rest
+        .last()
+        .map_or(false, |second| SECOND_LEVEL_SUFFIXES.contains(second));
+    let keep = if labels.len() >= 3 && is_country_code && has_suffix {
+        3
+    } else {
+        2
+    };
+    Some(labels[labels.len().saturating_sub(keep)..].join("."))
+}
+
 fn normalize_url(href: &str) -> String {
    if let Some(uddg) = extract_query_param(href, "uddg") {
        let decoded = percent_decode(&uddg);
@@ -564,9 +621,113 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {

 #[cfg(test)]
 mod tests {
-    use super::{decode_html_entities, extract_search_query, optional_search_max_results};
+    use super::{
+        WebSearchEntry, decode_html_entities, extract_search_query, is_likely_spam_results,
+        optional_search_max_results, root_domain,
+    };
    use serde_json::json;

+    /// Test helper: build a `WebSearchEntry` for `url` with a placeholder
+    /// title and no snippet, since the spam heuristic only reads the URL.
+    fn entry(url: &str) -> WebSearchEntry {
+        WebSearchEntry {
+            title: "x".into(),
+            url: url.into(),
+            snippet: None,
+        }
+    }
+
+    #[test]
+    fn root_domain_strips_subdomain_keeps_two_labels() {
+        // (input URL, expected root domain): a subdomain is dropped, `www.`
+        // is dropped, and a bare two-label host is returned unchanged.
+        let cases = [
+            ("https://astralia.forumgratuit.org/path/page", "forumgratuit.org"),
+            ("http://www.example.com/", "example.com"),
+            ("https://example.com", "example.com"),
+        ];
+        for (url, expected) in cases.iter().copied() {
+            assert_eq!(root_domain(url).as_deref(), Some(expected));
+        }
+    }
+
+    #[test]
+    fn root_domain_handles_port_and_userinfo() {
+        // Credentials before `@` and the `:8080` port must not leak into
+        // the extracted domain.
+        let extracted = root_domain("http://user:pass@blog.example.com:8080/x");
+        assert_eq!(extracted.as_deref(), Some("example.com"));
+    }
+
+    #[test]
+    fn root_domain_returns_none_for_garbage() {
+        // Despite the test name, a scheme-less bare token is not rejected:
+        // it is taken verbatim as the host. Pin the exact value rather than
+        // only checking that something was returned.
+        assert_eq!(
+            root_domain("not-a-url").as_deref(),
+            Some("not-a-url"),
+            "bare token is treated as host"
+        );
+        // An empty authority (`https:///path`) has no host to extract.
+        assert!(root_domain("https:///path").is_none());
+    }
+
+    #[test]
+    fn spam_detector_flags_single_domain_dominance() {
+        // Modelled on #964: every entry sits on a different subdomain of
+        // the same low-quality root domain, so dominance is 5/5.
+        let stuffed: Vec<WebSearchEntry> = [
+            "https://astralia.forumgratuit.org/page1",
+            "https://russia.forumgratuit.org/page2",
+            "https://other.forumgratuit.org/page3",
+            "https://hello.forumgratuit.org/page4",
+            "https://world.forumgratuit.org/page5",
+        ]
+        .iter()
+        .copied()
+        .map(entry)
+        .collect();
+        assert!(is_likely_spam_results(&stuffed));
+    }
+
+    #[test]
+    fn spam_detector_passes_diverse_serp() {
+        // Four distinct domains across five entries; the most frequent one
+        // (`example.com`, 2 of 5) stays below the 60% threshold.
+        let diverse: Vec<WebSearchEntry> = [
+            "https://example.com/a",
+            "https://wikipedia.org/b",
+            "https://stackoverflow.com/c",
+            "https://reddit.com/d",
+            "https://example.com/e",
+        ]
+        .iter()
+        .copied()
+        .map(entry)
+        .collect();
+        assert!(!is_likely_spam_results(&diverse));
+    }
+
+    #[test]
+    fn spam_detector_passes_short_result_set() {
+        // Fewer than three entries is too little signal: a legitimate
+        // two-link answer (docs + homepage) from one domain must pass, as
+        // a false positive there costs more than letting it through.
+        let pair: Vec<WebSearchEntry> = ["https://example.com/a", "https://example.com/b"]
+            .iter()
+            .copied()
+            .map(entry)
+            .collect();
+        assert!(!is_likely_spam_results(&pair));
+    }
+
+    #[test]
+    fn spam_detector_threshold_is_sixty_percent() {
+        let batch = |urls: &[&str]| -> Vec<WebSearchEntry> {
+            urls.iter().copied().map(entry).collect()
+        };
+
+        // Exactly 60% (3 of 5) from one domain is enough to flag the batch.
+        let at_threshold = batch(&[
+            "https://spam.example.com/a",
+            "https://spam.example.com/b",
+            "https://spam.example.com/c",
+            "https://other.com/d",
+            "https://third.com/e",
+        ]);
+        assert!(is_likely_spam_results(&at_threshold));
+
+        // 40% (2 of 5) stays below the threshold and must pass through.
+        let below_threshold = batch(&[
+            "https://spam.example.com/a",
+            "https://spam.example.com/b",
+            "https://other.com/c",
+            "https://third.com/d",
+            "https://fourth.com/e",
+        ]);
+        assert!(!is_likely_spam_results(&below_threshold));
+    }
+
    #[test]
    fn decode_html_entities_handles_named_entities() {
        assert_eq!(decode_html_entities("&amp;"), "&");