From 29a42ba31ab5e922ad52a66ecf35242ac46fa783 Mon Sep 17 00:00:00 2001
From: LinQ <linzhiqin2003@users.noreply.github.com>
Date: Mon, 11 May 2026 00:14:55 +0100
Subject: [PATCH] fix(web_search): drop spam-stuffed SERPs in Bing/DDG fallback
 (#964)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#964 reports that `web_search` returns garbage results — every query
in their reproduction case returned eight entries from a single low-
quality forum domain (`*.forumgratuit.org`) regardless of input. The
root cause is upstream: when Bing's scraping endpoint serves a stuffed
page (often when our request looks too bot-like or the query falls
into a degraded bucket), the parser happily extracts the b_algo
entries and the model receives the junk as authoritative search
results.

Adds a `is_likely_spam_results` heuristic that runs after both Bing
and DDG parsers. When 60% or more of the parsed entries share the
same registrable root domain (with at least three entries to avoid
false positives on legitimate two-link answers), the batch is
discarded. The existing "no results" handling then surfaces a clean
error message to the model instead of routing it toward spam.

`root_domain` strips subdomains so `astralia.forumgratuit.org` and
`russia.forumgratuit.org` collapse to `forumgratuit.org` for the
purpose of dominance counting; eTLD+1 is approximated by keeping the
last two labels, which is close enough for the threshold check.

Five new tests cover the threshold (3-of-5 trips, 2-of-5 doesn't),
short-batch passthrough, normal diverse SERPs (Wikipedia + SO +
Reddit) staying through, and the precise spam reproduction from #964.

Refs #964
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tui/src/tools/web_search.rs | 163 ++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 1 deletion(-)
diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs
index 889a26f6..4d3d2f58 100644
--- a/crates/tui/src/tools/web_search.rs
+++ b/crates/tui/src/tools/web_search.rs
@@ -382,6 +382,12 @@ fn parse_duckduckgo_results(html: &str, max_results: usize) -> Vec<WebSearchEntr
         });
     }
 
+    if is_likely_spam_results(&results) {
+        // Same defence as the Bing path (#964): a DDG fallback page can
+        // also serve a single-domain stuffed result set when the upstream
+        // is degraded. Drop rather than mislead the model.
+        return Vec::new();
+    }
     results
 }
 
@@ -420,9 +426,60 @@ fn parse_bing_results(html: &str, max_results: usize) -> Vec<WebSearchEntry> {
         });
     }
 
+    if is_likely_spam_results(&results) {
+        // Bing's scraping endpoint occasionally serves a stuffed page
+        // where the same low-quality domain owns most of the b_algo
+        // entries — #964 reported eight in a row from
+        // `astralia.forumgratuit.org` for unrelated queries. Treat the
+        // batch as "no results" so the caller surfaces a clean failure
+        // message instead of routing the model toward junk.
+        return Vec::new();
+    }
     results
 }
 
+/// Heuristic spam detector for scraped SERP HTML (#964).
+///
+/// Returns `true` when one root domain owns at least 60% of the result
+/// set and there are at least three results. A real-world top-five page
+/// from Google/Bing/DDG mixes domains; a result page dominated by one
+/// host is almost always SEO spam or a bot-detection-stuffed substitute.
+fn is_likely_spam_results(results: &[WebSearchEntry]) -> bool {
+    if results.len() < 3 {
+        return false;
+    }
+    let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    for r in results {
+        if let Some(host) = root_domain(&r.url) {
+            *counts.entry(host).or_insert(0) += 1;
+        }
+    }
+    let Some(&max) = counts.values().max() else {
+        return false;
+    };
+    // 60% threshold: 3-of-5, 4-of-6, 5-of-8 all trip; 2-of-5 doesn't.
+    max * 5 >= results.len() * 3
+}
+
+/// Extract the registrable root domain (eTLD+1 approximation) from a URL
+/// so spam detection groups `astralia.forumgratuit.org` with
+/// `russia.forumgratuit.org`. Returns lowercase host minus the leftmost
+/// label, or the bare host when there are only two labels.
+fn root_domain(url: &str) -> Option<String> {
+    let after_scheme = url.split_once("://").map(|(_, r)| r).unwrap_or(url);
+    let host = after_scheme.split(['/', '?', '#']).next()?;
+    let host = host.split('@').next_back()?;
+    let host = host.split(':').next()?.to_ascii_lowercase();
+    if host.is_empty() {
+        return None;
+    }
+    let labels: Vec<&str> = host.split('.').filter(|s| !s.is_empty()).collect();
+    if labels.len() <= 2 {
+        return Some(host);
+    }
+    Some(labels[labels.len().saturating_sub(2)..].join("."))
+}
+
 fn normalize_url(href: &str) -> String {
     if let Some(uddg) = extract_query_param(href, "uddg") {
         let decoded = percent_decode(&uddg);
@@ -564,9 +621,113 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
 
 #[cfg(test)]
 mod tests {
-    use super::{decode_html_entities, extract_search_query, optional_search_max_results};
+    use super::{
+        WebSearchEntry, decode_html_entities, extract_search_query, is_likely_spam_results,
+        optional_search_max_results, root_domain,
+    };
     use serde_json::json;
 
+    fn entry(url: &str) -> WebSearchEntry {
+        WebSearchEntry {
+            title: "x".into(),
+            url: url.into(),
+            snippet: None,
+        }
+    }
+
+    #[test]
+    fn root_domain_strips_subdomain_keeps_two_labels() {
+        assert_eq!(
+            root_domain("https://astralia.forumgratuit.org/path/page").as_deref(),
+            Some("forumgratuit.org"),
+        );
+        assert_eq!(
+            root_domain("http://www.example.com/").as_deref(),
+            Some("example.com"),
+        );
+        assert_eq!(
+            root_domain("https://example.com").as_deref(),
+            Some("example.com")
+        );
+    }
+
+    #[test]
+    fn root_domain_handles_port_and_userinfo() {
+        assert_eq!(
+            root_domain("http://user:pass@blog.example.com:8080/x").as_deref(),
+            Some("example.com"),
+        );
+    }
+
+    #[test]
+    fn root_domain_returns_none_for_garbage() {
+        assert!(
+            root_domain("not-a-url").as_deref().is_some(),
+            "bare token is treated as host"
+        );
+        assert!(root_domain("https:///path").is_none());
+    }
+
+    #[test]
+    fn spam_detector_flags_single_domain_dominance() {
+        // #964 reproduction: 5/5 results from the same low-quality host.
+        let r = vec![
+            entry("https://astralia.forumgratuit.org/page1"),
+            entry("https://russia.forumgratuit.org/page2"),
+            entry("https://other.forumgratuit.org/page3"),
+            entry("https://hello.forumgratuit.org/page4"),
+            entry("https://world.forumgratuit.org/page5"),
+        ];
+        assert!(is_likely_spam_results(&r));
+    }
+
+    #[test]
+    fn spam_detector_passes_diverse_serp() {
+        // A normal SERP mixes domains; nothing flagged.
+        let r = vec![
+            entry("https://example.com/a"),
+            entry("https://wikipedia.org/b"),
+            entry("https://stackoverflow.com/c"),
+            entry("https://reddit.com/d"),
+            entry("https://example.com/e"),
+        ];
+        assert!(!is_likely_spam_results(&r));
+    }
+
+    #[test]
+    fn spam_detector_passes_short_result_set() {
+        // Two results from the same domain isn't enough signal — false
+        // positives on legitimate two-link answers (docs + homepage)
+        // would hurt more than letting them through.
+        let r = vec![
+            entry("https://example.com/a"),
+            entry("https://example.com/b"),
+        ];
+        assert!(!is_likely_spam_results(&r));
+    }
+
+    #[test]
+    fn spam_detector_threshold_is_sixty_percent() {
+        // 3-of-5 same domain trips the 60% threshold.
+        let r3of5 = vec![
+            entry("https://spam.example.com/a"),
+            entry("https://spam.example.com/b"),
+            entry("https://spam.example.com/c"),
+            entry("https://other.com/d"),
+            entry("https://third.com/e"),
+        ];
+        assert!(is_likely_spam_results(&r3of5));
+        // 2-of-5 does NOT trip the threshold.
+        let r2of5 = vec![
+            entry("https://spam.example.com/a"),
+            entry("https://spam.example.com/b"),
+            entry("https://other.com/c"),
+            entry("https://third.com/d"),
+            entry("https://fourth.com/e"),
+        ];
+        assert!(!is_likely_spam_results(&r2of5));
+    }
+
     #[test]
     fn decode_html_entities_handles_named_entities() {
         assert_eq!(decode_html_entities("&amp;"), "&");