fix(web_search): drop spam-stuffed SERPs in Bing/DDG fallback (#964)
#964 reports that `web_search` returns garbage results — every query in their reproduction case returned eight entries from a single low- quality forum domain (`*.forumgratuit.org`) regardless of input. The root cause is upstream: when Bing's scraping endpoint serves a stuffed page (often when our request looks too bot-like or the query falls into a degraded bucket), the parser happily extracts the b_algo entries and the model receives the junk as authoritative search results. Adds a `is_likely_spam_results` heuristic that runs after both Bing and DDG parsers. When 60% or more of the parsed entries share the same registrable root domain (with at least three entries to avoid false positives on legitimate two-link answers), the batch is discarded. The existing "no results" handling then surfaces a clean error message to the model instead of routing it toward spam. `root_domain` strips subdomains so `astralia.forumgratuit.org` and `russia.forumgratuit.org` collapse to `forumgratuit.org` for the purpose of dominance counting; eTLD+1 is approximated by keeping the last two labels, which is close enough for the threshold check. Five new tests cover the threshold (3-of-5 trips, 2-of-5 doesn't), short-batch passthrough, normal diverse SERPs (Wikipedia + SO + Reddit) staying through, and the precise spam reproduction from #964. Refs #964 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -382,6 +382,12 @@ fn parse_duckduckgo_results(html: &str, max_results: usize) -> Vec<WebSearchEntr
|
||||
});
|
||||
}
|
||||
|
||||
if is_likely_spam_results(&results) {
|
||||
// Same defence as the Bing path (#964): a DDG fallback page can
|
||||
// also serve a single-domain stuffed result set when the upstream
|
||||
// is degraded. Drop rather than mislead the model.
|
||||
return Vec::new();
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
@@ -420,9 +426,60 @@ fn parse_bing_results(html: &str, max_results: usize) -> Vec<WebSearchEntry> {
|
||||
});
|
||||
}
|
||||
|
||||
if is_likely_spam_results(&results) {
|
||||
// Bing's scraping endpoint occasionally serves a stuffed page
|
||||
// where the same low-quality domain owns most of the b_algo
|
||||
// entries — #964 reported eight in a row from
|
||||
// `astralia.forumgratuit.org` for unrelated queries. Treat the
|
||||
// batch as "no results" so the caller surfaces a clean failure
|
||||
// message instead of routing the model toward junk.
|
||||
return Vec::new();
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
/// Heuristic spam detector for scraped SERP HTML (#964).
|
||||
///
|
||||
/// Returns `true` when one root domain owns at least 60% of the result
|
||||
/// set and there are at least three results. A real-world top-five page
|
||||
/// from Google/Bing/DDG mixes domains; a result page dominated by one
|
||||
/// host is almost always SEO spam or a bot-detection-stuffed substitute.
|
||||
fn is_likely_spam_results(results: &[WebSearchEntry]) -> bool {
|
||||
if results.len() < 3 {
|
||||
return false;
|
||||
}
|
||||
let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
|
||||
for r in results {
|
||||
if let Some(host) = root_domain(&r.url) {
|
||||
*counts.entry(host).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
let Some(&max) = counts.values().max() else {
|
||||
return false;
|
||||
};
|
||||
// 60% threshold: 3-of-5, 4-of-6, 5-of-8 all trip; 2-of-5 doesn't.
|
||||
max * 5 >= results.len() * 3
|
||||
}
|
||||
|
||||
/// Extract the registrable root domain (eTLD+1 approximation) from a URL
|
||||
/// so spam detection groups `astralia.forumgratuit.org` with
|
||||
/// `russia.forumgratuit.org`. Returns lowercase host minus the leftmost
|
||||
/// label, or the bare host when there are only two labels.
|
||||
fn root_domain(url: &str) -> Option<String> {
|
||||
let after_scheme = url.split_once("://").map(|(_, r)| r).unwrap_or(url);
|
||||
let host = after_scheme.split(['/', '?', '#']).next()?;
|
||||
let host = host.split('@').next_back()?;
|
||||
let host = host.split(':').next()?.to_ascii_lowercase();
|
||||
if host.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let labels: Vec<&str> = host.split('.').filter(|s| !s.is_empty()).collect();
|
||||
if labels.len() <= 2 {
|
||||
return Some(host);
|
||||
}
|
||||
Some(labels[labels.len().saturating_sub(2)..].join("."))
|
||||
}
|
||||
|
||||
fn normalize_url(href: &str) -> String {
|
||||
if let Some(uddg) = extract_query_param(href, "uddg") {
|
||||
let decoded = percent_decode(&uddg);
|
||||
@@ -564,9 +621,113 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{decode_html_entities, extract_search_query, optional_search_max_results};
|
||||
use super::{
|
||||
WebSearchEntry, decode_html_entities, extract_search_query, is_likely_spam_results,
|
||||
optional_search_max_results, root_domain,
|
||||
};
|
||||
use serde_json::json;
|
||||
|
||||
fn entry(url: &str) -> WebSearchEntry {
|
||||
WebSearchEntry {
|
||||
title: "x".into(),
|
||||
url: url.into(),
|
||||
snippet: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn root_domain_strips_subdomain_keeps_two_labels() {
|
||||
assert_eq!(
|
||||
root_domain("https://astralia.forumgratuit.org/path/page").as_deref(),
|
||||
Some("forumgratuit.org"),
|
||||
);
|
||||
assert_eq!(
|
||||
root_domain("http://www.example.com/").as_deref(),
|
||||
Some("example.com"),
|
||||
);
|
||||
assert_eq!(
|
||||
root_domain("https://example.com").as_deref(),
|
||||
Some("example.com")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn root_domain_handles_port_and_userinfo() {
|
||||
assert_eq!(
|
||||
root_domain("http://user:pass@blog.example.com:8080/x").as_deref(),
|
||||
Some("example.com"),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn root_domain_returns_none_for_garbage() {
|
||||
assert!(
|
||||
root_domain("not-a-url").as_deref().is_some(),
|
||||
"bare token is treated as host"
|
||||
);
|
||||
assert!(root_domain("https:///path").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spam_detector_flags_single_domain_dominance() {
|
||||
// #964 reproduction: 5/5 results from the same low-quality host.
|
||||
let r = vec![
|
||||
entry("https://astralia.forumgratuit.org/page1"),
|
||||
entry("https://russia.forumgratuit.org/page2"),
|
||||
entry("https://other.forumgratuit.org/page3"),
|
||||
entry("https://hello.forumgratuit.org/page4"),
|
||||
entry("https://world.forumgratuit.org/page5"),
|
||||
];
|
||||
assert!(is_likely_spam_results(&r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spam_detector_passes_diverse_serp() {
|
||||
// A normal SERP mixes domains; nothing flagged.
|
||||
let r = vec![
|
||||
entry("https://example.com/a"),
|
||||
entry("https://wikipedia.org/b"),
|
||||
entry("https://stackoverflow.com/c"),
|
||||
entry("https://reddit.com/d"),
|
||||
entry("https://example.com/e"),
|
||||
];
|
||||
assert!(!is_likely_spam_results(&r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spam_detector_passes_short_result_set() {
|
||||
// Two results from the same domain isn't enough signal — false
|
||||
// positives on legitimate two-link answers (docs + homepage)
|
||||
// would hurt more than letting them through.
|
||||
let r = vec![
|
||||
entry("https://example.com/a"),
|
||||
entry("https://example.com/b"),
|
||||
];
|
||||
assert!(!is_likely_spam_results(&r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spam_detector_threshold_is_sixty_percent() {
|
||||
// 3-of-5 same domain trips the 60% threshold.
|
||||
let r3of5 = vec![
|
||||
entry("https://spam.example.com/a"),
|
||||
entry("https://spam.example.com/b"),
|
||||
entry("https://spam.example.com/c"),
|
||||
entry("https://other.com/d"),
|
||||
entry("https://third.com/e"),
|
||||
];
|
||||
assert!(is_likely_spam_results(&r3of5));
|
||||
// 2-of-5 does NOT trip the threshold.
|
||||
let r2of5 = vec![
|
||||
entry("https://spam.example.com/a"),
|
||||
entry("https://spam.example.com/b"),
|
||||
entry("https://other.com/c"),
|
||||
entry("https://third.com/d"),
|
||||
entry("https://fourth.com/e"),
|
||||
];
|
||||
assert!(!is_likely_spam_results(&r2of5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decode_html_entities_handles_named_entities() {
|
||||
assert_eq!(decode_html_entities("&"), "&");
|
||||
|
||||
Reference in New Issue
Block a user