feat: add baidu web search backend

This commit is contained in:
jimmyzhuu
2026-05-30 10:42:18 +08:00
parent 54151a4bc9
commit d7e6c85db5
2 changed files with 311 additions and 7 deletions
+59 -2
View File
@@ -661,6 +661,9 @@ pub enum SearchProvider {
/// or `METASO_API_KEY` env var; configurable via `[search] api_key`.
#[serde(alias = "metaso")]
Metaso,
/// Baidu AI Search API (<https://qianfan.baidubce.com>). Requires api_key.
#[serde(alias = "baidu-search", alias = "baidu_ai_search")]
Baidu,
}
impl SearchProvider {
@@ -671,6 +674,9 @@ impl SearchProvider {
"duckduckgo" | "duck-duck-go" | "duck_duck_go" | "ddg" => Some(Self::DuckDuckGo),
"tavily" => Some(Self::Tavily),
"bocha" => Some(Self::Bocha),
"metaso" => Some(Self::Metaso),
"baidu" | "baidu-search" | "baidu_search" | "baidu-ai-search"
| "baidu_ai_search" => Some(Self::Baidu),
_ => None,
}
}
@@ -683,6 +689,7 @@ impl SearchProvider {
Self::Tavily => "tavily",
Self::Bocha => "bocha",
Self::Metaso => "metaso",
Self::Baidu => "baidu",
}
}
}
@@ -714,11 +721,12 @@ pub struct SearchProviderResolution {
/// Web search provider configuration (`[search]` table in config.toml).
#[derive(Debug, Clone, Deserialize, Default)]
pub struct SearchConfig {
    /// Search provider: `bing` | `duckduckgo` | `tavily` | `bocha` | `metaso` | `baidu`. Default: `duckduckgo`.
    #[serde(default)]
    pub provider: Option<SearchProvider>,
    /// API key for Tavily, Bocha, Metaso, or Baidu. Not required for Bing or DuckDuckGo.
    /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in default.
    /// Baidu also falls back to `BAIDU_SEARCH_API_KEY` env var.
    #[serde(default)]
    pub api_key: Option<String>,
}
@@ -4281,6 +4289,35 @@ mod tests {
);
}
/// An explicit `provider = "baidu"` under `[search]` must deserialize to
/// `SearchProvider::Baidu`.
#[test]
fn explicit_baidu_search_provider_is_preserved() {
    let parsed: Config = toml::from_str(
        r#"
[search]
provider = "baidu"
"#,
    )
    .expect("search config");

    let provider = parsed.search.and_then(|section| section.provider);

    assert_eq!(provider, Some(SearchProvider::Baidu));
}
/// Every spelling listed in the Baidu arm of `SearchProvider::parse` must
/// resolve to `SearchProvider::Baidu`, not only a sample of them.
#[test]
fn baidu_search_provider_aliases_parse() {
    let aliases = [
        "baidu",
        "baidu-search",
        "baidu_search",
        "baidu-ai-search",
        "baidu_ai_search",
    ];

    for alias in aliases {
        assert_eq!(
            SearchProvider::parse(alias),
            Some(SearchProvider::Baidu),
            "alias `{alias}` should resolve to the Baidu provider"
        );
    }
}
#[test]
fn search_provider_resolution_reports_default_source() {
let _guard = lock_test_env();
@@ -4334,6 +4371,26 @@ mod tests {
assert_eq!(resolution.source, SearchProviderSource::EnvOverride);
}
/// `DEEPSEEK_SEARCH_PROVIDER=baidu` must win over the provider written in
/// the `[search]` table, and the resolution must report the env override as
/// its source.
#[test]
fn search_provider_env_override_accepts_baidu() {
    let _guard = lock_test_env();
    let saved = env::var_os("DEEPSEEK_SEARCH_PROVIDER");
    // NOTE(review): presumably sound because `lock_test_env()` serializes
    // tests that mutate the process environment — confirm.
    unsafe { env::set_var("DEEPSEEK_SEARCH_PROVIDER", "baidu") };

    let parsed: Config = toml::from_str(
        r#"
[search]
provider = "duckduckgo"
"#,
    )
    .expect("search config");
    let resolved = parsed.search_provider_resolution();

    // Put the variable back before asserting so a failed assertion does not
    // leave the override in place.
    unsafe { EnvGuard::restore_var("DEEPSEEK_SEARCH_PROVIDER", saved) };

    assert_eq!(resolved.provider, SearchProvider::Baidu);
    assert_eq!(resolved.source, SearchProviderSource::EnvOverride);
}
#[test]
fn search_provider_resolution_ignores_invalid_env_override() {
let _guard = lock_test_env();
+252 -5
View File
@@ -1,6 +1,6 @@
//! Web search tool backed by multiple providers: Bing HTML scrape, DuckDuckGo
//! (HTML scrape with Bing fallback), Tavily API, Bocha (博查) API,
//! Metaso API (<https://metaso.cn>), and Baidu AI Search.
//!
//! This is the primary web search surface for agents. For browsing workflows
//! (page open, click, screenshot) use a direct URL approach instead.
@@ -27,6 +27,7 @@ const BING_HOST: &str = "www.bing.com";
const TAVILY_ENDPOINT: &str = "https://api.tavily.com/search";
const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search";
const METASO_ENDPOINT: &str = "https://metaso.cn/api/v1";
const BAIDU_ENDPOINT: &str = "https://qianfan.baidubce.com/v2/ai_search/web_search";
/// Intentionally public default key provided by Metaso for open-source/community use.
/// Last-resort fallback after config and env var. Rate-limited to ~100 searches/day.
const METASO_DEFAULT_API_KEY: &str = "mk-E384C1DD5E8501BB7EFE27C949AFDE5B";
@@ -57,6 +58,7 @@ static TAG_RE: OnceLock<Regex> = OnceLock::new();
static BING_RESULT_RE: OnceLock<Regex> = OnceLock::new();
static BING_TITLE_RE: OnceLock<Regex> = OnceLock::new();
static BING_SNIPPET_RE: OnceLock<Regex> = OnceLock::new();
static BEARER_TOKEN_RE: OnceLock<Regex> = OnceLock::new();
fn get_title_re() -> &'static Regex {
TITLE_RE.get_or_init(|| {
@@ -99,6 +101,13 @@ fn get_bing_snippet_re() -> &'static Regex {
})
}
/// Returns the shared regex that matches a `Bearer <token>` credential
/// (case-insensitive scheme), compiling it on first use.
fn get_bearer_token_re() -> &'static Regex {
    /// Builds the pattern; the literal is fixed, so a failure here is a bug.
    fn compile() -> Regex {
        Regex::new(r"(?i)\bBearer\s+[A-Za-z0-9._~+/=-]+")
            .expect("bearer token regex pattern is valid")
    }

    BEARER_TOKEN_RE.get_or_init(compile)
}
const DEFAULT_MAX_RESULTS: usize = 5;
const MAX_RESULTS: usize = 10;
const DEFAULT_TIMEOUT_MS: u64 = 15_000;
@@ -129,7 +138,7 @@ impl ToolSpec for WebSearchTool {
}
fn description(&self) -> &'static str {
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\"` in config.toml to switch backends. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
}
fn input_schema(&self) -> Value {
@@ -210,6 +219,13 @@ impl ToolSpec for WebSearchTool {
.run_metaso_search(&query, max_results, timeout_ms, context)
.await;
}
SearchProvider::Baidu => {
let decider = context.network_policy.as_ref();
check_policy(decider, "qianfan.baidubce.com")?;
return self
.run_baidu_search(&query, max_results, timeout_ms, context)
.await;
}
SearchProvider::Bing | SearchProvider::DuckDuckGo => {}
}
@@ -645,6 +661,88 @@ impl WebSearchTool {
search_tool_result(query.to_string(), "metaso", results, None)
}
/// Search via Baidu AI Search API (<https://qianfan.baidubce.com>).
///
/// The API key is taken from `context.search_api_key` first and from the
/// `BAIDU_SEARCH_API_KEY` environment variable second. Each candidate is
/// trimmed and blank candidates are skipped, so an empty config value does
/// not mask a usable env key and stray whitespace never reaches the
/// `Authorization` header.
///
/// # Errors
///
/// Returns `ToolError::execution_failed` when:
/// - no non-blank API key is available,
/// - the HTTP client cannot be built, the request fails, or the body cannot
///   be read,
/// - the response status is not a success (401/403 and 429 get dedicated
///   messages; other statuses include the `truncate_error_body` preview),
/// - the body is not valid JSON, or `baidu_error_message` finds an error in it.
async fn run_baidu_search(
    &self,
    query: &str,
    max_results: usize,
    timeout_ms: u64,
    context: &ToolContext,
) -> Result<ToolResult, ToolError> {
    let env_key = std::env::var("BAIDU_SEARCH_API_KEY").ok();
    // Config takes precedence over the environment; the first non-blank
    // candidate wins.
    let api_key = [context.search_api_key.as_deref(), env_key.as_deref()]
        .into_iter()
        .flatten()
        .map(str::trim)
        .find(|key| !key.is_empty())
        .ok_or_else(|| {
            ToolError::execution_failed(
                "Baidu search requires an API key. Set `BAIDU_SEARCH_API_KEY` or `[search] api_key` in config.toml.",
            )
        })?;

    let client = reqwest::Client::builder()
        .timeout(Duration::from_millis(timeout_ms))
        .build()
        .map_err(|e| {
            ToolError::execution_failed(format!("Failed to build HTTP client: {e}"))
        })?;

    let payload = json!({
        "messages": [
            {
                "role": "user",
                "content": query,
            }
        ],
        "search_source": "baidu_search",
        "resource_type_filter": [
            {
                "type": "web",
                "top_k": max_results,
            }
        ],
    });

    let resp = client
        .post(BAIDU_ENDPOINT)
        .header("Content-Type", "application/json")
        .header("Authorization", format!("Bearer {api_key}"))
        .json(&payload)
        .send()
        .await
        .map_err(|e| {
            ToolError::execution_failed(format!("Baidu search request failed: {e}"))
        })?;

    // Capture the status before consuming the response to read its body.
    let status = resp.status();
    let body = resp.text().await.map_err(|e| {
        ToolError::execution_failed(format!("Failed to read Baidu response: {e}"))
    })?;

    if !status.is_success() {
        let msg = match status.as_u16() {
            401 | 403 => "Baidu search API key rejected — check BAIDU_SEARCH_API_KEY or `[search] api_key` in config.toml".to_string(),
            429 => "Baidu search rate-limited — wait and retry, or check your Baidu AI Search quota".to_string(),
            _ => {
                let truncated = truncate_error_body(&body);
                format!("Baidu search failed: HTTP {}{truncated}", status.as_u16())
            }
        };
        return Err(ToolError::execution_failed(msg));
    }

    let parsed: serde_json::Value = serde_json::from_str(&body).map_err(|e| {
        ToolError::execution_failed(format!("Failed to parse Baidu response: {e}"))
    })?;

    // A success status can still carry an API-level error in the JSON body.
    if let Some(error) = baidu_error_message(&parsed) {
        return Err(ToolError::execution_failed(error));
    }

    let results = parse_baidu_results(&parsed, max_results);
    search_tool_result(query.to_string(), "baidu", results, None)
}
}
fn truncate_error_body(body: &str) -> String {
@@ -662,12 +760,69 @@ fn truncate_error_body(body: &str) -> String {
/// Cleans an upstream error body before it is embedded in an error message.
///
/// Three passes, in order:
/// 1. `strip_html_tags` removes markup.
/// 2. Control characters are dropped, except ASCII whitespace.
/// 3. Anything matching the bearer-token regex is replaced with
///    `Bearer [REDACTED]` so credentials echoed by the server are not shown.
fn sanitize_error_body(body: &str) -> String {
    let stripped = strip_html_tags(body);
    let visible: String = stripped
        .chars()
        .filter(|c| !c.is_control() || c.is_ascii_whitespace())
        .collect();
    // `replace_all` yields a `Cow`; `into_owned` converts it without going
    // through `Display`.
    get_bearer_token_re()
        .replace_all(&visible, "Bearer [REDACTED]")
        .into_owned()
}
fn parse_baidu_results(parsed: &Value, max_results: usize) -> Vec<WebSearchEntry> {
parsed
.get("references")
.and_then(|v| v.as_array())
.into_iter()
.flat_map(|arr| arr.iter())
.filter_map(|item| {
let title = item
.get("title")
.or_else(|| item.get("name"))
.and_then(|s| s.as_str())?
.trim();
let url = item
.get("url")
.or_else(|| item.get("link"))
.and_then(|s| s.as_str())?
.trim();
if title.is_empty() || url.is_empty() {
return None;
}
let snippet = item
.get("content")
.or_else(|| item.get("snippet"))
.or_else(|| item.get("summary"))
.and_then(|s| s.as_str())
.map(str::trim)
.filter(|s| !s.is_empty())
.map(ToString::to_string);
Some(WebSearchEntry {
title: title.to_string(),
url: url.to_string(),
snippet,
})
})
.take(max_results)
.collect()
}
fn baidu_error_message(parsed: &Value) -> Option<String> {
let code = parsed
.get("error_code")
.or_else(|| parsed.get("code"))
.and_then(|v| v.as_i64())?;
if code == 0 {
return None;
}
let message = parsed
.get("error_msg")
.or_else(|| parsed.get("message"))
.and_then(|v| v.as_str())
.unwrap_or("unknown error");
Some(format!("Baidu search API error (code {code}: {message})"))
}
fn extract_search_query(input: &Value) -> Result<String, ToolError> {
for key in ["query", "q"] {
if let Some(value) = input.get(key) {
@@ -1028,7 +1183,7 @@ mod tests {
use super::{
ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, decode_html_entities,
extract_search_query, is_likely_spam_results, optional_search_max_results, root_domain,
sanitize_error_body, truncate_error_body,
parse_baidu_results, sanitize_error_body, truncate_error_body,
};
use serde_json::json;
@@ -1295,6 +1450,69 @@ mod tests {
assert_eq!(sanitized, "error");
}
/// A bearer credential echoed back in an error body must be replaced by the
/// redaction marker and must not survive in the sanitized text.
#[test]
fn sanitize_error_body_redacts_bearer_tokens() {
    let secret = "bce-v3/ALTAK-example/secret";
    let body = format!(r#"{{"error":"bad token","authorization":"Bearer {secret}"}}"#);

    let sanitized = sanitize_error_body(&body);

    assert!(!sanitized.contains(secret));
    assert!(sanitized.contains("Bearer [REDACTED]"));
}
/// Well-formed references come back in response order with title, URL, and
/// a snippet taken from either `content` or `snippet`.
#[test]
fn parse_baidu_references_extracts_ranked_results() {
    let response = json!({
        "references": [
            {
                "title": "Rust 官方文档",
                "url": "https://www.rust-lang.org/",
                "content": "Rust 是一门注重性能和可靠性的语言。"
            },
            {
                "title": "Cargo Book",
                "url": "https://doc.rust-lang.org/cargo/",
                "snippet": "Cargo is Rust's package manager."
            }
        ]
    });

    let entries = parse_baidu_results(&response, 10);
    let observed: Vec<(&str, &str, Option<&str>)> = entries
        .iter()
        .map(|entry| {
            (
                entry.title.as_str(),
                entry.url.as_str(),
                entry.snippet.as_deref(),
            )
        })
        .collect();

    assert_eq!(
        observed,
        vec![
            (
                "Rust 官方文档",
                "https://www.rust-lang.org/",
                Some("Rust 是一门注重性能和可靠性的语言。"),
            ),
            (
                "Cargo Book",
                "https://doc.rust-lang.org/cargo/",
                Some("Cargo is Rust's package manager."),
            ),
        ]
    );
}
/// References missing a title or a URL are dropped; a reference without any
/// snippet field is kept with `snippet == None`.
#[test]
fn parse_baidu_references_skips_incomplete_entries() {
    let response = json!({
        "references": [
            {"title": "No URL", "content": "missing url"},
            {"url": "https://example.com/no-title", "content": "missing title"},
            {"title": "Valid", "url": "https://example.com/valid"}
        ]
    });

    let entries = parse_baidu_results(&response, 10);
    let observed: Vec<(&str, &str, Option<&str>)> = entries
        .iter()
        .map(|entry| {
            (
                entry.title.as_str(),
                entry.url.as_str(),
                entry.snippet.as_deref(),
            )
        })
        .collect();

    assert_eq!(observed, vec![("Valid", "https://example.com/valid", None)]);
}
#[tokio::test]
async fn tavily_provider_without_api_key_surfaces_clear_error_not_silent_fallback() {
// Trust-boundary pin: if a user has opted into Tavily but
@@ -1341,6 +1559,35 @@ mod tests {
);
}
/// With the Baidu provider selected and no key in config or environment,
/// `execute` must fail with an error naming the provider and the missing
/// key rather than falling back to another backend.
#[tokio::test]
async fn baidu_provider_without_api_key_surfaces_clear_error_not_silent_fallback() {
    use crate::config::SearchProvider;
    use crate::tools::spec::{ToolContext, ToolSpec};

    /// Puts `BAIDU_SEARCH_API_KEY` back to its saved state on drop, so the
    /// variable is restored even when an assertion below panics.
    struct RestoreBaiduKey(Option<std::ffi::OsString>);

    impl Drop for RestoreBaiduKey {
        fn drop(&mut self) {
            // NOTE(review): same environment-mutation caveat as the
            // `remove_var` call in the test body.
            match self.0.take() {
                Some(value) => unsafe { std::env::set_var("BAIDU_SEARCH_API_KEY", value) },
                None => unsafe { std::env::remove_var("BAIDU_SEARCH_API_KEY") },
            }
        }
    }

    let _restore = RestoreBaiduKey(std::env::var_os("BAIDU_SEARCH_API_KEY"));
    // NOTE(review): no environment lock is visible in this module; this
    // relies on no other test touching the variable concurrently — confirm.
    unsafe { std::env::remove_var("BAIDU_SEARCH_API_KEY") };

    let tmp = tempfile::tempdir().expect("tempdir");
    let mut ctx = ToolContext::new(tmp.path().to_path_buf());
    ctx.search_provider = SearchProvider::Baidu;
    ctx.search_api_key = None;

    let err = WebSearchTool
        .execute(json!({"query": "anything"}), &ctx)
        .await
        .expect_err("missing api_key must surface as ToolError");

    let msg = err.to_string();
    assert!(
        msg.contains("Baidu") && msg.contains("API key"),
        "error must name the provider and missing key; got `{msg}`"
    );
}
#[tokio::test]
async fn metaso_provider_uses_built_in_key_when_no_config_key_set() {
// Unlike Tavily/Bocha, Metaso falls back to a built-in default, so