feat(web_search): add Sofya search provider
Harvested from PR #2790 by @yusufgurdogan.
This commit is contained in:
@@ -77,6 +77,10 @@ ChaceLyee2101 = ChaceLyee2101 <95995339+ChaceLyee2101@users.noreply.github.com>
|
||||
ci4ic4 = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
|
||||
Chavdar Ivanov = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
|
||||
ci4ic4@gmail.com = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
|
||||
yusufgurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
|
||||
Yusuf Gurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
|
||||
hotelswith = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
|
||||
contact@hotelswith.com = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
|
||||
AresNing = AresNing <49557311+AresNing@users.noreply.github.com>
|
||||
|
||||
shenjackyuanjie = shenjackyuanjie <54507071+shenjackyuanjie@users.noreply.github.com>
|
||||
|
||||
@@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
TUI sidebar from the command line instead of relying on copy-hostile sidebar
|
||||
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
|
||||
detailed report and @aboimpinto for the fix.
|
||||
- Added Sofya (`provider = "sofya"`) as a search-tool backend with
|
||||
`SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather
|
||||
than model-provider routing (#2790). Thanks @yusufgurdogan for the
|
||||
implementation.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
+6
-2
@@ -399,7 +399,7 @@ max_subagents = 10 # optional (1-20)
|
||||
# API-backed search.
|
||||
#
|
||||
# [search]
|
||||
# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine
|
||||
# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya
|
||||
# # duckduckgo: HTML scrape with Bing fallback
|
||||
# # bing: HTML scrape, no API key
|
||||
# # tavily: https://tavily.com — AI search, needs api_key
|
||||
@@ -409,8 +409,11 @@ max_subagents = 10 # optional (1-20)
|
||||
# # baidu: 百度 AI Search via qianfan.baidubce.com,需 api_key
|
||||
# # volcengine: 火山引擎 Ark web_search (免费 2 万次/月), 需 api_key
|
||||
# # 也回退到 VOLCENGINE_API_KEY / VOLCENGINE_ARK_API_KEY / ARK_API_KEY 环境变量
|
||||
# # sofya: https://sofya.co — AI search returning full page
|
||||
# # content (not snippets), needs api_key (ay_live_...);
|
||||
# # also falls back to the SOFYA_API_KEY env var
|
||||
# base_url = "https://search.example/html/" # optional DuckDuckGo-compatible HTML endpoint
|
||||
# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, and baidu; optional for metaso
|
||||
# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso
|
||||
# # WARNING: treat config.toml like a secret file when
|
||||
# # storing API keys. Prefer env vars for local smoke tests.
|
||||
#
|
||||
@@ -421,6 +424,7 @@ max_subagents = 10 # optional (1-20)
|
||||
# DEEPSEEK_SEARCH_BASE_URL → search.base_url (legacy alias)
|
||||
# METASO_API_KEY → metaso key fallback
|
||||
# BAIDU_SEARCH_API_KEY → baidu key fallback
|
||||
# SOFYA_API_KEY → sofya key fallback
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────────
|
||||
# Network Policy (#135)
|
||||
|
||||
@@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
TUI sidebar from the command line instead of relying on copy-hostile sidebar
|
||||
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
|
||||
detailed report and @aboimpinto for the fix.
|
||||
- Added Sofya (`provider = "sofya"`) as a search-tool backend with
|
||||
`SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather
|
||||
than model-provider routing (#2790). Thanks @yusufgurdogan for the
|
||||
implementation.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -1068,6 +1068,11 @@ pub enum SearchProvider {
|
||||
alias = "volc-ark"
|
||||
)]
|
||||
Volcengine,
|
||||
/// Sofya web search API (<https://sofya.co>). Requires api_key
|
||||
/// (`ay_live_...`). Returns full extracted page content rather than
|
||||
/// snippets; falls back to the `SOFYA_API_KEY` env var when
|
||||
/// `[search] api_key` is not set.
|
||||
Sofya,
|
||||
}
|
||||
|
||||
impl SearchProvider {
|
||||
@@ -1083,6 +1088,7 @@ impl SearchProvider {
|
||||
Some(Self::Baidu)
|
||||
}
|
||||
"volcengine" | "ark" | "volc" | "volcengine-ark" => Some(Self::Volcengine),
|
||||
"sofya" => Some(Self::Sofya),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -1097,6 +1103,7 @@ impl SearchProvider {
|
||||
Self::Metaso => "metaso",
|
||||
Self::Baidu => "baidu",
|
||||
Self::Volcengine => "volcengine",
|
||||
Self::Sofya => "sofya",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5647,6 +5654,29 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// An explicit `[search] provider = "sofya"` entry must deserialize to
/// `SearchProvider::Sofya` rather than being dropped or remapped.
#[test]
fn explicit_sofya_search_provider_is_preserved() {
    let raw = r#"
[search]
provider = "sofya"
"#;

    let config: Config = toml::from_str(raw).expect("sofya search config");
    let selected = config.search.and_then(|search| search.provider);

    assert_eq!(selected, Some(SearchProvider::Sofya));
}
|
||||
|
||||
/// `SearchProvider::parse` accepts the Sofya name in both casings exercised
/// here, and `as_str` renders the canonical lowercase spelling.
#[test]
fn sofya_search_provider_parses_and_round_trips() {
    for spelling in ["sofya", "Sofya"] {
        assert_eq!(SearchProvider::parse(spelling), Some(SearchProvider::Sofya));
    }

    assert_eq!(SearchProvider::Sofya.as_str(), "sofya");
}
|
||||
|
||||
#[test]
|
||||
fn search_provider_resolution_reports_default_source() {
|
||||
let _guard = lock_test_env();
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
//! Web search tool backed by multiple providers: Bing HTML scrape, DuckDuckGo
|
||||
//! (HTML scrape with Bing fallback), Tavily API, Bocha (博查) API,
|
||||
//! Metaso API (<https://metaso.cn>), Baidu AI Search, and Volcengine Ark.
|
||||
//! Metaso API (<https://metaso.cn>), Baidu AI Search, Volcengine Ark, and
|
||||
//! Sofya (<https://sofya.co>).
|
||||
//!
|
||||
//! This is the primary web search surface for agents. For browsing workflows
|
||||
//! (page open, click, screenshot) use a direct URL approach instead.
|
||||
//!
|
||||
//! Set `[search]` in config.toml to switch providers:
|
||||
//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine
|
||||
//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine/sofya
|
||||
//! base_url = "https://search.example/html/" # optional DDG-compatible URL
|
||||
//! api_key = "tvly-..."
|
||||
|
||||
@@ -30,6 +31,7 @@ const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search";
|
||||
const METASO_ENDPOINT: &str = "https://metaso.cn/api/v1";
|
||||
const BAIDU_ENDPOINT: &str = "https://qianfan.baidubce.com/v2/ai_search/web_search";
|
||||
const VOLCENGINE_RESPONSES_ENDPOINT: &str = "https://ark.cn-beijing.volces.com/api/v3/responses";
|
||||
const SOFYA_ENDPOINT: &str = "https://sofya.co/v1/search";
|
||||
/// Intentionally public default key provided by Metaso for open-source/community use.
|
||||
/// Last-resort fallback after config and env var. Rate-limited to ~100 searches/day.
|
||||
const METASO_DEFAULT_API_KEY: &str = "mk-E384C1DD5E8501BB7EFE27C949AFDE5B";
|
||||
@@ -140,7 +142,7 @@ impl ToolSpec for WebSearchTool {
|
||||
}
|
||||
|
||||
fn description(&self) -> &'static str {
|
||||
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
|
||||
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\" | \"volcengine\" | \"sofya\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
|
||||
}
|
||||
|
||||
fn input_schema(&self) -> Value {
|
||||
@@ -248,6 +250,13 @@ impl ToolSpec for WebSearchTool {
|
||||
.run_volcengine_search(&query, max_results, timeout_ms, context)
|
||||
.await;
|
||||
}
|
||||
SearchProvider::Sofya => {
|
||||
let decider = context.network_policy.as_ref();
|
||||
check_policy(decider, "sofya.co")?;
|
||||
return self
|
||||
.run_sofya_search(&query, max_results, timeout_ms, context)
|
||||
.await;
|
||||
}
|
||||
SearchProvider::Bing | SearchProvider::DuckDuckGo => {}
|
||||
}
|
||||
|
||||
@@ -485,6 +494,88 @@ impl WebSearchTool {
|
||||
ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string()))
|
||||
}
|
||||
|
||||
/// Search via Sofya web search API (<https://sofya.co>).
|
||||
///
|
||||
/// Sofya returns full extracted page content rather than snippets. The API
|
||||
/// key (`ay_live_...`) comes from `[search] api_key`, falling back to the
|
||||
/// `SOFYA_API_KEY` env var, and is sent as a `Bearer` token.
|
||||
async fn run_sofya_search(
|
||||
&self,
|
||||
query: &str,
|
||||
max_results: usize,
|
||||
timeout_ms: u64,
|
||||
context: &ToolContext,
|
||||
) -> Result<ToolResult, ToolError> {
|
||||
let env_key = std::env::var("SOFYA_API_KEY").ok();
|
||||
let api_key = context
|
||||
.search_api_key
|
||||
.as_deref()
|
||||
.or(env_key.as_deref())
|
||||
.ok_or_else(|| {
|
||||
ToolError::execution_failed(
|
||||
"Sofya search requires an API key. Set `[search] api_key = \"ay_live_...\"` in config.toml or the SOFYA_API_KEY env var.",
|
||||
)
|
||||
})?;
|
||||
|
||||
let client = crate::tls::reqwest_client_builder()
|
||||
.timeout(Duration::from_millis(timeout_ms))
|
||||
.build()
|
||||
.map_err(|e| {
|
||||
ToolError::execution_failed(format!("Failed to build HTTP client: {e}"))
|
||||
})?;
|
||||
|
||||
let payload = json!({
|
||||
"query": query,
|
||||
"max_results": max_results,
|
||||
});
|
||||
|
||||
let resp = client
|
||||
.post(SOFYA_ENDPOINT)
|
||||
.header("Content-Type", "application/json")
|
||||
.bearer_auth(api_key)
|
||||
.json(&payload)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ToolError::execution_failed(format!("Sofya search request failed: {e}"))
|
||||
})?;
|
||||
|
||||
let status = resp.status();
|
||||
let body = resp.text().await.map_err(|e| {
|
||||
ToolError::execution_failed(format!("Failed to read Sofya response: {e}"))
|
||||
})?;
|
||||
|
||||
if !status.is_success() {
|
||||
let truncated = truncate_error_body(&body);
|
||||
return Err(ToolError::execution_failed(format!(
|
||||
"Sofya search failed: HTTP {} — {truncated}",
|
||||
status.as_u16()
|
||||
)));
|
||||
}
|
||||
|
||||
let parsed: serde_json::Value = serde_json::from_str(&body).map_err(|e| {
|
||||
ToolError::execution_failed(format!("Failed to parse Sofya response: {e}"))
|
||||
})?;
|
||||
|
||||
let results = parse_sofya_results(&parsed, max_results);
|
||||
|
||||
let message = if results.is_empty() {
|
||||
"No results found".to_string()
|
||||
} else {
|
||||
format!("Found {} result(s)", results.len())
|
||||
};
|
||||
|
||||
let response = WebSearchResponse {
|
||||
query: query.to_string(),
|
||||
source: "sofya".to_string(),
|
||||
count: results.len(),
|
||||
message,
|
||||
results,
|
||||
};
|
||||
|
||||
ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string()))
|
||||
}
|
||||
|
||||
/// Search via Bocha AI Search API (<https://bochaai.com>).
|
||||
async fn run_bocha_search(
|
||||
&self,
|
||||
@@ -967,6 +1058,36 @@ fn baidu_error_message(parsed: &Value) -> Option<String> {
|
||||
Some(format!("Baidu search API error (code {code}: {message})"))
|
||||
}
|
||||
|
||||
fn parse_sofya_results(parsed: &Value, max_results: usize) -> Vec<WebSearchEntry> {
|
||||
parsed
|
||||
.get("results")
|
||||
.and_then(|v| v.as_array())
|
||||
.into_iter()
|
||||
.flat_map(|arr| arr.iter())
|
||||
.filter_map(|item| {
|
||||
let title = item.get("title")?.as_str()?.to_string();
|
||||
let url = item.get("url")?.as_str()?.to_string();
|
||||
let snippet = first_non_empty_string(item, &["content", "description"]);
|
||||
Some(WebSearchEntry {
|
||||
title,
|
||||
url,
|
||||
snippet,
|
||||
})
|
||||
})
|
||||
.take(max_results)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn first_non_empty_string(item: &Value, keys: &[&str]) -> Option<String> {
|
||||
keys.iter().find_map(|key| {
|
||||
item.get(*key)
|
||||
.and_then(Value::as_str)
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string)
|
||||
})
|
||||
}
|
||||
|
||||
fn baidu_search_payload(query: &str, max_results: usize) -> Value {
|
||||
json!({
|
||||
"messages": [
|
||||
@@ -1488,8 +1609,8 @@ mod tests {
|
||||
use super::{
|
||||
ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, baidu_search_payload,
|
||||
decode_html_entities, duckduckgo_search_url, extract_search_query, is_likely_spam_results,
|
||||
normalize_bing_url, optional_search_max_results, parse_baidu_results, root_domain,
|
||||
sanitize_error_body, truncate_error_body, volcengine_extract_text,
|
||||
normalize_bing_url, optional_search_max_results, parse_baidu_results, parse_sofya_results,
|
||||
root_domain, sanitize_error_body, truncate_error_body, volcengine_extract_text,
|
||||
};
|
||||
use serde_json::json;
|
||||
|
||||
@@ -1857,6 +1978,63 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// The snippet prefers `content`, but falls back to `description` when
/// `content` is null, empty, or whitespace-only; an item with neither field
/// is still returned, with no snippet.
#[test]
fn parse_sofya_results_falls_back_to_description_for_empty_content() {
    let body = json!({
        "results": [
            {
                "title": "Full content",
                "url": "https://example.com/full",
                "content": "full extracted page content",
                "description": "unused description"
            },
            {
                "title": "Null content",
                "url": "https://example.com/null",
                "content": null,
                "description": "description for null content"
            },
            {
                "title": "Empty content",
                "url": "https://example.com/empty",
                "content": "",
                "description": "description for empty content"
            },
            {
                "title": "Whitespace content",
                "url": "https://example.com/blank",
                "content": " ",
                "description": "description for blank content"
            },
            {
                "title": "No snippet",
                "url": "https://example.com/no-snippet"
            }
        ]
    });

    let results = parse_sofya_results(&body, 10);
    let snippets: Vec<Option<&str>> = results
        .iter()
        .map(|entry| entry.snippet.as_deref())
        .collect();

    // One comparison covers both the entry count and every snippet, in order.
    assert_eq!(
        snippets,
        vec![
            Some("full extracted page content"),
            Some("description for null content"),
            Some("description for empty content"),
            Some("description for blank content"),
            None,
        ]
    );
}
|
||||
|
||||
#[test]
|
||||
fn volcengine_extract_text_skips_non_text_content_blocks() {
|
||||
let body = json!({
|
||||
@@ -1952,6 +2130,42 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
async fn sofya_provider_without_api_key_surfaces_clear_error_not_silent_fallback() {
|
||||
// Same trust-boundary pin as Tavily/Bocha: opting into Sofya without a
|
||||
// key must surface a ToolError naming the provider, not silently fall
|
||||
// through to DuckDuckGo.
|
||||
use crate::config::SearchProvider;
|
||||
use crate::tools::spec::{ToolContext, ToolSpec};
|
||||
|
||||
// This test holds the process-env lock through the awaited tool
|
||||
// execution because the tool reads SOFYA_API_KEY during that call.
|
||||
let _guard = crate::test_support::lock_test_env();
|
||||
let prev = std::env::var_os("SOFYA_API_KEY");
|
||||
unsafe { std::env::remove_var("SOFYA_API_KEY") };
|
||||
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let mut ctx = ToolContext::new(tmp.path().to_path_buf());
|
||||
ctx.search_provider = SearchProvider::Sofya;
|
||||
ctx.search_api_key = None;
|
||||
let err = WebSearchTool
|
||||
.execute(json!({"query": "anything"}), &ctx)
|
||||
.await
|
||||
.expect_err("missing api_key must surface as ToolError");
|
||||
|
||||
match prev {
|
||||
Some(value) => unsafe { std::env::set_var("SOFYA_API_KEY", value) },
|
||||
None => unsafe { std::env::remove_var("SOFYA_API_KEY") },
|
||||
}
|
||||
|
||||
let msg = err.to_string();
|
||||
assert!(
|
||||
msg.contains("Sofya") && msg.contains("API key"),
|
||||
"error must name the provider and missing key; got `{msg}`"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
async fn volcengine_provider_without_api_key_lists_supported_env_fallbacks() {
|
||||
|
||||
@@ -1088,8 +1088,8 @@ Use `codewhale-tui features list` to inspect known flags and their effective sta
|
||||
`web_search` uses DuckDuckGo by default and does not require an API key. The
|
||||
DuckDuckGo path keeps a Bing fallback when DDG returns a bot challenge or no
|
||||
parseable results. Bing remains selectable for users who explicitly want it,
|
||||
and Tavily, Bocha, Metaso, or Baidu can be selected when an API-backed provider
|
||||
is preferred.
|
||||
and Tavily, Bocha, Metaso, Baidu, Volcengine, or Sofya can be selected when an
|
||||
API-backed provider is preferred.
|
||||
|
||||
For a private/internal search service that serves DuckDuckGo-compatible HTML,
|
||||
keep `provider = "duckduckgo"` and set `base_url`; CodeWhale appends the `q`
|
||||
@@ -1106,11 +1106,16 @@ set `METASO_API_KEY` or `[search] api_key` for a higher quota.
|
||||
`BAIDU_SEARCH_API_KEY` or `[search] api_key`. This is a search-tool backend
|
||||
only; it does not add a Baidu model provider.
|
||||
|
||||
**Sofya** ([sofya.co](https://sofya.co)) returns full extracted page content
|
||||
rather than snippets. Set `[search] api_key` to your `ay_live_...` key, or the
|
||||
`SOFYA_API_KEY` env var. This is a search-tool backend only; it does not add a
|
||||
Sofya model provider.
|
||||
|
||||
```toml
|
||||
[search]
|
||||
provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu
|
||||
provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya
|
||||
# base_url = "https://search.example/html/" # optional with provider = "duckduckgo"
|
||||
# api_key = "YOUR_KEY" # required for tavily, bocha, and baidu; optional for metaso
|
||||
# api_key = "YOUR_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso
|
||||
```
|
||||
|
||||
## Local Media Attachments
|
||||
|
||||
Reference in New Issue
Block a user