Merge pull request #2797 from Hmbown/codex/harvest-2790-sofya-search

feat(web_search): add Sofya search provider
This commit is contained in:
Hunter Bown
2026-06-05 08:30:06 -07:00
committed by GitHub
7 changed files with 276 additions and 11 deletions
+4
View File
@@ -77,6 +77,10 @@ ChaceLyee2101 = ChaceLyee2101 <95995339+ChaceLyee2101@users.noreply.github.com>
ci4ic4 = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
Chavdar Ivanov = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
ci4ic4@gmail.com = ci4ic4 <6495973+ci4ic4@users.noreply.github.com>
yusufgurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
Yusuf Gurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
hotelswith = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
contact@hotelswith.com = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com>
AresNing = AresNing <49557311+AresNing@users.noreply.github.com>
shenjackyuanjie = shenjackyuanjie <54507071+shenjackyuanjie@users.noreply.github.com>
+4
View File
@@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
TUI sidebar from the command line instead of relying on copy-hostile sidebar
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
detailed report and @aboimpinto for the fix.
- Added Sofya (`provider = "sofya"`) as a search-tool backend with
`SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather
than model-provider routing (#2790). Thanks @yusufgurdogan for the
implementation.
### Changed
+6 -2
View File
@@ -399,7 +399,7 @@ max_subagents = 10 # optional (1-20)
# API-backed search.
#
# [search]
# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine
# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya
# # duckduckgo: HTML scrape with Bing fallback
# # bing: HTML scrape, no API key
# # tavily: https://tavily.com — AI search, needs api_key
@@ -409,8 +409,11 @@ max_subagents = 10 # optional (1-20)
# # baidu: 百度 AI Search via qianfan.baidubce.com,需 api_key
# # volcengine: 火山引擎 Ark web_search (免费 2 万次/月), 需 api_key
# # 也回退到 VOLCENGINE_API_KEY / VOLCENGINE_ARK_API_KEY / ARK_API_KEY 环境变量
# # sofya: https://sofya.co — AI search returning full page
# # content (not snippets), needs api_key (ay_live_...);
# # also falls back to the SOFYA_API_KEY env var
# base_url = "https://search.example/html/" # optional DuckDuckGo-compatible HTML endpoint
# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, and baidu; optional for metaso
# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso
# # WARNING: treat config.toml like a secret file when
# # storing API keys. Prefer env vars for local smoke tests.
#
@@ -421,6 +424,7 @@ max_subagents = 10 # optional (1-20)
# DEEPSEEK_SEARCH_BASE_URL → search.base_url (legacy alias)
# METASO_API_KEY → metaso key fallback
# BAIDU_SEARCH_API_KEY → baidu key fallback
# SOFYA_API_KEY → sofya key fallback
# ─────────────────────────────────────────────────────────────────────────────────
# Network Policy (#135)
+4
View File
@@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
TUI sidebar from the command line instead of relying on copy-hostile sidebar
state during long transcript work (#2766, #2788). Thanks @mo-vic for the
detailed report and @aboimpinto for the fix.
- Added Sofya (`provider = "sofya"`) as a search-tool backend with
`SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather
than model-provider routing (#2790). Thanks @yusufgurdogan for the
implementation.
### Changed
+30
View File
@@ -1068,6 +1068,11 @@ pub enum SearchProvider {
alias = "volc-ark"
)]
Volcengine,
/// Sofya web search API (<https://sofya.co>). Requires api_key
/// (`ay_live_...`). Returns full extracted page content rather than
/// snippets; falls back to the `SOFYA_API_KEY` env var when
/// `[search] api_key` is not set.
Sofya,
}
impl SearchProvider {
@@ -1083,6 +1088,7 @@ impl SearchProvider {
Some(Self::Baidu)
}
"volcengine" | "ark" | "volc" | "volcengine-ark" => Some(Self::Volcengine),
"sofya" => Some(Self::Sofya),
_ => None,
}
}
@@ -1097,6 +1103,7 @@ impl SearchProvider {
Self::Metaso => "metaso",
Self::Baidu => "baidu",
Self::Volcengine => "volcengine",
Self::Sofya => "sofya",
}
}
}
@@ -5647,6 +5654,29 @@ mod tests {
);
}
/// A `[search]` table that names `sofya` must deserialize to the `Sofya`
/// variant rather than being dropped or mapped to another provider.
#[test]
fn explicit_sofya_search_provider_is_preserved() {
    let source = r#"
[search]
provider = "sofya"
"#;

    let parsed: Config = toml::from_str(source).expect("sofya search config");
    let provider = parsed.search.and_then(|section| section.provider);

    assert_eq!(provider, Some(SearchProvider::Sofya));
}
/// `SearchProvider::parse` accepts both the lowercase and capitalized
/// spelling, and `as_str` renders the canonical lowercase name.
#[test]
fn sofya_search_provider_parses_and_round_trips() {
    let expected = Some(SearchProvider::Sofya);

    let lowercase = SearchProvider::parse("sofya");
    assert_eq!(lowercase, expected);

    let capitalized = SearchProvider::parse("Sofya");
    assert_eq!(capitalized, expected);

    let canonical = SearchProvider::Sofya.as_str();
    assert_eq!(canonical, "sofya");
}
#[test]
fn search_provider_resolution_reports_default_source() {
let _guard = lock_test_env();
+219 -5
View File
@@ -1,12 +1,13 @@
//! Web search tool backed by multiple providers: Bing HTML scrape, DuckDuckGo
//! (HTML scrape with Bing fallback), Tavily API, Bocha (博查) API,
//! Metaso API (<https://metaso.cn>), Baidu AI Search, and Volcengine Ark.
//! Metaso API (<https://metaso.cn>), Baidu AI Search, Volcengine Ark, and
//! Sofya (<https://sofya.co>).
//!
//! This is the primary web search surface for agents. For browsing workflows
//! (page open, click, screenshot) use a direct URL approach instead.
//!
//! Set `[search]` in config.toml to switch providers:
//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine
//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine/sofya
//! base_url = "https://search.example/html/" # optional DDG-compatible URL
//! api_key = "tvly-..."
@@ -30,6 +31,7 @@ const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search";
const METASO_ENDPOINT: &str = "https://metaso.cn/api/v1";
const BAIDU_ENDPOINT: &str = "https://qianfan.baidubce.com/v2/ai_search/web_search";
const VOLCENGINE_RESPONSES_ENDPOINT: &str = "https://ark.cn-beijing.volces.com/api/v3/responses";
const SOFYA_ENDPOINT: &str = "https://sofya.co/v1/search";
/// Intentionally public default key provided by Metaso for open-source/community use.
/// Last-resort fallback after config and env var. Rate-limited to ~100 searches/day.
const METASO_DEFAULT_API_KEY: &str = "mk-E384C1DD5E8501BB7EFE27C949AFDE5B";
@@ -140,7 +142,7 @@ impl ToolSpec for WebSearchTool {
}
fn description(&self) -> &'static str {
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
"Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\" | \"volcengine\" | \"sofya\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly."
}
fn input_schema(&self) -> Value {
@@ -248,6 +250,13 @@ impl ToolSpec for WebSearchTool {
.run_volcengine_search(&query, max_results, timeout_ms, context)
.await;
}
SearchProvider::Sofya => {
let decider = context.network_policy.as_ref();
check_policy(decider, "sofya.co")?;
return self
.run_sofya_search(&query, max_results, timeout_ms, context)
.await;
}
SearchProvider::Bing | SearchProvider::DuckDuckGo => {}
}
@@ -485,6 +494,88 @@ impl WebSearchTool {
ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string()))
}
/// Search via the Sofya web search API (<https://sofya.co>).
///
/// Sofya returns full extracted page content rather than snippets. The API
/// key (`ay_live_...`) comes from `[search] api_key`, falling back to the
/// `SOFYA_API_KEY` env var, and is sent as a `Bearer` token. A key that is
/// empty or whitespace-only is treated as missing, so a blank config value
/// does not shadow a usable env var and a blank env var does not turn into an
/// opaque HTTP authentication failure.
///
/// # Errors
///
/// Returns `ToolError::execution_failed` when no usable API key is available,
/// when the HTTP client cannot be built, when the request or the body read
/// fails, when the server answers with a non-success status, or when the body
/// is not valid JSON.
async fn run_sofya_search(
    &self,
    query: &str,
    max_results: usize,
    timeout_ms: u64,
    context: &ToolContext,
) -> Result<ToolResult, ToolError> {
    let env_key = std::env::var("SOFYA_API_KEY").ok();

    // Config wins over the environment, but only when it actually holds a key.
    let configured_key = context
        .search_api_key
        .as_deref()
        .map(str::trim)
        .filter(|key| !key.is_empty());
    let fallback_key = env_key
        .as_deref()
        .map(str::trim)
        .filter(|key| !key.is_empty());
    let api_key = configured_key.or(fallback_key).ok_or_else(|| {
        ToolError::execution_failed(
            "Sofya search requires an API key. Set `[search] api_key = \"ay_live_...\"` in config.toml or the SOFYA_API_KEY env var.",
        )
    })?;

    let client = crate::tls::reqwest_client_builder()
        .timeout(Duration::from_millis(timeout_ms))
        .build()
        .map_err(|e| {
            ToolError::execution_failed(format!("Failed to build HTTP client: {e}"))
        })?;

    let payload = json!({
        "query": query,
        "max_results": max_results,
    });

    let resp = client
        .post(SOFYA_ENDPOINT)
        .header("Content-Type", "application/json")
        .bearer_auth(api_key)
        .json(&payload)
        .send()
        .await
        .map_err(|e| {
            ToolError::execution_failed(format!("Sofya search request failed: {e}"))
        })?;

    // Read the body before checking the status so a failure can quote it.
    let status = resp.status();
    let body = resp.text().await.map_err(|e| {
        ToolError::execution_failed(format!("Failed to read Sofya response: {e}"))
    })?;

    if !status.is_success() {
        let truncated = truncate_error_body(&body);
        return Err(ToolError::execution_failed(format!(
            "Sofya search failed: HTTP {} — {truncated}",
            status.as_u16()
        )));
    }

    let parsed: serde_json::Value = serde_json::from_str(&body).map_err(|e| {
        ToolError::execution_failed(format!("Failed to parse Sofya response: {e}"))
    })?;

    // Cap client-side as well, in case the service returns more than requested.
    let results = parse_sofya_results(&parsed, max_results);
    let message = if results.is_empty() {
        "No results found".to_string()
    } else {
        format!("Found {} result(s)", results.len())
    };

    let response = WebSearchResponse {
        query: query.to_string(),
        source: "sofya".to_string(),
        count: results.len(),
        message,
        results,
    };
    ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string()))
}
/// Search via Bocha AI Search API (<https://bochaai.com>).
async fn run_bocha_search(
&self,
@@ -967,6 +1058,36 @@ fn baidu_error_message(parsed: &Value) -> Option<String> {
Some(format!("Baidu search API error (code {code}: {message})"))
}
fn parse_sofya_results(parsed: &Value, max_results: usize) -> Vec<WebSearchEntry> {
parsed
.get("results")
.and_then(|v| v.as_array())
.into_iter()
.flat_map(|arr| arr.iter())
.filter_map(|item| {
let title = item.get("title")?.as_str()?.to_string();
let url = item.get("url")?.as_str()?.to_string();
let snippet = first_non_empty_string(item, &["content", "description"]);
Some(WebSearchEntry {
title,
url,
snippet,
})
})
.take(max_results)
.collect()
}
/// Returns the trimmed value of the first key in `keys` whose entry in `item`
/// is a string with non-whitespace content.
///
/// Keys that are missing, hold a non-string value (including `null`), or hold
/// only whitespace are passed over. Returns `None` when no key qualifies.
fn first_non_empty_string(item: &Value, keys: &[&str]) -> Option<String> {
    for key in keys {
        let Some(raw) = item.get(*key).and_then(Value::as_str) else {
            continue;
        };
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            return Some(trimmed.to_string());
        }
    }
    None
}
fn baidu_search_payload(query: &str, max_results: usize) -> Value {
json!({
"messages": [
@@ -1488,8 +1609,8 @@ mod tests {
use super::{
ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, baidu_search_payload,
decode_html_entities, duckduckgo_search_url, extract_search_query, is_likely_spam_results,
normalize_bing_url, optional_search_max_results, parse_baidu_results, root_domain,
sanitize_error_body, truncate_error_body, volcengine_extract_text,
normalize_bing_url, optional_search_max_results, parse_baidu_results, parse_sofya_results,
root_domain, sanitize_error_body, truncate_error_body, volcengine_extract_text,
};
use serde_json::json;
@@ -1857,6 +1978,63 @@ mod tests {
);
}
/// `content` is preferred for the snippet; when it is null, empty, or
/// whitespace-only the parser falls back to `description`, and an item with
/// neither field is still returned with no snippet.
#[test]
fn parse_sofya_results_falls_back_to_description_for_empty_content() {
    let body = json!({
        "results": [
            {
                "title": "Full content",
                "url": "https://example.com/full",
                "content": "full extracted page content",
                "description": "unused description"
            },
            {
                "title": "Null content",
                "url": "https://example.com/null",
                "content": null,
                "description": "description for null content"
            },
            {
                "title": "Empty content",
                "url": "https://example.com/empty",
                "content": "",
                "description": "description for empty content"
            },
            {
                "title": "Whitespace content",
                "url": "https://example.com/blank",
                "content": " ",
                "description": "description for blank content"
            },
            {
                "title": "No snippet",
                "url": "https://example.com/no-snippet"
            }
        ]
    });

    let results = parse_sofya_results(&body, 10);
    assert_eq!(results.len(), 5);

    // Compare every snippet in response order in a single assertion.
    let snippets: Vec<Option<&str>> = results
        .iter()
        .map(|entry| entry.snippet.as_deref())
        .collect();
    let expected = vec![
        Some("full extracted page content"),
        Some("description for null content"),
        Some("description for empty content"),
        Some("description for blank content"),
        None,
    ];
    assert_eq!(snippets, expected);
}
#[test]
fn volcengine_extract_text_skips_non_text_content_blocks() {
let body = json!({
@@ -1952,6 +2130,42 @@ mod tests {
);
}
/// Same trust-boundary pin as Tavily/Bocha: opting into Sofya without a key
/// must surface a `ToolError` naming the provider, not silently fall through
/// to DuckDuckGo.
///
/// The process environment is restored before any assertion runs, so a
/// failing assertion cannot leak the cleared `SOFYA_API_KEY` into later tests.
#[tokio::test]
// The env lock is deliberately held across the awaited tool execution because
// the tool reads SOFYA_API_KEY during that call.
#[allow(clippy::await_holding_lock)]
async fn sofya_provider_without_api_key_surfaces_clear_error_not_silent_fallback() {
    use crate::config::SearchProvider;
    use crate::tools::spec::{ToolContext, ToolSpec};

    // Build every fixture that can panic before touching the environment.
    let tmp = tempfile::tempdir().expect("tempdir");
    let mut ctx = ToolContext::new(tmp.path().to_path_buf());
    ctx.search_provider = SearchProvider::Sofya;
    ctx.search_api_key = None;

    let _guard = crate::test_support::lock_test_env();
    let prev = std::env::var_os("SOFYA_API_KEY");
    // SAFETY: the test-env lock is held for the whole mutation window;
    // presumably every env-touching test takes the same lock (verify in
    // `test_support`).
    unsafe { std::env::remove_var("SOFYA_API_KEY") };

    // Capture the outcome without asserting so the restore below always runs.
    let outcome = WebSearchTool
        .execute(json!({"query": "anything"}), &ctx)
        .await;

    match prev {
        // SAFETY: still under the test-env lock acquired above.
        Some(value) => unsafe { std::env::set_var("SOFYA_API_KEY", value) },
        // SAFETY: still under the test-env lock acquired above.
        None => unsafe { std::env::remove_var("SOFYA_API_KEY") },
    }

    let err = outcome.expect_err("missing api_key must surface as ToolError");
    let msg = err.to_string();
    assert!(
        msg.contains("Sofya") && msg.contains("API key"),
        "error must name the provider and missing key; got `{msg}`"
    );
}
#[tokio::test]
#[allow(clippy::await_holding_lock)]
async fn volcengine_provider_without_api_key_lists_supported_env_fallbacks() {
+9 -4
View File
@@ -1088,8 +1088,8 @@ Use `codewhale-tui features list` to inspect known flags and their effective sta
`web_search` uses DuckDuckGo by default and does not require an API key. The
DuckDuckGo path keeps a Bing fallback when DDG returns a bot challenge or no
parseable results. Bing remains selectable for users who explicitly want it,
and Tavily, Bocha, Metaso, or Baidu can be selected when an API-backed provider
is preferred.
and Tavily, Bocha, Metaso, Baidu, Volcengine, or Sofya can be selected when an
API-backed provider is preferred.
For a private/internal search service that serves DuckDuckGo-compatible HTML,
keep `provider = "duckduckgo"` and set `base_url`; CodeWhale appends the `q`
@@ -1106,11 +1106,16 @@ set `METASO_API_KEY` or `[search] api_key` for a higher quota.
`BAIDU_SEARCH_API_KEY` or `[search] api_key`. This is a search-tool backend
only; it does not add a Baidu model provider.
**Sofya** ([sofya.co](https://sofya.co)) returns full extracted page content
rather than snippets. Set `[search] api_key` to your `ay_live_...` key, or the
`SOFYA_API_KEY` env var. This is a search-tool backend only; it does not add a
Sofya model provider.
```toml
[search]
provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu
provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya
# base_url = "https://search.example/html/" # optional with provider = "duckduckgo"
# api_key = "YOUR_KEY" # required for tavily, bocha, and baidu; optional for metaso
# api_key = "YOUR_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso
```
## Local Media Attachments