From af8ff03618bdbc15cc946e72135b351a5ce9aa11 Mon Sep 17 00:00:00 2001 From: yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com> Date: Fri, 5 Jun 2026 08:29:02 -0700 Subject: [PATCH] feat(web_search): add Sofya search provider Harvested from PR #2790 by @yusufgurdogan. --- .github/AUTHOR_MAP | 4 + CHANGELOG.md | 4 + config.example.toml | 8 +- crates/tui/CHANGELOG.md | 4 + crates/tui/src/config.rs | 30 ++++ crates/tui/src/tools/web_search.rs | 224 ++++++++++++++++++++++++++++- docs/CONFIGURATION.md | 13 +- 7 files changed, 276 insertions(+), 11 deletions(-) diff --git a/.github/AUTHOR_MAP b/.github/AUTHOR_MAP index a3a218cd..2e9f7ea3 100644 --- a/.github/AUTHOR_MAP +++ b/.github/AUTHOR_MAP @@ -77,6 +77,10 @@ ChaceLyee2101 = ChaceLyee2101 <95995339+ChaceLyee2101@users.noreply.github.com> ci4ic4 = ci4ic4 <6495973+ci4ic4@users.noreply.github.com> Chavdar Ivanov = ci4ic4 <6495973+ci4ic4@users.noreply.github.com> ci4ic4@gmail.com = ci4ic4 <6495973+ci4ic4@users.noreply.github.com> +yusufgurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com> +Yusuf Gurdogan = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com> +hotelswith = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com> +contact@hotelswith.com = yusufgurdogan <13736056+yusufgurdogan@users.noreply.github.com> AresNing = AresNing <49557311+AresNing@users.noreply.github.com> shenjackyuanjie = shenjackyuanjie <54507071+shenjackyuanjie@users.noreply.github.com> diff --git a/CHANGELOG.md b/CHANGELOG.md index 84c0f54f..853b9c57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 TUI sidebar from the command line instead of relying on copy-hostile sidebar state during long transcript work (#2766, #2788). Thanks @mo-vic for the detailed report and @aboimpinto for the fix. 
+- Added Sofya (`provider = "sofya"`) as a search-tool backend with + `SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather + than model-provider routing (#2790). Thanks @yusufgurdogan for the + implementation. ### Changed diff --git a/config.example.toml b/config.example.toml index 1f56333b..35df687f 100644 --- a/config.example.toml +++ b/config.example.toml @@ -399,7 +399,7 @@ max_subagents = 10 # optional (1-20) # API-backed search. # # [search] -# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine +# provider = "duckduckgo" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya # # duckduckgo: HTML scrape with Bing fallback # # bing: HTML scrape, no API key # # tavily: https://tavily.com — AI search, needs api_key @@ -409,8 +409,11 @@ max_subagents = 10 # optional (1-20) # # baidu: 百度 AI Search via qianfan.baidubce.com,需 api_key # # volcengine: 火山引擎 Ark web_search (免费 2 万次/月), 需 api_key # # 也回退到 VOLCENGINE_API_KEY / VOLCENGINE_ARK_API_KEY / ARK_API_KEY 环境变量 +# # sofya: https://sofya.co — AI search returning full page +# # content (not snippets), needs api_key (ay_live_...); +# # also falls back to the SOFYA_API_KEY env var # base_url = "https://search.example/html/" # optional DuckDuckGo-compatible HTML endpoint -# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, and baidu; optional for metaso +# api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso # # WARNING: treat config.toml like a secret file when # # storing API keys. Prefer env vars for local smoke tests. 
# @@ -421,6 +424,7 @@ max_subagents = 10 # optional (1-20) # DEEPSEEK_SEARCH_BASE_URL → search.base_url (legacy alias) # METASO_API_KEY → metaso key fallback # BAIDU_SEARCH_API_KEY → baidu key fallback +# SOFYA_API_KEY → sofya key fallback # ───────────────────────────────────────────────────────────────────────────────── # Network Policy (#135) diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index 84c0f54f..853b9c57 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -61,6 +61,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 TUI sidebar from the command line instead of relying on copy-hostile sidebar state during long transcript work (#2766, #2788). Thanks @mo-vic for the detailed report and @aboimpinto for the fix. +- Added Sofya (`provider = "sofya"`) as a search-tool backend with + `SOFYA_API_KEY` fallback, while keeping Sofya scoped to web search rather + than model-provider routing (#2790). Thanks @yusufgurdogan for the + implementation. ### Changed diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 02c36a2f..0c858f95 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -1068,6 +1068,11 @@ pub enum SearchProvider { alias = "volc-ark" )] Volcengine, + /// Sofya web search API (). Requires api_key + /// (`ay_live_...`). Returns full extracted page content rather than + /// snippets; falls back to the `SOFYA_API_KEY` env var when + /// `[search] api_key` is not set. 
+ Sofya, } impl SearchProvider { @@ -1083,6 +1088,7 @@ impl SearchProvider { Some(Self::Baidu) } "volcengine" | "ark" | "volc" | "volcengine-ark" => Some(Self::Volcengine), + "sofya" => Some(Self::Sofya), _ => None, } } @@ -1097,6 +1103,7 @@ impl SearchProvider { Self::Metaso => "metaso", Self::Baidu => "baidu", Self::Volcengine => "volcengine", + Self::Sofya => "sofya", } } } @@ -5647,6 +5654,29 @@ mod tests { ); } + #[test] + fn explicit_sofya_search_provider_is_preserved() { + let config: Config = toml::from_str( + r#" + [search] + provider = "sofya" + "#, + ) + .expect("sofya search config"); + + assert_eq!( + config.search.and_then(|search| search.provider), + Some(SearchProvider::Sofya) + ); + } + + #[test] + fn sofya_search_provider_parses_and_round_trips() { + assert_eq!(SearchProvider::parse("sofya"), Some(SearchProvider::Sofya)); + assert_eq!(SearchProvider::parse("Sofya"), Some(SearchProvider::Sofya)); + assert_eq!(SearchProvider::Sofya.as_str(), "sofya"); + } + #[test] fn search_provider_resolution_reports_default_source() { let _guard = lock_test_env(); diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index cc33276c..330b6ca9 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -1,12 +1,13 @@ //! Web search tool backed by multiple providers: Bing HTML scrape, DuckDuckGo //! (HTML scrape with Bing fallback), Tavily API, Bocha (博查) API, -//! Metaso API (<https://metaso.cn>), Baidu AI Search, and Volcengine Ark. +//! Metaso API (<https://metaso.cn>), Baidu AI Search, Volcengine Ark, and +//! Sofya (<https://sofya.co>). //! //! This is the primary web search surface for agents. For browsing workflows //! (page open, click, screenshot) use a direct URL approach instead. //! //! Set `[search]` in config.toml to switch providers: -//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine +//! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine/sofya //! 
base_url = "https://search.example/html/" # optional DDG-compatible URL //! api_key = "tvly-..." @@ -30,6 +31,7 @@ const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search"; const METASO_ENDPOINT: &str = "https://metaso.cn/api/v1"; const BAIDU_ENDPOINT: &str = "https://qianfan.baidubce.com/v2/ai_search/web_search"; const VOLCENGINE_RESPONSES_ENDPOINT: &str = "https://ark.cn-beijing.volces.com/api/v3/responses"; +const SOFYA_ENDPOINT: &str = "https://sofya.co/v1/search"; /// Intentionally public default key provided by Metaso for open-source/community use. /// Last-resort fallback after config and env var. Rate-limited to ~100 searches/day. const METASO_DEFAULT_API_KEY: &str = "mk-E384C1DD5E8501BB7EFE27C949AFDE5B"; @@ -140,7 +142,7 @@ impl ToolSpec for WebSearchTool { } fn description(&self) -> &'static str { - "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." + "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\" | \"volcengine\" | \"sofya\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." 
} fn input_schema(&self) -> Value { @@ -248,6 +250,13 @@ impl ToolSpec for WebSearchTool { .run_volcengine_search(&query, max_results, timeout_ms, context) .await; } + SearchProvider::Sofya => { + let decider = context.network_policy.as_ref(); + check_policy(decider, "sofya.co")?; + return self + .run_sofya_search(&query, max_results, timeout_ms, context) + .await; + } SearchProvider::Bing | SearchProvider::DuckDuckGo => {} } @@ -485,6 +494,88 @@ impl WebSearchTool { ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string())) } + /// Search via Sofya web search API (<https://sofya.co>). + /// + /// Sofya returns full extracted page content rather than snippets. The API + /// key (`ay_live_...`) comes from `[search] api_key`, falling back to the + /// `SOFYA_API_KEY` env var, and is sent as a `Bearer` token. + async fn run_sofya_search( + &self, + query: &str, + max_results: usize, + timeout_ms: u64, + context: &ToolContext, + ) -> Result<ToolResult, ToolError> { + let env_key = std::env::var("SOFYA_API_KEY").ok(); + let api_key = context + .search_api_key + .as_deref() + .or(env_key.as_deref()) + .ok_or_else(|| { + ToolError::execution_failed( + "Sofya search requires an API key. 
Set `[search] api_key = \"ay_live_...\"` in config.toml or the SOFYA_API_KEY env var.", + ) + })?; + + let client = crate::tls::reqwest_client_builder() + .timeout(Duration::from_millis(timeout_ms)) + .build() + .map_err(|e| { + ToolError::execution_failed(format!("Failed to build HTTP client: {e}")) + })?; + + let payload = json!({ + "query": query, + "max_results": max_results, + }); + + let resp = client + .post(SOFYA_ENDPOINT) + .header("Content-Type", "application/json") + .bearer_auth(api_key) + .json(&payload) + .send() + .await + .map_err(|e| { + ToolError::execution_failed(format!("Sofya search request failed: {e}")) + })?; + + let status = resp.status(); + let body = resp.text().await.map_err(|e| { + ToolError::execution_failed(format!("Failed to read Sofya response: {e}")) + })?; + + if !status.is_success() { + let truncated = truncate_error_body(&body); + return Err(ToolError::execution_failed(format!( + "Sofya search failed: HTTP {} — {truncated}", + status.as_u16() + ))); + } + + let parsed: serde_json::Value = serde_json::from_str(&body).map_err(|e| { + ToolError::execution_failed(format!("Failed to parse Sofya response: {e}")) + })?; + + let results = parse_sofya_results(&parsed, max_results); + + let message = if results.is_empty() { + "No results found".to_string() + } else { + format!("Found {} result(s)", results.len()) + }; + + let response = WebSearchResponse { + query: query.to_string(), + source: "sofya".to_string(), + count: results.len(), + message, + results, + }; + + ToolResult::json(&response).map_err(|e| ToolError::execution_failed(e.to_string())) + } + /// Search via Bocha AI Search API (). 
async fn run_bocha_search( &self, @@ -967,6 +1058,36 @@ fn baidu_error_message(parsed: &Value) -> Option<String> { Some(format!("Baidu search API error (code {code}: {message})")) } +fn parse_sofya_results(parsed: &Value, max_results: usize) -> Vec<WebSearchEntry> { + parsed + .get("results") + .and_then(|v| v.as_array()) + .into_iter() + .flat_map(|arr| arr.iter()) + .filter_map(|item| { + let title = item.get("title")?.as_str()?.to_string(); + let url = item.get("url")?.as_str()?.to_string(); + let snippet = first_non_empty_string(item, &["content", "description"]); + Some(WebSearchEntry { + title, + url, + snippet, + }) + }) + .take(max_results) + .collect() +} + +fn first_non_empty_string(item: &Value, keys: &[&str]) -> Option<String> { + keys.iter().find_map(|key| { + item.get(*key) + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string) + }) +} + fn baidu_search_payload(query: &str, max_results: usize) -> Value { json!({ "messages": [ @@ -1488,8 +1609,8 @@ mod tests { use super::{ ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, baidu_search_payload, decode_html_entities, duckduckgo_search_url, extract_search_query, is_likely_spam_results, - normalize_bing_url, optional_search_max_results, parse_baidu_results, root_domain, - sanitize_error_body, truncate_error_body, volcengine_extract_text, + normalize_bing_url, optional_search_max_results, parse_baidu_results, parse_sofya_results, + root_domain, sanitize_error_body, truncate_error_body, volcengine_extract_text, }; use serde_json::json; @@ -1857,6 +1978,63 @@ mod tests { ); } + #[test] + fn parse_sofya_results_falls_back_to_description_for_empty_content() { + let body = json!({ + "results": [ + { + "title": "Full content", + "url": "https://example.com/full", + "content": "full extracted page content", + "description": "unused description" + }, + { + "title": "Null content", + "url": "https://example.com/null", + "content": null, + "description": "description for null content" + }, + 
{ + "title": "Empty content", + "url": "https://example.com/empty", + "content": "", + "description": "description for empty content" + }, + { + "title": "Whitespace content", + "url": "https://example.com/blank", + "content": " ", + "description": "description for blank content" + }, + { + "title": "No snippet", + "url": "https://example.com/no-snippet" + } + ] + }); + + let results = parse_sofya_results(&body, 10); + + assert_eq!(results.len(), 5); + assert_eq!( + results[0].snippet.as_deref(), + Some("full extracted page content") + ); + assert_eq!( + results[1].snippet.as_deref(), + Some("description for null content") + ); + assert_eq!( + results[2].snippet.as_deref(), + Some("description for empty content") + ); + assert_eq!( + results[3].snippet.as_deref(), + Some("description for blank content") + ); + assert_eq!(results[4].snippet, None); + } + #[test] fn volcengine_extract_text_skips_non_text_content_blocks() { let body = json!({ @@ -1952,6 +2130,42 @@ mod tests { ); } + #[tokio::test] + #[allow(clippy::await_holding_lock)] + async fn sofya_provider_without_api_key_surfaces_clear_error_not_silent_fallback() { + // Same trust-boundary pin as Tavily/Bocha: opting into Sofya without a + // key must surface a ToolError naming the provider, not silently fall + // through to DuckDuckGo. + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + + // This test holds the process-env lock through the awaited tool + // execution because the tool reads SOFYA_API_KEY during that call. 
+ let _guard = crate::test_support::lock_test_env(); + let prev = std::env::var_os("SOFYA_API_KEY"); + unsafe { std::env::remove_var("SOFYA_API_KEY") }; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::Sofya; + ctx.search_api_key = None; + let err = WebSearchTool + .execute(json!({"query": "anything"}), &ctx) + .await + .expect_err("missing api_key must surface as ToolError"); + + match prev { + Some(value) => unsafe { std::env::set_var("SOFYA_API_KEY", value) }, + None => unsafe { std::env::remove_var("SOFYA_API_KEY") }, + } + + let msg = err.to_string(); + assert!( + msg.contains("Sofya") && msg.contains("API key"), + "error must name the provider and missing key; got `{msg}`" + ); + } + #[tokio::test] #[allow(clippy::await_holding_lock)] async fn volcengine_provider_without_api_key_lists_supported_env_fallbacks() { diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 5b82a66b..25206184 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -1088,8 +1088,8 @@ Use `codewhale-tui features list` to inspect known flags and their effective sta `web_search` uses DuckDuckGo by default and does not require an API key. The DuckDuckGo path keeps a Bing fallback when DDG returns a bot challenge or no parseable results. Bing remains selectable for users who explicitly want it, -and Tavily, Bocha, Metaso, or Baidu can be selected when an API-backed provider -is preferred. +and Tavily, Bocha, Metaso, Baidu, Volcengine, or Sofya can be selected when an +API-backed provider is preferred. For a private/internal search service that serves DuckDuckGo-compatible HTML, keep `provider = "duckduckgo"` and set `base_url`; CodeWhale appends the `q` @@ -1106,11 +1106,16 @@ set `METASO_API_KEY` or `[search] api_key` for a higher quota. `BAIDU_SEARCH_API_KEY` or `[search] api_key`. This is a search-tool backend only; it does not add a Baidu model provider. 
+**Sofya** ([sofya.co](https://sofya.co)) returns full extracted page content +rather than snippets. Set `[search] api_key` to your `ay_live_...` key, or the +`SOFYA_API_KEY` env var. This is a search-tool backend only; it does not add a +Sofya model provider. + ```toml [search] -provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu +provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu | volcengine | sofya # base_url = "https://search.example/html/" # optional with provider = "duckduckgo" -# api_key = "YOUR_KEY" # required for tavily, bocha, and baidu; optional for metaso +# api_key = "YOUR_KEY" # required for tavily, bocha, baidu, volcengine, and sofya; optional for metaso ``` ## Local Media Attachments