From 7f2c382343f0319a78cfbe4897459a22fd4d8649 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Sat, 25 Apr 2026 13:33:22 -0500 Subject: [PATCH] feat(#33): add fetch_url tool for direct HTTP GET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complements `web_search` for cases where the URL is already known — GitHub repos, blog posts, spec pages — and a search-engine round trip is overkill or actively unhelpful (which #25 had been making worse). Surface: - `fetch_url(url, format?, max_bytes?, timeout_ms?)` - `format`: `markdown` (default), `text`, `raw` - HTTPS preferred, http:// allowed; non-http schemes rejected up front - Follows up to 5 redirects; 1 MB default cap (10 MB hard ceiling); 15 s default timeout (60 s ceiling) - HTML responses are stripped to readable text via the same regex pattern used by `web_search` (script/style strip → tag strip → entity decode → whitespace collapse) - 4xx / 5xx responses still return the body (with `success: false`) so the caller can read JSON error envelopes Capabilities: `ReadOnly + Network`. Approval: `Auto` (matches `web_search`). Registered in `with_web_tools` so it's available wherever `web_search` is. Tests cover: format parsing aliases, scheme rejection, missing/empty url validation, html-to-text stripping. The over-the-wire cases (redirect chains, oversized truncation) are not covered yet: they will be exercised by integration tests once the test suite is wired to a local mock HTTP server. That work is deferred because the unit tests already lock in the input validation and HTML processing. Closes #33. 
--- crates/tui/src/tools/fetch_url.rs | 315 ++++++++++++++++++++++++++++++ crates/tui/src/tools/mod.rs | 1 + crates/tui/src/tools/registry.rs | 2 + 3 files changed, 318 insertions(+) create mode 100644 crates/tui/src/tools/fetch_url.rs diff --git a/crates/tui/src/tools/fetch_url.rs b/crates/tui/src/tools/fetch_url.rs new file mode 100644 index 00000000..e017ea43 --- /dev/null +++ b/crates/tui/src/tools/fetch_url.rs @@ -0,0 +1,315 @@ +//! Direct-fetch HTTP tool. Complements `web_search` for cases where the user +//! already knows the URL — a known repo, a blog post, a spec page — and +//! search is overkill or actively unhelpful. +//! +//! Returns a structured `{url, status, content_type, content, truncated}` +//! payload. HTML responses are stripped to readable text by default +//! (`format = "markdown"`); pass `format = "raw"` to keep the bytes intact +//! when the model wants to do its own parsing. + +use super::spec::{ + ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, optional_u64, +}; +use async_trait::async_trait; +use regex::Regex; +use serde::Serialize; +use serde_json::{Value, json}; +use std::sync::OnceLock; +use std::time::Duration; + +const DEFAULT_MAX_BYTES: u64 = 1_000_000; +const HARD_MAX_BYTES: u64 = 10 * 1024 * 1024; +const DEFAULT_TIMEOUT_MS: u64 = 15_000; +const HARD_MAX_TIMEOUT_MS: u64 = 60_000; +const MAX_REDIRECTS: usize = 5; +const USER_AGENT: &str = + "Mozilla/5.0 (compatible; deepseek-tui/0.5; +https://github.com/Hmbown/DeepSeek-TUI)"; + +static SCRIPT_RE: OnceLock = OnceLock::new(); +static STYLE_RE: OnceLock = OnceLock::new(); +static TAG_RE: OnceLock = OnceLock::new(); +static WHITESPACE_RE: OnceLock = OnceLock::new(); + +fn script_re() -> &'static Regex { + SCRIPT_RE.get_or_init(|| Regex::new(r"(?is)]*>.*?").expect("script re")) +} +fn style_re() -> &'static Regex { + STYLE_RE.get_or_init(|| Regex::new(r"(?is)]*>.*?").expect("style re")) +} +fn tag_re() -> &'static Regex { + TAG_RE.get_or_init(|| 
Regex::new(r"<[^>]+>").expect("tag re")) +} +fn whitespace_re() -> &'static Regex { + WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").expect("ws re")) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Format { + Text, + Markdown, + Raw, +} + +impl Format { + fn parse(value: Option<&str>) -> Result { + match value + .unwrap_or("markdown") + .trim() + .to_ascii_lowercase() + .as_str() + { + "text" | "txt" | "plain" => Ok(Self::Text), + "markdown" | "md" => Ok(Self::Markdown), + "raw" | "html" | "bytes" => Ok(Self::Raw), + other => Err(ToolError::invalid_input(format!( + "unknown format `{other}` (allowed: text, markdown, raw)" + ))), + } + } +} + +#[derive(Debug, Serialize)] +struct FetchResponse { + url: String, + status: u16, + content_type: String, + content: String, + truncated: bool, +} + +pub struct FetchUrlTool; + +#[async_trait] +impl ToolSpec for FetchUrlTool { + fn name(&self) -> &'static str { + "fetch_url" + } + + fn description(&self) -> &'static str { + "Fetch a known URL directly (HTTP GET) and return its content. Use this when the user gives a URL or you already know the canonical link — it's faster and more reliable than web_search for known pages." + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Absolute HTTP/HTTPS URL to fetch." + }, + "format": { + "type": "string", + "enum": ["text", "markdown", "raw"], + "description": "Post-processing for the response body. `markdown` (default) and `text` strip HTML tags to readable text; `raw` returns the body bytes as-is." + }, + "max_bytes": { + "type": "integer", + "description": "Truncate response body after this many bytes (default 1,000,000; hard max 10,485,760)." + }, + "timeout_ms": { + "type": "integer", + "description": "Request timeout in milliseconds (default 15,000; max 60,000)." 
+ } + }, + "required": ["url"] + }) + } + + fn capabilities(&self) -> Vec { + vec![ToolCapability::ReadOnly, ToolCapability::Network] + } + + fn approval_requirement(&self) -> ApprovalRequirement { + ApprovalRequirement::Auto + } + + async fn execute(&self, input: Value, _context: &ToolContext) -> Result { + let url = input + .get("url") + .and_then(Value::as_str) + .ok_or_else(|| ToolError::invalid_input("`url` is required"))? + .trim() + .to_string(); + + if url.is_empty() { + return Err(ToolError::invalid_input("`url` cannot be empty")); + } + let scheme_ok = url.starts_with("http://") || url.starts_with("https://"); + if !scheme_ok { + return Err(ToolError::invalid_input( + "only http:// and https:// URLs are supported", + )); + } + + let format = Format::parse(input.get("format").and_then(Value::as_str))?; + let max_bytes = optional_u64(&input, "max_bytes", DEFAULT_MAX_BYTES).min(HARD_MAX_BYTES); + let timeout_ms = + optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(HARD_MAX_TIMEOUT_MS); + + let client = reqwest::Client::builder() + .timeout(Duration::from_millis(timeout_ms)) + .user_agent(USER_AGENT) + .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS)) + .build() + .map_err(|e| { + ToolError::execution_failed(format!("failed to build HTTP client: {e}")) + })?; + + let resp = client + .get(&url) + .header("Accept", "text/html,text/plain,application/json,*/*;q=0.5") + .header("Accept-Language", "en-US,en;q=0.5") + .send() + .await + .map_err(|e| ToolError::execution_failed(format!("request failed: {e}")))?; + + let final_url = resp.url().to_string(); + let status = resp.status(); + let content_type = resp + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/octet-stream") + .to_string(); + + let bytes = resp + .bytes() + .await + .map_err(|e| ToolError::execution_failed(format!("failed to read body: {e}")))?; + let total_bytes = bytes.len() as u64; + let truncated = total_bytes > 
max_bytes; + let usable = if truncated { + &bytes[..max_bytes as usize] + } else { + &bytes[..] + }; + + let body_text = String::from_utf8_lossy(usable).to_string(); + let processed = match format { + Format::Raw => body_text, + Format::Text | Format::Markdown => { + if content_type.contains("text/html") || body_text.contains("` / ` + + + +

Hello & welcome

+

This is important.

+ + + "#; + let text = html_to_text(html); + assert!(text.contains("Hello & welcome")); + assert!(text.contains("This is important")); + assert!(!text.contains("alert")); + assert!(!text.contains("color: red")); + } + + #[test] + fn format_parse_accepts_aliases_and_rejects_unknown() { + assert_eq!(Format::parse(Some("markdown")).unwrap(), Format::Markdown); + assert_eq!(Format::parse(Some("MD")).unwrap(), Format::Markdown); + assert_eq!(Format::parse(Some("text")).unwrap(), Format::Text); + assert_eq!(Format::parse(Some("raw")).unwrap(), Format::Raw); + assert_eq!(Format::parse(None).unwrap(), Format::Markdown); + assert!(Format::parse(Some("yaml")).is_err()); + } + + #[tokio::test] + async fn rejects_non_http_schemes() { + let tool = FetchUrlTool; + let res = tool + .execute(json!({"url": "file:///etc/passwd"}), &ctx()) + .await; + let err = res.unwrap_err(); + assert!(format!("{err:?}").contains("http")); + } + + #[tokio::test] + async fn rejects_empty_url() { + let tool = FetchUrlTool; + let res = tool.execute(json!({"url": " "}), &ctx()).await; + assert!(res.is_err()); + } + + #[tokio::test] + async fn rejects_missing_url() { + let tool = FetchUrlTool; + let res = tool.execute(json!({}), &ctx()).await; + assert!(res.is_err()); + } +} diff --git a/crates/tui/src/tools/mod.rs b/crates/tui/src/tools/mod.rs index e0feeb67..e6884ca4 100644 --- a/crates/tui/src/tools/mod.rs +++ b/crates/tui/src/tools/mod.rs @@ -6,6 +6,7 @@ pub mod file; pub mod file_search; pub mod finance; +pub mod fetch_url; pub mod git; pub mod git_history; pub mod parallel; diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index 852ffe4b..b9175d07 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -345,10 +345,12 @@ impl ToolRegistryBuilder { /// Include web search tools. 
#[must_use] pub fn with_web_tools(self) -> Self { + use super::fetch_url::FetchUrlTool; use super::finance::FinanceTool; use super::web_run::WebRunTool; use super::web_search::WebSearchTool; self.with_tool(Arc::new(WebSearchTool)) + .with_tool(Arc::new(FetchUrlTool)) .with_tool(Arc::new(FinanceTool::new())) .with_tool(Arc::new(WebRunTool)) }