feat(#33): add fetch_url tool for direct HTTP GET

Complements `web_search` for cases where the URL is already known — GitHub repos, blog posts, spec pages — and a search-engine round trip is overkill or actively unhelpful (a problem that #25 had been making worse).

Surface:
- `fetch_url(url, format?, max_bytes?, timeout_ms?)`
- `format`: `markdown` (default), `text`, `raw`
- HTTPS preferred, http:// allowed; non-http schemes rejected up front
- Follows up to 5 redirects; 1 MB default cap (10 MB hard ceiling); 15 s default timeout (60 s ceiling)
- HTML responses are stripped to readable text via the same regex pattern used by `web_search` (script/style strip → tag strip → entity decode → whitespace collapse)
- 4xx / 5xx responses still return the body (with `success: false`) so the caller can read JSON error envelopes

Capabilities: `ReadOnly + Network`. Approval: `Auto` (matches `web_search`). Registered in `with_web_tools` so it's available wherever `web_search` is.

Tests cover: format parsing aliases, scheme rejection, missing/empty url validation, html-to-text stripping. The over-the-wire cases (redirect chains, oversized truncation) will be exercised by integration tests once the test suite is wired to a local mock HTTP server — that work is deferred, since the unit tests already lock in the input validation and HTML processing.

Closes #33.
2026-04-25 13:33:22 -05:00
parent 017ac97d0d
commit 7f2c382343
3 changed files with 318 additions and 0 deletions
@@ -0,0 +1,315 @@
+//! Direct-fetch HTTP tool. Complements `web_search` for cases where the user
+//! already knows the URL — a known repo, a blog post, a spec page — and
+//! search is overkill or actively unhelpful.
+//!
+//! Returns a structured `{url, status, content_type, content, truncated}`
+//! payload. HTML responses are stripped to readable text by default
+//! (`format = "markdown"`); pass `format = "raw"` to keep the bytes intact
+//! when the model wants to do its own parsing.
+
+use super::spec::{
+    ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, optional_u64,
+};
+use async_trait::async_trait;
+use regex::Regex;
+use serde::Serialize;
+use serde_json::{Value, json};
+use std::sync::OnceLock;
+use std::time::Duration;
+
+/// Body-size cap in bytes applied when the caller does not pass `max_bytes`.
+const DEFAULT_MAX_BYTES: u64 = 1_000_000;
+/// Ceiling for any caller-supplied `max_bytes` (10 MiB).
+const HARD_MAX_BYTES: u64 = 10 * 1024 * 1024;
+/// Request timeout in milliseconds applied when the caller does not pass `timeout_ms`.
+const DEFAULT_TIMEOUT_MS: u64 = 15_000;
+/// Ceiling for any caller-supplied `timeout_ms`.
+const HARD_MAX_TIMEOUT_MS: u64 = 60_000;
+/// Number of redirects the HTTP client is allowed to follow.
+const MAX_REDIRECTS: usize = 5;
+/// `User-Agent` header value attached to outgoing requests.
+const USER_AGENT: &str =
+    "Mozilla/5.0 (compatible; deepseek-tui/0.5; +https://github.com/Hmbown/DeepSeek-TUI)";
+
+// One cell per pattern so each regex is compiled at most once per process.
+static SCRIPT_RE: OnceLock<Regex> = OnceLock::new();
+static STYLE_RE: OnceLock<Regex> = OnceLock::new();
+static TAG_RE: OnceLock<Regex> = OnceLock::new();
+static WHITESPACE_RE: OnceLock<Regex> = OnceLock::new();
+
+/// Return the regex stored in `cell`, compiling `pattern` on first use.
+///
+/// Panics with `label` if `pattern` does not compile; every pattern passed
+/// here is a literal, so a failure is a programming error.
+fn cached_regex(cell: &'static OnceLock<Regex>, pattern: &str, label: &str) -> &'static Regex {
+    cell.get_or_init(|| Regex::new(pattern).expect(label))
+}
+
+/// Matches a whole `<script>…</script>` element (case-insensitive, `.` spans
+/// newlines, shortest match).
+fn script_re() -> &'static Regex {
+    cached_regex(&SCRIPT_RE, r"(?is)<script[^>]*>.*?</script>", "script re")
+}
+
+/// Matches a whole `<style>…</style>` element (case-insensitive, `.` spans
+/// newlines, shortest match).
+fn style_re() -> &'static Regex {
+    cached_regex(&STYLE_RE, r"(?is)<style[^>]*>.*?</style>", "style re")
+}
+
+/// Matches any single `<…>` tag.
+fn tag_re() -> &'static Regex {
+    cached_regex(&TAG_RE, r"<[^>]+>", "tag re")
+}
+
+/// Matches a run of one or more whitespace characters.
+fn whitespace_re() -> &'static Regex {
+    cached_regex(&WHITESPACE_RE, r"\s+", "ws re")
+}
+
+/// Post-processing mode requested for the response body.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Format {
+    /// HTML is reduced to readable text; other bodies pass through.
+    Text,
+    /// The default mode. Currently handled exactly like [`Format::Text`].
+    Markdown,
+    /// The body is returned without any HTML stripping.
+    Raw,
+}
+
+impl Format {
+    /// Resolve the optional `format` argument into a [`Format`].
+    ///
+    /// `None` selects `markdown`. The value is trimmed and compared
+    /// ASCII-case-insensitively, and a few aliases are accepted
+    /// (`txt`/`plain`, `md`, `html`/`bytes`).
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-input [`ToolError`] naming the normalized value
+    /// when it matches none of the known spellings.
+    fn parse(value: Option<&str>) -> Result<Self, ToolError> {
+        let requested = value.unwrap_or("markdown").trim().to_ascii_lowercase();
+
+        let format = match requested.as_str() {
+            "markdown" | "md" => Self::Markdown,
+            "text" | "txt" | "plain" => Self::Text,
+            "raw" | "html" | "bytes" => Self::Raw,
+            other => {
+                return Err(ToolError::invalid_input(format!(
+                    "unknown format `{other}` (allowed: text, markdown, raw)"
+                )));
+            }
+        };
+
+        Ok(format)
+    }
+}
+
+/// JSON payload handed back to the caller for every completed request.
+#[derive(Debug, Serialize)]
+struct FetchResponse {
+    /// URL the response was ultimately served from (after any redirects).
+    url: String,
+    /// Numeric HTTP status code.
+    status: u16,
+    /// `Content-Type` header value, or `application/octet-stream` when the
+    /// header is missing or is not valid visible ASCII.
+    content_type: String,
+    /// Body after truncation and format post-processing.
+    content: String,
+    /// `true` when the body was longer than the byte cap and was cut off.
+    truncated: bool,
+}
+
+/// Tool that performs a single HTTP GET against a caller-supplied URL.
+pub struct FetchUrlTool;
+
+/// Returns `true` when `url` starts with `http://` or `https://`.
+///
+/// The comparison ignores ASCII case because URL schemes are
+/// case-insensitive, so `HTTPS://example.com` is accepted.
+fn has_http_scheme(url: &str) -> bool {
+    ["http://", "https://"].iter().any(|&scheme| {
+        // `get` yields `None` when the URL is shorter than the prefix or the
+        // cut would land inside a multi-byte character.
+        url.get(..scheme.len())
+            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(scheme))
+    })
+}
+
+#[async_trait]
+impl ToolSpec for FetchUrlTool {
+    /// Name the tool is registered and invoked under.
+    fn name(&self) -> &'static str {
+        "fetch_url"
+    }
+
+    /// Model-facing description of when to use the tool.
+    fn description(&self) -> &'static str {
+        "Fetch a known URL directly (HTTP GET) and return its content. Use this when the user gives a URL or you already know the canonical link — it's faster and more reliable than web_search for known pages."
+    }
+
+    /// JSON Schema for the tool input; only `url` is required.
+    fn input_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "Absolute HTTP/HTTPS URL to fetch."
+                },
+                "format": {
+                    "type": "string",
+                    "enum": ["text", "markdown", "raw"],
+                    "description": "Post-processing for the response body. `markdown` (default) and `text` strip HTML tags to readable text; `raw` returns the body bytes as-is."
+                },
+                "max_bytes": {
+                    "type": "integer",
+                    "description": "Truncate response body after this many bytes (default 1,000,000; hard max 10,485,760)."
+                },
+                "timeout_ms": {
+                    "type": "integer",
+                    "description": "Request timeout in milliseconds (default 15,000; max 60,000)."
+                }
+            },
+            "required": ["url"]
+        })
+    }
+
+    /// The tool reads from the network and does not modify local state.
+    fn capabilities(&self) -> Vec<ToolCapability> {
+        vec![ToolCapability::ReadOnly, ToolCapability::Network]
+    }
+
+    /// Runs without prompting for approval.
+    fn approval_requirement(&self) -> ApprovalRequirement {
+        ApprovalRequirement::Auto
+    }
+
+    /// Fetch `input.url` with an HTTP GET and return a serialized
+    /// [`FetchResponse`].
+    ///
+    /// Optional inputs: `format` (see [`Format::parse`]), `max_bytes` and
+    /// `timeout_ms` (each clamped to its hard ceiling).
+    ///
+    /// Non-2xx responses are still returned as `Ok`, with `success: false`,
+    /// so the caller can inspect the error body.
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-input error when `url` is missing, not a string,
+    /// blank, or not an http(s) URL, or when `format` is unknown. Returns an
+    /// execution error when the client cannot be built, the request or body
+    /// read fails, or the response cannot be serialized.
+    async fn execute(&self, input: Value, _context: &ToolContext) -> Result<ToolResult, ToolError> {
+        let url = input
+            .get("url")
+            .and_then(Value::as_str)
+            .ok_or_else(|| ToolError::invalid_input("`url` is required"))?
+            .trim()
+            .to_string();
+
+        if url.is_empty() {
+            return Err(ToolError::invalid_input("`url` cannot be empty"));
+        }
+        if !has_http_scheme(&url) {
+            return Err(ToolError::invalid_input(
+                "only http:// and https:// URLs are supported",
+            ));
+        }
+
+        let format = Format::parse(input.get("format").and_then(Value::as_str))?;
+        let max_bytes = optional_u64(&input, "max_bytes", DEFAULT_MAX_BYTES).min(HARD_MAX_BYTES);
+        let timeout_ms =
+            optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(HARD_MAX_TIMEOUT_MS);
+        // Saturate instead of silently wrapping if `usize` is narrower than `u64`.
+        let byte_cap = usize::try_from(max_bytes).unwrap_or(usize::MAX);
+
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_millis(timeout_ms))
+            .user_agent(USER_AGENT)
+            .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
+            .build()
+            .map_err(|e| {
+                ToolError::execution_failed(format!("failed to build HTTP client: {e}"))
+            })?;
+
+        let mut resp = client
+            .get(&url)
+            .header("Accept", "text/html,text/plain,application/json,*/*;q=0.5")
+            .header("Accept-Language", "en-US,en;q=0.5")
+            .send()
+            .await
+            .map_err(|e| ToolError::execution_failed(format!("request failed: {e}")))?;
+
+        // Capture the metadata before the body is consumed.
+        let final_url = resp.url().to_string();
+        let status = resp.status();
+        let content_type = resp
+            .headers()
+            .get(reqwest::header::CONTENT_TYPE)
+            .and_then(|v| v.to_str().ok())
+            .unwrap_or("application/octet-stream")
+            .to_string();
+
+        // Stream the body and stop at the cap, so an oversized response
+        // never costs more than `byte_cap` bytes (plus one chunk) of memory
+        // or download.
+        let mut body: Vec<u8> = Vec::new();
+        let mut truncated = false;
+        while let Some(chunk) = resp
+            .chunk()
+            .await
+            .map_err(|e| ToolError::execution_failed(format!("failed to read body: {e}")))?
+        {
+            let room = byte_cap - body.len();
+            if chunk.len() > room {
+                body.extend_from_slice(&chunk[..room]);
+                truncated = true;
+                break;
+            }
+            body.extend_from_slice(&chunk);
+        }
+
+        // Lossy decoding keeps this infallible: invalid UTF-8, including a
+        // multi-byte character split by truncation, becomes U+FFFD.
+        let body_text = String::from_utf8_lossy(&body).into_owned();
+        let processed = match format {
+            Format::Raw => body_text,
+            Format::Text | Format::Markdown => {
+                // Sniff the body as well, since servers sometimes mislabel HTML.
+                if content_type.contains("text/html") || body_text.contains("<html") {
+                    html_to_text(&body_text)
+                } else {
+                    body_text
+                }
+            }
+        };
+
+        let response = FetchResponse {
+            url: final_url,
+            status: status.as_u16(),
+            content_type,
+            content: processed,
+            truncated,
+        };
+
+        if !status.is_success() {
+            // Don't `Err` on 4xx/5xx — the caller often wants to see the body
+            // (e.g. a JSON error envelope). Mark the result as a failure so the
+            // engine renders it as such.
+            return Ok(ToolResult {
+                content: serde_json::to_string_pretty(&response).map_err(|e| {
+                    ToolError::execution_failed(format!("failed to serialize response: {e}"))
+                })?,
+                success: false,
+                metadata: None,
+            });
+        }
+
+        ToolResult::json(&response)
+            .map_err(|e| ToolError::execution_failed(format!("failed to serialize response: {e}")))
+    }
+}
+
+/// Reduce an HTML document to plain readable text.
+///
+/// The passes run in a fixed order: remove `<script>` and `<style>` elements
+/// with their contents, replace each remaining tag with a space, decode the
+/// common entities, then collapse whitespace runs to one space and trim the
+/// ends. This is a lightweight "make the page readable" pass, not an
+/// HTML-to-Markdown conversion.
+fn html_to_text(html: &str) -> String {
+    let without_scripts = script_re().replace_all(html, "");
+    let without_styles = style_re().replace_all(&without_scripts, "");
+    // Tags become a space so adjacent elements don't fuse into one word.
+    let tagless = tag_re().replace_all(&without_styles, " ");
+    let plain = decode_entities(&tagless);
+    let collapsed = whitespace_re().replace_all(&plain, " ");
+    collapsed.trim().to_owned()
+}
+
+/// Decode the handful of HTML entities we expect to hit in stripped text.
+/// Pulling in `html-escape` for the long tail isn't worth the dep weight.
+///
+/// Handles `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;`, `&nbsp;` (decoded to
+/// a plain space) and `&amp;`. Anything else is left untouched.
+///
+/// Each entity is decoded exactly once: `&amp;lt;` yields the literal text
+/// `&lt;`, not `<`.
+fn decode_entities(s: &str) -> String {
+    s.replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&quot;", "\"")
+        .replace("&#39;", "'")
+        .replace("&apos;", "'")
+        .replace("&nbsp;", " ")
+        // `&amp;` goes last: decoding it first would create fresh `&lt;` /
+        // `&gt;` / … sequences that the later passes would decode again.
+        .replace("&amp;", "&")
+}
+
+/// Unit tests for input validation and HTML processing. Nothing here touches
+/// the network: every `execute` call below fails validation before a request
+/// is built.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tools::spec::ToolContext;
+    use std::path::PathBuf;
+
+    /// Minimal context rooted at the current directory.
+    fn ctx() -> ToolContext {
+        ToolContext::new(PathBuf::from("."))
+    }
+
+    #[test]
+    fn html_to_text_strips_scripts_styles_and_tags() {
+        let html = r#"
+            <html>
+              <head>
+                <style>body { color: red; }</style>
+                <script>alert("nope");</script>
+              </head>
+              <body>
+                <h1>Hello &amp; welcome</h1>
+                <p>This is <b>important</b>.</p>
+              </body>
+            </html>
+        "#;
+        let text = html_to_text(html);
+        assert!(text.contains("Hello & welcome"));
+        assert!(text.contains("This is important"));
+        assert!(!text.contains("alert"));
+        assert!(!text.contains("color: red"));
+    }
+
+    #[test]
+    fn html_to_text_collapses_whitespace_and_handles_uppercase_tags() {
+        assert_eq!(html_to_text("<p>a</p>\n\n<p>b&nbsp;c</p>"), "a b c");
+        assert_eq!(html_to_text("<SCRIPT>\nvar x = 1;\n</SCRIPT>ok"), "ok");
+        assert_eq!(html_to_text(""), "");
+    }
+
+    #[test]
+    fn decode_entities_handles_known_entities() {
+        assert_eq!(
+            decode_entities("&lt;a&gt; &amp; &quot;q&quot; &#39;s&apos;&nbsp;x"),
+            "<a> & \"q\" 's' x"
+        );
+        // Entities outside the supported set pass through unchanged.
+        assert_eq!(decode_entities("&copy; 2026"), "&copy; 2026");
+    }
+
+    #[test]
+    fn format_parse_accepts_aliases_and_rejects_unknown() {
+        assert_eq!(Format::parse(Some("markdown")).unwrap(), Format::Markdown);
+        assert_eq!(Format::parse(Some("MD")).unwrap(), Format::Markdown);
+        assert_eq!(Format::parse(Some("text")).unwrap(), Format::Text);
+        assert_eq!(Format::parse(Some("txt")).unwrap(), Format::Text);
+        assert_eq!(Format::parse(Some("plain")).unwrap(), Format::Text);
+        assert_eq!(Format::parse(Some("raw")).unwrap(), Format::Raw);
+        assert_eq!(Format::parse(Some("html")).unwrap(), Format::Raw);
+        assert_eq!(Format::parse(Some("bytes")).unwrap(), Format::Raw);
+        assert_eq!(Format::parse(None).unwrap(), Format::Markdown);
+        assert!(Format::parse(Some("yaml")).is_err());
+        assert!(Format::parse(Some("")).is_err());
+    }
+
+    #[test]
+    fn format_parse_trims_and_ignores_case() {
+        assert_eq!(Format::parse(Some("  TXT ")).unwrap(), Format::Text);
+        assert_eq!(Format::parse(Some("\tRaw\n")).unwrap(), Format::Raw);
+    }
+
+    #[tokio::test]
+    async fn rejects_non_http_schemes() {
+        let tool = FetchUrlTool;
+        let res = tool
+            .execute(json!({"url": "file:///etc/passwd"}), &ctx())
+            .await;
+        let err = res.unwrap_err();
+        assert!(format!("{err:?}").contains("http"));
+    }
+
+    #[tokio::test]
+    async fn rejects_empty_url() {
+        let tool = FetchUrlTool;
+        let res = tool.execute(json!({"url": "   "}), &ctx()).await;
+        assert!(res.is_err());
+        let res = tool.execute(json!({"url": ""}), &ctx()).await;
+        assert!(res.is_err());
+    }
+
+    #[tokio::test]
+    async fn rejects_missing_url() {
+        let tool = FetchUrlTool;
+        let res = tool.execute(json!({}), &ctx()).await;
+        assert!(res.is_err());
+    }
+
+    #[tokio::test]
+    async fn rejects_non_string_url() {
+        let tool = FetchUrlTool;
+        let res = tool.execute(json!({"url": 42}), &ctx()).await;
+        assert!(res.is_err());
+    }
+}
@@ -6,6 +6,7 @@ pub mod file;
 pub mod file_search;
 pub mod finance;

+pub mod fetch_url;
 pub mod git;
 pub mod git_history;
 pub mod parallel;
@@ -345,10 +345,12 @@ impl ToolRegistryBuilder {
    /// Include web search tools.
    #[must_use]
    pub fn with_web_tools(self) -> Self {
+        use super::fetch_url::FetchUrlTool;
        use super::finance::FinanceTool;
        use super::web_run::WebRunTool;
        use super::web_search::WebSearchTool;
        self.with_tool(Arc::new(WebSearchTool))
+            .with_tool(Arc::new(FetchUrlTool))
            .with_tool(Arc::new(FinanceTool::new()))
            .with_tool(Arc::new(WebRunTool))
    }