feat(#33): add fetch_url tool for direct HTTP GET

Complements `web_search` for cases where the URL is already known —
GitHub repos, blog posts, spec pages — and a search-engine round trip is
overkill or actively unhelpful (which #25 had been making worse).

Surface:
- `fetch_url(url, format?, max_bytes?, timeout_ms?)`
- `format`: `markdown` (default), `text`, `raw`
- HTTPS preferred, http:// allowed; non-http schemes rejected up front
- Follows up to 5 redirects; 1 MB default cap (10 MB hard ceiling); 15 s
  default timeout (60 s ceiling)
- HTML responses are stripped to readable text via the same regex
  pattern used by `web_search` (script/style strip → tag strip → entity
  decode → whitespace collapse)
- 4xx / 5xx responses still return the body (with `success: false`) so
  the caller can read JSON error envelopes

Capabilities: `ReadOnly + Network`. Approval: `Auto` (matches
`web_search`). Registered in `with_web_tools` so it's available wherever
`web_search` is.

Tests cover: format parsing aliases, scheme rejection, missing/empty
url validation, html-to-text stripping. The over-the-wire cases
(redirect chains, oversized truncation) are not covered yet; they will
be exercised by integration tests once the test suite is wired to a
local mock HTTP server. That work is deferred because the unit tests
already lock in the input validation and HTML processing.

Closes #33.
This commit is contained in:
Hunter Bown
2026-04-25 13:33:22 -05:00
parent 017ac97d0d
commit 7f2c382343
3 changed files with 318 additions and 0 deletions
+315
View File
@@ -0,0 +1,315 @@
//! Direct-fetch HTTP tool. Complements `web_search` for cases where the user
//! already knows the URL — a known repo, a blog post, a spec page — and
//! search is overkill or actively unhelpful.
//!
//! Returns a structured `{url, status, content_type, content, truncated}`
//! payload. HTML responses are stripped to readable text by default
//! (`format = "markdown"`); pass `format = "raw"` to skip the HTML stripping
//! when the model wants to do its own parsing. The body is always decoded as
//! lossy UTF-8, so `raw` is not byte-exact for binary or non-UTF-8 content.
use super::spec::{
ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, optional_u64,
};
use async_trait::async_trait;
use regex::Regex;
use serde::Serialize;
use serde_json::{Value, json};
use std::sync::OnceLock;
use std::time::Duration;
/// Body-size cap applied when the caller does not pass `max_bytes`.
const DEFAULT_MAX_BYTES: u64 = 1_000_000;
/// Upper bound that a caller-supplied `max_bytes` is clamped to (10 MiB).
const HARD_MAX_BYTES: u64 = 10 * 1024 * 1024;
/// Request timeout, in milliseconds, applied when the caller does not pass `timeout_ms`.
const DEFAULT_TIMEOUT_MS: u64 = 15_000;
/// Upper bound, in milliseconds, that a caller-supplied `timeout_ms` is clamped to.
const HARD_MAX_TIMEOUT_MS: u64 = 60_000;
/// Limit handed to the HTTP client's redirect policy.
const MAX_REDIRECTS: usize = 5;
/// `User-Agent` header value sent with every request made by this tool.
const USER_AGENT: &str =
"Mozilla/5.0 (compatible; deepseek-tui/0.5; +https://github.com/Hmbown/DeepSeek-TUI)";
// Lazily initialised regex caches; each one is filled on first use by the
// accessor function of the matching name below.
static SCRIPT_RE: OnceLock<Regex> = OnceLock::new();
static STYLE_RE: OnceLock<Regex> = OnceLock::new();
static TAG_RE: OnceLock<Regex> = OnceLock::new();
static WHITESPACE_RE: OnceLock<Regex> = OnceLock::new();
/// Shared regex matching an entire `<script …>…</script>` block
/// (case-insensitive, `.` also matches newlines, shortest match).
///
/// Compiled on first call and cached in `SCRIPT_RE` afterwards.
fn script_re() -> &'static Regex {
    fn compile() -> Regex {
        Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("script re")
    }
    SCRIPT_RE.get_or_init(compile)
}
/// Shared regex matching an entire `<style …>…</style>` block
/// (case-insensitive, `.` also matches newlines, shortest match).
///
/// Compiled on first call and cached in `STYLE_RE` afterwards.
fn style_re() -> &'static Regex {
    fn compile() -> Regex {
        Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("style re")
    }
    STYLE_RE.get_or_init(compile)
}
/// Shared regex matching any single `<…>` tag.
///
/// Compiled on first call and cached in `TAG_RE` afterwards.
fn tag_re() -> &'static Regex {
    fn compile() -> Regex {
        Regex::new(r"<[^>]+>").expect("tag re")
    }
    TAG_RE.get_or_init(compile)
}
/// Shared regex matching a run of one or more whitespace characters.
///
/// Compiled on first call and cached in `WHITESPACE_RE` afterwards.
fn whitespace_re() -> &'static Regex {
    fn compile() -> Regex {
        Regex::new(r"\s+").expect("ws re")
    }
    WHITESPACE_RE.get_or_init(compile)
}
/// Post-processing mode requested for the response body.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Format {
    /// Strip HTML down to readable text.
    Text,
    /// Default mode; handled the same way as `Text` by the fetch tool.
    Markdown,
    /// Return the decoded body without HTML stripping.
    Raw,
}

impl Format {
    /// Parse the optional `format` argument.
    ///
    /// A missing value selects `Markdown`. The value is trimmed and compared
    /// case-insensitively (ASCII), and a few aliases are accepted for each
    /// mode.
    ///
    /// # Errors
    ///
    /// Returns an invalid-input `ToolError` naming the normalised value when
    /// it matches none of the known modes or aliases.
    fn parse(value: Option<&str>) -> Result<Self, ToolError> {
        let normalized = match value {
            Some(raw) => raw.trim().to_ascii_lowercase(),
            None => String::from("markdown"),
        };

        let format = match normalized.as_str() {
            "markdown" | "md" => Self::Markdown,
            "text" | "txt" | "plain" => Self::Text,
            "raw" | "html" | "bytes" => Self::Raw,
            other => {
                return Err(ToolError::invalid_input(format!(
                    "unknown format `{other}` (allowed: text, markdown, raw)"
                )));
            }
        };
        Ok(format)
    }
}
/// Payload serialized as the tool's output for a completed request.
#[derive(Debug, Serialize)]
struct FetchResponse {
/// URL reported by the HTTP response object (taken from `resp.url()`).
url: String,
/// Numeric HTTP status code of the response.
status: u16,
/// `Content-Type` header value, or `application/octet-stream` when the
/// header is missing or not valid text.
content_type: String,
/// Response body after lossy UTF-8 decoding and the requested post-processing.
content: String,
/// `true` when the body was longer than the effective `max_bytes` and was cut.
truncated: bool,
}
/// Stateless `fetch_url` tool: performs a direct HTTP GET on a caller-supplied URL.
pub struct FetchUrlTool;
#[async_trait]
impl ToolSpec for FetchUrlTool {
    /// Name under which the tool is exposed.
    fn name(&self) -> &'static str {
        "fetch_url"
    }

    /// Human/model-facing summary of when to use the tool.
    fn description(&self) -> &'static str {
        "Fetch a known URL directly (HTTP GET) and return its content. Use this when the user gives a URL or you already know the canonical link — it's faster and more reliable than web_search for known pages."
    }

    /// JSON schema of the accepted arguments; only `url` is required.
    fn input_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Absolute HTTP/HTTPS URL to fetch."
                },
                "format": {
                    "type": "string",
                    "enum": ["text", "markdown", "raw"],
                    "description": "Post-processing for the response body. `markdown` (default) and `text` strip HTML tags to readable text; `raw` returns the body bytes as-is."
                },
                "max_bytes": {
                    "type": "integer",
                    "description": "Truncate response body after this many bytes (default 1,000,000; hard max 10,485,760)."
                },
                "timeout_ms": {
                    "type": "integer",
                    "description": "Request timeout in milliseconds (default 15,000; max 60,000)."
                }
            },
            "required": ["url"]
        })
    }

    /// The tool only reads remote data and needs network access.
    fn capabilities(&self) -> Vec<ToolCapability> {
        vec![ToolCapability::ReadOnly, ToolCapability::Network]
    }

    /// Runs without an explicit approval prompt.
    fn approval_requirement(&self) -> ApprovalRequirement {
        ApprovalRequirement::Auto
    }

    /// Perform the GET request and return a serialized `FetchResponse`.
    ///
    /// Arguments read from `input`: `url` (required), `format`, `max_bytes`
    /// and `timeout_ms` (optional; the latter two are clamped to
    /// `HARD_MAX_BYTES` / `HARD_MAX_TIMEOUT_MS`).
    ///
    /// Non-2xx responses are still returned as `Ok`, with `success: false`,
    /// so the caller can inspect the body.
    ///
    /// # Errors
    ///
    /// * invalid input — `url` missing, empty, or not `http://` / `https://`;
    ///   unknown `format`.
    /// * execution failed — the client could not be built, the request
    ///   failed, the body could not be read, or the result could not be
    ///   serialized.
    async fn execute(&self, input: Value, _context: &ToolContext) -> Result<ToolResult, ToolError> {
        let url = input
            .get("url")
            .and_then(Value::as_str)
            .ok_or_else(|| ToolError::invalid_input("`url` is required"))?
            .trim()
            .to_string();
        if url.is_empty() {
            return Err(ToolError::invalid_input("`url` cannot be empty"));
        }

        // URL schemes are case-insensitive, so `HTTPS://example.com` must be
        // accepted as well. `get(..len)` avoids slicing inside a multi-byte
        // character when the input is not ASCII.
        let scheme_ok = ["http://", "https://"].iter().any(|scheme| {
            url.get(..scheme.len())
                .is_some_and(|prefix| prefix.eq_ignore_ascii_case(scheme))
        });
        if !scheme_ok {
            return Err(ToolError::invalid_input(
                "only http:// and https:// URLs are supported",
            ));
        }

        // NOTE(review): this tool is auto-approved and will fetch any host the
        // process can reach, including loopback, private-network and
        // cloud-metadata addresses. Consider an address filter if that is a
        // concern for the deployment.

        let format = Format::parse(input.get("format").and_then(Value::as_str))?;
        let max_bytes = optional_u64(&input, "max_bytes", DEFAULT_MAX_BYTES).min(HARD_MAX_BYTES);
        let timeout_ms =
            optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(HARD_MAX_TIMEOUT_MS);

        let client = reqwest::Client::builder()
            .timeout(Duration::from_millis(timeout_ms))
            .user_agent(USER_AGENT)
            .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
            .build()
            .map_err(|e| {
                ToolError::execution_failed(format!("failed to build HTTP client: {e}"))
            })?;

        let mut resp = client
            .get(&url)
            .header("Accept", "text/html,text/plain,application/json,*/*;q=0.5")
            .header("Accept-Language", "en-US,en;q=0.5")
            .send()
            .await
            .map_err(|e| ToolError::execution_failed(format!("request failed: {e}")))?;

        let final_url = resp.url().to_string();
        let status = resp.status();
        let content_type = resp
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|v| v.to_str().ok())
            .unwrap_or("application/octet-stream")
            .to_string();

        // Read the body chunk by chunk and stop as soon as the cap is hit, so
        // an oversized response is never buffered in full. `truncated` is set
        // exactly when the body holds more than `max_bytes` bytes.
        let cap = usize::try_from(max_bytes).unwrap_or(usize::MAX);
        let mut body: Vec<u8> = Vec::new();
        let mut truncated = false;
        while let Some(chunk) = resp
            .chunk()
            .await
            .map_err(|e| ToolError::execution_failed(format!("failed to read body: {e}")))?
        {
            let remaining = cap - body.len();
            if chunk.len() > remaining {
                body.extend_from_slice(&chunk[..remaining]);
                truncated = true;
                break;
            }
            body.extend_from_slice(&chunk);
        }

        // Lossy decoding: invalid UTF-8 (including a multi-byte character cut
        // by truncation) becomes U+FFFD instead of failing the call.
        let body_text = String::from_utf8_lossy(&body).into_owned();
        let processed = match format {
            Format::Raw => body_text,
            Format::Text | Format::Markdown => {
                // Media types are case-insensitive; fall back to sniffing the
                // body when the header does not say HTML.
                let header_says_html = content_type.to_ascii_lowercase().contains("text/html");
                if header_says_html || body_text.contains("<html") {
                    html_to_text(&body_text)
                } else {
                    body_text
                }
            }
        };

        let response = FetchResponse {
            url: final_url,
            status: status.as_u16(),
            content_type,
            content: processed,
            truncated,
        };

        if !status.is_success() {
            // Don't `Err` on 4xx/5xx — the caller often wants to see the body
            // (e.g. a JSON error envelope). Mark the result as a failure so the
            // engine renders it as such.
            return Ok(ToolResult {
                content: serde_json::to_string_pretty(&response).map_err(|e| {
                    ToolError::execution_failed(format!("failed to serialize response: {e}"))
                })?,
                success: false,
                metadata: None,
            });
        }

        ToolResult::json(&response)
            .map_err(|e| ToolError::execution_failed(format!("failed to serialize response: {e}")))
    }
}
/// Reduce an HTML document to plain readable text.
///
/// Steps, in order: remove `<script>` and `<style>` blocks entirely, replace
/// every remaining tag with a space, decode the common entities, then
/// collapse each whitespace run to a single space and trim the ends. This is
/// deliberately a rough text extraction, not an HTML-to-Markdown conversion.
fn html_to_text(html: &str) -> String {
    let stripped = script_re().replace_all(html, "");
    let stripped = style_re().replace_all(&stripped, "");
    let stripped = tag_re().replace_all(&stripped, " ");
    let text = decode_entities(&stripped);
    let collapsed = whitespace_re().replace_all(&text, " ");
    collapsed.trim().to_string()
}
/// Decode the handful of HTML entities we expect to hit in stripped text.
/// Pulling in `html-escape` for the long tail isn't worth the dep weight.
///
/// The input is scanned once, left to right, and each recognised entity is
/// decoded exactly one time. Chained `replace` calls would decode
/// `&amp;lt;` twice (to `<`) instead of yielding the intended `&lt;`.
/// Unrecognised entities and bare `&` characters are copied through
/// unchanged.
fn decode_entities(s: &str) -> String {
    const ENTITIES: [(&str, &str); 7] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        let tail = &rest[pos..];
        match ENTITIES.iter().find(|(name, _)| tail.starts_with(*name)) {
            Some((name, replacement)) => {
                out.push_str(replacement);
                rest = &tail[name.len()..];
            }
            None => {
                // Not an entity we know: keep the ampersand and move past it.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tools::spec::ToolContext;
    use std::path::PathBuf;

    /// Tool context rooted at the current directory; none of these tests
    /// reach the network, so the context is never really used.
    fn ctx() -> ToolContext {
        ToolContext::new(PathBuf::from("."))
    }

    /// Script and style bodies vanish, markup is dropped, entities are decoded.
    #[test]
    fn html_to_text_strips_scripts_styles_and_tags() {
        let html = r#"
<html>
<head>
<style>body { color: red; }</style>
<script>alert("nope");</script>
</head>
<body>
<h1>Hello &amp; welcome</h1>
<p>This is <b>important</b>.</p>
</body>
</html>
"#;
        let text = html_to_text(html);

        for kept in ["Hello & welcome", "This is important"] {
            assert!(text.contains(kept));
        }
        for dropped in ["alert", "color: red"] {
            assert!(!text.contains(dropped));
        }
    }

    /// Aliases and the missing-value default resolve; unknown names fail.
    #[test]
    fn format_parse_accepts_aliases_and_rejects_unknown() {
        let accepted = [
            (Some("markdown"), Format::Markdown),
            (Some("MD"), Format::Markdown),
            (Some("text"), Format::Text),
            (Some("raw"), Format::Raw),
            (None, Format::Markdown),
        ];
        for (raw, expected) in accepted {
            assert_eq!(Format::parse(raw).unwrap(), expected);
        }

        assert!(Format::parse(Some("yaml")).is_err());
    }

    /// A `file://` URL is refused before any request is made.
    #[tokio::test]
    async fn rejects_non_http_schemes() {
        let args = json!({"url": "file:///etc/passwd"});
        let err = FetchUrlTool.execute(args, &ctx()).await.unwrap_err();
        assert!(format!("{err:?}").contains("http"));
    }

    /// A whitespace-only URL trims to empty and is refused.
    #[tokio::test]
    async fn rejects_empty_url() {
        let outcome = FetchUrlTool.execute(json!({"url": " "}), &ctx()).await;
        assert!(outcome.is_err());
    }

    /// Omitting `url` altogether is refused.
    #[tokio::test]
    async fn rejects_missing_url() {
        let outcome = FetchUrlTool.execute(json!({}), &ctx()).await;
        assert!(outcome.is_err());
    }
}
+1
View File
@@ -6,6 +6,7 @@ pub mod file;
pub mod file_search;
pub mod finance;
pub mod fetch_url;
pub mod git;
pub mod git_history;
pub mod parallel;
+2
View File
@@ -345,10 +345,12 @@ impl ToolRegistryBuilder {
/// Register the web-facing tools: `WebSearchTool`, `FetchUrlTool`,
/// `FinanceTool` and `WebRunTool`, in that order.
#[must_use]
pub fn with_web_tools(self) -> Self {
    use super::{
        fetch_url::FetchUrlTool, finance::FinanceTool, web_run::WebRunTool,
        web_search::WebSearchTool,
    };

    let builder = self.with_tool(Arc::new(WebSearchTool));
    let builder = builder.with_tool(Arc::new(FetchUrlTool));
    let builder = builder.with_tool(Arc::new(FinanceTool::new()));
    builder.with_tool(Arc::new(WebRunTool))
}