From ba40ae4aac1714e9f9a841b54a74b58af98dd374 Mon Sep 17 00:00:00 2001
From: Hunter Bown <hmbown@gmail.com>
Date: Sat, 25 Apr 2026 13:36:30 -0500
Subject: [PATCH] feat(#34): auto-extract text from PDFs in read_file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`read_file` now detects PDFs by extension or `%PDF-` magic bytes and
shells out to `pdftotext -layout` (poppler) to return plain text
directly to the model. New optional `pages` arg accepts `N` or `N-M`
slices so big papers can be read in pieces without burning context.

When `pdftotext` isn't on `$PATH`, the tool returns a structured
`{type: "binary_unavailable", kind: "pdf", reason, hint}` payload with
install hints (`brew install poppler` / `apt install poppler-utils`)
instead of crashing or returning UTF-8 garbage from a binary file.

Tests cover extension detection (case-insensitive), magic-byte sniffing
on extension-less files, the negative case for plain text, the pages
arg parser (single, range, whitespace, invalid forms), and the
binary_unavailable branch when `pdftotext` is absent.

.docx / .epub / .html stripping deferred — same dispatch can take more
extractors later.

Closes #34.
---
 crates/tui/src/tools/file.rs | 184 ++++++++++++++++++++++++++++++++++-
 1 file changed, 183 insertions(+), 1 deletion(-)

diff --git a/crates/tui/src/tools/file.rs b/crates/tui/src/tools/file.rs
index 53b62e4b..1da978a4 100644
--- a/crates/tui/src/tools/file.rs
+++ b/crates/tui/src/tools/file.rs
@@ -10,6 +10,8 @@ use super::spec::{
 use async_trait::async_trait;
 use serde_json::{Value, json};
 use std::fs;
+use std::path::Path;
+use std::process::{Command, Stdio};
 
 // === ReadFileTool ===
 
@@ -23,7 +25,7 @@ impl ToolSpec for ReadFileTool {
     }
 
     fn description(&self) -> &'static str {
-        "Read a UTF-8 file from the workspace."
+        "Read a file from the workspace. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available."
     }
 
     fn input_schema(&self) -> Value {
@@ -33,6 +35,10 @@ impl ToolSpec for ReadFileTool {
                 "path": {
                     "type": "string",
                     "description": "Path to the file (relative to workspace or absolute)"
+                },
+                "pages": {
+                    "type": "string",
+                    "description": "PDF only: page range to extract, e.g. \"1-5\" or \"10\". Ignored for non-PDF files."
                 }
             },
             "required": ["path"]
@@ -50,6 +56,11 @@ impl ToolSpec for ReadFileTool {
     async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
         let path_str = required_str(&input, "path")?;
         let file_path = context.resolve_path(path_str)?;
+        let pages = optional_str(&input, "pages");
+
+        if is_pdf(&file_path)? {
+            return read_pdf(&file_path, pages);
+        }
 
         let contents = fs::read_to_string(&file_path).map_err(|e| {
             ToolError::execution_failed(format!("Failed to read {}: {}", file_path.display(), e))
@@ -59,6 +70,114 @@ impl ToolSpec for ReadFileTool {
     }
 }
 
+/// Detect a PDF by extension OR by sniffing the `%PDF-` magic bytes.
+/// Files without an extension are still recognized as PDFs when the header
+/// matches.
+fn is_pdf(path: &Path) -> Result<bool, ToolError> {
+    if path
+        .extension()
+        .and_then(|e| e.to_str())
+        .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
+    {
+        return Ok(true);
+    }
+    // Sniff first 4 bytes. Don't error if the file doesn't exist — let the
+    // caller's `read_to_string` produce the canonical not-found error.
+    let mut buf = [0u8; 4];
+    let result = match fs::File::open(path) {
+        Ok(mut f) => {
+            use std::io::Read;
+            f.read_exact(&mut buf).map(|_| buf)
+        }
+        Err(_) => return Ok(false),
+    };
+    Ok(matches!(result, Ok(b) if &b == b"%PDF"))
+}
+
+fn parse_pages_arg(spec: &str) -> Option<(u32, u32)> {
+    let trimmed = spec.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+    if let Some((a, b)) = trimmed.split_once('-') {
+        let start: u32 = a.trim().parse().ok()?;
+        let end: u32 = b.trim().parse().ok()?;
+        if start == 0 || end < start {
+            return None;
+        }
+        Some((start, end))
+    } else {
+        let n: u32 = trimmed.parse().ok()?;
+        if n == 0 {
+            return None;
+        }
+        Some((n, n))
+    }
+}
+
+fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
+    // Try pdftotext (from the poppler suite). Other extractors (mutool,
+    // pdfminer) could be added later behind the same dispatch.
+    let mut cmd = Command::new("pdftotext");
+    cmd.arg("-layout");
+
+    if let Some(spec) = pages {
+        match parse_pages_arg(spec) {
+            Some((start, end)) => {
+                cmd.arg("-f").arg(start.to_string());
+                cmd.arg("-l").arg(end.to_string());
+            }
+            None => {
+                return Err(ToolError::invalid_input(format!(
+                    "invalid `pages` value `{spec}` (expected `N` or `N-M`, e.g. `1-5`)"
+                )));
+            }
+        }
+    }
+
+    cmd.arg(path).arg("-"); // output to stdout
+    cmd.stdin(Stdio::null())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped());
+
+    let child = match cmd.spawn() {
+        Ok(c) => c,
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+            // Structured "binary unavailable" — caller knows what to suggest.
+            return ToolResult::json(&json!({
+                "type": "binary_unavailable",
+                "path": path.display().to_string(),
+                "kind": "pdf",
+                "reason": "pdftotext not installed",
+                "hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`)"
+            }))
+            .map_err(|e| {
+                ToolError::execution_failed(format!("failed to serialize response: {e}"))
+            });
+        }
+        Err(e) => {
+            return Err(ToolError::execution_failed(format!(
+                "failed to launch pdftotext: {e}"
+            )));
+        }
+    };
+
+    let output = child
+        .wait_with_output()
+        .map_err(|e| ToolError::execution_failed(format!("pdftotext failed to complete: {e}")))?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
+        return Err(ToolError::execution_failed(format!(
+            "pdftotext failed (exit {:?}): {stderr}",
+            output.status.code()
+        )));
+    }
+
+    let text = String::from_utf8_lossy(&output.stdout).to_string();
+    Ok(ToolResult::success(text))
+}
+
 // === WriteFileTool ===
 
 /// Tool for writing UTF-8 files to the workspace.
@@ -330,6 +449,69 @@ mod tests {
         );
     }
 
+    #[test]
+    fn pdf_detected_by_extension() {
+        let tmp = tempdir().expect("tempdir");
+        let path = tmp.path().join("paper.PDF");
+        fs::write(&path, b"not really a pdf, but extension says yes").unwrap();
+        assert!(is_pdf(&path).unwrap());
+    }
+
+    #[test]
+    fn pdf_detected_by_magic_bytes_without_extension() {
+        let tmp = tempdir().expect("tempdir");
+        let path = tmp.path().join("blob");
+        fs::write(&path, b"%PDF-1.7\nrest of bytes").unwrap();
+        assert!(is_pdf(&path).unwrap());
+    }
+
+    #[test]
+    fn non_pdf_not_detected() {
+        let tmp = tempdir().expect("tempdir");
+        let path = tmp.path().join("notes.txt");
+        fs::write(&path, "hello").unwrap();
+        assert!(!is_pdf(&path).unwrap());
+    }
+
+    #[test]
+    fn pages_arg_parses_single_and_range() {
+        assert_eq!(parse_pages_arg("5"), Some((5, 5)));
+        assert_eq!(parse_pages_arg("1-10"), Some((1, 10)));
+        assert_eq!(parse_pages_arg(" 3 - 7 "), Some((3, 7)));
+        assert_eq!(parse_pages_arg("0"), None);
+        assert_eq!(parse_pages_arg("10-3"), None);
+        assert_eq!(parse_pages_arg(""), None);
+        assert_eq!(parse_pages_arg("abc"), None);
+    }
+
+    #[tokio::test]
+    async fn read_file_returns_binary_unavailable_when_pdftotext_missing() {
+        // We can't reliably remove pdftotext from $PATH in a test, but if
+        // it's missing on the runner this test exercises that branch. If
+        // it's installed, the test exits early — covered by the parse_pages
+        // and is_pdf tests above.
+        if Command::new("pdftotext")
+            .arg("-v")
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .status()
+            .is_ok()
+        {
+            return;
+        }
+        let tmp = tempdir().expect("tempdir");
+        let path = tmp.path().join("doc.pdf");
+        fs::write(&path, b"%PDF-1.7\n%%EOF").unwrap();
+        let ctx = ToolContext::new(tmp.path().to_path_buf());
+        let result = ReadFileTool
+            .execute(json!({"path": "doc.pdf"}), &ctx)
+            .await
+            .expect("structured response, not error");
+        assert!(result.success);
+        assert!(result.content.contains("binary_unavailable"));
+        assert!(result.content.contains("pdftotext"));
+    }
+
     #[tokio::test]
     async fn test_write_file_tool() {
         let tmp = tempdir().expect("tempdir");