feat(tools): swap read_file PDF path to bundled pdf-extract; pdftotext now opt-in

Before v0.8.32 the PDF branch in `read_file` shelled out to `pdftotext` (Poppler), which meant first-time users on hosts without it saw the tool return a `binary_unavailable` sentinel and had to install Poppler before the model could open a PDF. The `pdf-extract` crate already powered URL-fetched PDFs in `web_run`, so this change reuses it for the local PDF path too — extraction is now zero-dependency on every supported host. The `pages` parameter still filters by 1-indexed inclusive page range; both whole-file and per-page variants run with no system dependency. Users with column-heavy or complex-table PDFs (academic papers, financial filings) where `pdftotext -layout` still wins can opt into the external path with `prefer_external_pdftotext = true` in `~/.config/deepseek/settings.toml`. When set, the previous Poppler dispatch returns — including the `binary_unavailable` install hint when the binary is missing on PATH. `deepseek doctor` reframes `pdftotext` as optional and explains the opt-in instead of treating it as a missing dependency. Tests round-trip a checked-in academic PDF through the new path and verify the `pages` window still slices correctly. The pre-v0.8.32 "binary_unavailable on missing pdftotext" branch is exercised against the new opt-in setting via a serialised env-var guard. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 00:21:24 -05:00
parent c188cade88
commit f94bedf1ea
4 changed files with 338 additions and 52 deletions
@@ -2127,26 +2127,59 @@ async fn run_doctor(config: &Config, workspace: &Path, config_path_override: Opt
        }
    }

+    // PDF reader: pure-Rust `pdf-extract` is the v0.8.32 default, so
+    // `pdftotext` is no longer required for `read_file` to handle PDFs.
+    // We still surface its presence (a) so users with column-heavy PDFs
+    // know they can opt in via `prefer_external_pdftotext = true`, and
+    // (b) so users who *did* opt in get a clean signal when the binary
+    // is missing rather than discovering it on the next PDF read.
+    let prefer_external = crate::settings::Settings::load()
+        .map(|s| s.prefer_external_pdftotext)
+        .unwrap_or(false);
    match crate::dependencies::resolve_pdftotext() {
-        Some(_) => println!(
-            "  {} pdftotext: available → read_file extracts PDF text",
-            "✓".truecolor(aqua_r, aqua_g, aqua_b),
-        ),
+        Some(_) => {
+            if prefer_external {
+                println!(
+                    "  {} pdftotext: available → read_file routes PDFs through Poppler (prefer_external_pdftotext = true)",
+                    "✓".truecolor(aqua_r, aqua_g, aqua_b),
+                );
+            } else {
+                println!(
+                    "  {} pdftotext: available (optional — pure-Rust extractor is the default in v0.8.32)",
+                    "✓".truecolor(aqua_r, aqua_g, aqua_b),
+                );
+                println!(
+                    "    Set `prefer_external_pdftotext = true` in settings.toml for column-heavy PDFs."
+                );
+            }
+        }
        None => {
-            println!(
-                "  {} pdftotext: not found → read_file falls back to a not-supported error for .pdf files",
-                "!".truecolor(sky_r, sky_g, sky_b),
-            );
-            println!("    (Text files still work without pdftotext; this only affects PDF reads.)");
-            match std::env::consts::OS {
-                "macos" => println!("    Install via: brew install poppler"),
-                "linux" => {
-                    println!("    Install via: sudo apt install poppler-utils   (Debian/Ubuntu)")
+            if prefer_external {
+                println!(
+                    "  {} pdftotext: not found, but `prefer_external_pdftotext = true` is set → PDF reads will return `binary_unavailable`",
+                    "✗".truecolor(red_r, red_g, red_b),
+                );
+                println!(
+                    "    Either install Poppler or unset `prefer_external_pdftotext` to fall back to the bundled pure-Rust extractor."
+                );
+                match std::env::consts::OS {
+                    "macos" => println!("    Install via: brew install poppler"),
+                    "linux" => println!(
+                        "    Install via: sudo apt install poppler-utils   (Debian/Ubuntu)"
+                    ),
+                    "windows" => println!(
+                        "    Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/"
+                    ),
+                    _ => {}
                }
-                "windows" => println!(
-                    "    Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/"
-                ),
-                _ => {}
+            } else {
+                println!(
+                    "  {} pdftotext: not found (optional — pure-Rust extractor is the default in v0.8.32)",
+                    "·".dimmed(),
+                );
+                println!(
+                    "    Install Poppler only if you want to opt into pdftotext for column-heavy PDFs."
+                );
            }
        }
    }
@@ -253,6 +253,17 @@ pub struct Settings {
    ///   *do* support DEC 2026; it is purely a rendering-quality knob,
    ///   not a correctness one.
    pub synchronized_output: String,
+    /// Prefer the external `pdftotext` binary (Poppler) over the bundled
+    /// pure-Rust `pdf-extract` extractor for PDF reads in `read_file`.
+    /// Pure-Rust extraction is the v0.8.32 default because it removes the
+    /// install-poppler-first hurdle most users hit, but `pdftotext -layout`
+    /// still wins for column-heavy or complex-table PDFs (academic papers
+    /// laid out in two columns, financial filings, etc.). Set to `true` to
+    /// route every PDF read through `pdftotext` instead — when the binary
+    /// is missing in that mode the tool returns the structured
+    /// `binary_unavailable` response with an install hint, matching the
+    /// pre-v0.8.32 behavior.
+    pub prefer_external_pdftotext: bool,
 }

 impl Default for Settings {
@@ -292,6 +303,7 @@ impl Default for Settings {
            provider_models: None,
            status_indicator: "whale".to_string(),
            synchronized_output: "auto".to_string(),
+            prefer_external_pdftotext: false,
        }
    }
 }
@@ -513,6 +525,9 @@ impl Settings {
                }
                self.synchronized_output = normalized.to_string();
            }
+            "prefer_external_pdftotext" | "external_pdftotext" | "pdftotext" => {
+                self.prefer_external_pdftotext = parse_bool(value)?;
+            }
            "default_mode" | "mode" => {
                let normalized = normalize_mode(value);
                if !["agent", "plan", "yolo"].contains(&normalized) {
@@ -629,6 +644,10 @@ impl Settings {
            "  synchronized_output: {}",
            self.synchronized_output
        ));
+        lines.push(format!(
+            "  prefer_external_pdftotext: {}",
+            self.prefer_external_pdftotext
+        ));
        lines.push(format!("  default_mode:       {}", self.default_mode));
        lines.push(format!(
            "  sidebar_width:      {}%",
@@ -705,6 +724,10 @@ impl Settings {
                "synchronized_output",
                "DEC 2026 synchronized output: auto, on, off (set off if your terminal flickers)",
            ),
+            (
+                "prefer_external_pdftotext",
+                "Route PDF reads through Poppler's pdftotext instead of the bundled pure-Rust extractor: on/off (default off)",
+            ),
            ("default_mode", "Default mode: agent, plan, yolo"),
            ("sidebar_width", "Sidebar width percentage: 10-50"),
            (
@@ -26,7 +26,7 @@ impl ToolSpec for ReadFileTool {
    }

    fn description(&self) -> &'static str {
-        "Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available. Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files."
+        "Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via the bundled pure-Rust extractor (no Poppler install required). Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files."
    }

    fn input_schema(&self) -> Value {
@@ -234,23 +234,87 @@ fn parse_pages_arg(spec: &str) -> Option<(u32, u32)> {
 }

 fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
-    // Try pdftotext (from the poppler suite). Other extractors (mutool,
-    // pdfminer) could be added later behind the same dispatch.
-    let mut cmd = Command::new("pdftotext");
-    cmd.arg("-layout");
-
-    if let Some(spec) = pages {
-        match parse_pages_arg(spec) {
-            Some((start, end)) => {
-                cmd.arg("-f").arg(start.to_string());
-                cmd.arg("-l").arg(end.to_string());
-            }
+    // Validate the `pages` spec once, up front, so both extractor paths
+    // surface the same error shape on bad input.
+    let page_range = match pages {
+        Some(spec) => match parse_pages_arg(spec) {
+            Some((start, end)) => Some((start, end)),
            None => {
                return Err(ToolError::invalid_input(format!(
                    "invalid `pages` value `{spec}` (expected `N` or `N-M`, e.g. `1-5`)"
                )));
            }
+        },
+        None => None,
+    };
+
+    // Default to the bundled pure-Rust `pdf-extract` reader: it removes
+    // the install-poppler prerequisite that bit every new user, and the
+    // crate is already a workspace dep (used by `web_run`'s URL fetch
+    // path). Users with column-heavy / complex-table PDFs (academic
+    // papers, financial filings) can opt into the historical
+    // `pdftotext -layout` route by setting
+    // `prefer_external_pdftotext = true` in `~/.config/deepseek/settings.toml`.
+    let prefer_external = crate::settings::Settings::load()
+        .map(|s| s.prefer_external_pdftotext)
+        .unwrap_or(false);
+
+    if prefer_external {
+        read_pdf_via_pdftotext(path, page_range)
+    } else {
+        read_pdf_via_pdf_extract(path, page_range)
+    }
+}
+
+fn read_pdf_via_pdf_extract(
+    path: &Path,
+    page_range: Option<(u32, u32)>,
+) -> Result<ToolResult, ToolError> {
+    let text = if let Some((start, end)) = page_range {
+        // Page-by-page extraction so we can slice the requested window
+        // without dragging every page through the caller's context.
+        // pdf-extract returns pages in document order; `start`/`end` are
+        // 1-indexed inclusive (validated above), so we convert to a
+        // 0-indexed half-open slice with bounds clamping.
+        let pages = pdf_extract::extract_text_by_pages(path).map_err(|e| {
+            ToolError::execution_failed(format!(
+                "pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
+                path.display()
+            ))
+        })?;
+        let total = pages.len();
+        if total == 0 {
+            String::new()
+        } else {
+            let start_idx = (start as usize).saturating_sub(1).min(total);
+            let end_idx = (end as usize).min(total);
+            if start_idx >= end_idx {
+                String::new()
+            } else {
+                pages[start_idx..end_idx].join("\n")
+            }
        }
+    } else {
+        pdf_extract::extract_text(path).map_err(|e| {
+            ToolError::execution_failed(format!(
+                "pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
+                path.display()
+            ))
+        })?
+    };
+    Ok(ToolResult::success(text))
+}
+
+fn read_pdf_via_pdftotext(
+    path: &Path,
+    page_range: Option<(u32, u32)>,
+) -> Result<ToolResult, ToolError> {
+    let mut cmd = Command::new("pdftotext");
+    cmd.arg("-layout");
+
+    if let Some((start, end)) = page_range {
+        cmd.arg("-f").arg(start.to_string());
+        cmd.arg("-l").arg(end.to_string());
    }

    cmd.arg(path).arg("-"); // output to stdout
@@ -261,13 +325,15 @@ fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
    let child = match cmd.spawn() {
        Ok(c) => c,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-            // Structured "binary unavailable" — caller knows what to suggest.
+            // Structured "binary unavailable" — only reachable when the
+            // user explicitly opted into the external path. Hints back at
+            // both the install command and the in-tree default.
            return ToolResult::json(&json!({
                "type": "binary_unavailable",
                "path": path.display().to_string(),
                "kind": "pdf",
-                "reason": "pdftotext not installed",
-                "hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`)"
+                "reason": "pdftotext not installed (prefer_external_pdftotext = true in settings)",
+                "hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`) — or unset `prefer_external_pdftotext` to use the bundled pure-Rust extractor"
            }))
            .map_err(|e| {
                ToolError::execution_failed(format!("failed to serialize response: {e}"))
@@ -850,32 +916,168 @@ mod tests {
        assert_eq!(parse_pages_arg("abc"), None);
    }

+    /// Sample PDF shipped with the repo for parity tests against the
+    /// pure-Rust extractor. 38 pages, born-digital LaTeX (arXiv 2512.24601).
+    /// Path is workspace-root-relative because the fixture lives outside
+    /// the tui crate.
+    const SAMPLE_PDF_PATH: &str = "../../docs/2512.24601v2.pdf";
+
+    fn sample_pdf_present() -> bool {
+        std::path::Path::new(SAMPLE_PDF_PATH).exists()
+    }
+
+    #[test]
+    fn read_pdf_via_pdf_extract_finds_known_title() {
+        // Skip when the fixture isn't checked out (sparse clones, shallow
+        // worktrees). Local dev + CI both have it.
+        if !sample_pdf_present() {
+            // Fixture not present (sparse / shallow checkout). Silent
+            // skip — `cargo test` reports the same `ok` either way.
+            return;
+        }
+        let path = std::path::PathBuf::from(SAMPLE_PDF_PATH);
+        let result = read_pdf_via_pdf_extract(&path, None).expect("extract whole PDF");
+        assert!(result.success);
+        assert!(
+            result.content.contains("Recursive Language Models"),
+            "pdf-extract should recover the document title; got prefix {:?}",
+            &result.content.chars().take(200).collect::<String>()
+        );
+    }
+
+    #[test]
+    fn read_pdf_via_pdf_extract_respects_pages_window() {
+        if !sample_pdf_present() {
+            // Fixture not present (sparse / shallow checkout). Silent
+            // skip — `cargo test` reports the same `ok` either way.
+            return;
+        }
+        let path = std::path::PathBuf::from(SAMPLE_PDF_PATH);
+        let single = read_pdf_via_pdf_extract(&path, Some((1, 1))).expect("single page");
+        let two = read_pdf_via_pdf_extract(&path, Some((1, 2))).expect("two pages");
+        assert!(single.success);
+        assert!(two.success);
+        // A two-page slice must be at least as long as the one-page slice
+        // (most documents have non-trivial body text past page 1).
+        assert!(
+            two.content.len() >= single.content.len(),
+            "expected pages 1-2 ({} bytes) >= page 1 ({} bytes)",
+            two.content.len(),
+            single.content.len()
+        );
+        // Title text lives on page 1 — must survive the window crop.
+        assert!(single.content.contains("Recursive Language Models"));
+    }
+
    #[tokio::test]
-    async fn read_file_returns_binary_unavailable_when_pdftotext_missing() {
-        // We can't reliably remove pdftotext from $PATH in a test, but if
-        // it's missing on the runner this test exercises that branch. If
-        // it's installed, the test exits early — covered by the parse_pages
-        // and is_pdf tests above.
-        if Command::new("pdftotext")
+    async fn read_file_pdf_path_uses_pdf_extract_by_default() {
+        if !sample_pdf_present() {
+            // Fixture not present (sparse / shallow checkout). Silent
+            // skip — `cargo test` reports the same `ok` either way.
+            return;
+        }
+        // The fixture lives outside the tui crate, so we point ToolContext
+        // at the workspace root and read by relative path. This exercises
+        // the full ReadFileTool::execute → is_pdf → read_pdf dispatch on
+        // the bundled extractor (no pdftotext required on the test host).
+        let workspace = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../");
+        let ctx = ToolContext::new(workspace);
+        let result = ReadFileTool
+            .execute(json!({"path": "docs/2512.24601v2.pdf", "pages": "1"}), &ctx)
+            .await
+            .expect("execute");
+        assert!(result.success);
+        assert!(
+            result.content.contains("Recursive Language Models"),
+            "page-1 extraction must surface the title"
+        );
+    }
+
+    /// Serialises tests that mutate `DEEPSEEK_CONFIG_PATH` so they don't
+    /// race against each other — env vars are process-global and the
+    /// settings loader inspects this var on every call.
+    static DS_CONFIG_PATH_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+    struct ConfigPathEnvGuard {
+        prior: Option<std::ffi::OsString>,
+    }
+    impl ConfigPathEnvGuard {
+        fn capture() -> Self {
+            Self {
+                prior: std::env::var_os("DEEPSEEK_CONFIG_PATH"),
+            }
+        }
+    }
+    impl Drop for ConfigPathEnvGuard {
+        fn drop(&mut self) {
+            // Safety: scoped to test process; reverts to the captured value.
+            match &self.prior {
+                Some(v) => unsafe { std::env::set_var("DEEPSEEK_CONFIG_PATH", v) },
+                None => unsafe { std::env::remove_var("DEEPSEEK_CONFIG_PATH") },
+            }
+        }
+    }
+
+    #[test]
+    fn read_pdf_routes_to_pdftotext_when_setting_opted_in() {
+        // Two concerns in one test: with `prefer_external_pdftotext = true`
+        // the dispatch must (a) call pdftotext when present, and (b) return
+        // the structured `binary_unavailable` response when pdftotext is
+        // missing — both branches were covered by the pre-v0.8.32 default.
+        // Sync test (calls `read_pdf` directly, not the async ReadFileTool
+        // wrapper) so the env-var lock is never held across an `.await`.
+        let _lock = DS_CONFIG_PATH_LOCK.lock().unwrap();
+        let _guard = ConfigPathEnvGuard::capture();
+
+        let tmp = tempdir().expect("tempdir");
+        let config_dir = tmp.path().join("cfg");
+        fs::create_dir_all(&config_dir).unwrap();
+        let config_path = config_dir.join("config.toml");
+        fs::write(&config_path, "").unwrap();
+        // The sibling settings.toml is what Settings::load() reads.
+        fs::write(
+            config_dir.join("settings.toml"),
+            "prefer_external_pdftotext = true\n",
+        )
+        .unwrap();
+        // Safety: serialised by DS_CONFIG_PATH_LOCK; reverted by guard.
+        unsafe {
+            std::env::set_var("DEEPSEEK_CONFIG_PATH", &config_path);
+        }
+
+        let pdf_path = tmp.path().join("doc.pdf");
+        fs::write(&pdf_path, b"%PDF-1.7\n%%EOF").unwrap();
+        let outcome = read_pdf(&pdf_path, None);
+
+        let pdftotext_present = Command::new("pdftotext")
            .arg("-v")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status()
-            .is_ok()
-        {
-            return;
+            .is_ok();
+
+        if pdftotext_present {
+            // pdftotext on a stub `%PDF-1.7\n%%EOF` cannot find a real
+            // trailer/xref table and fails with `exit 1`. That failure
+            // text mentions pdftotext explicitly — proof we routed
+            // through Poppler rather than falling back to the bundled
+            // extractor. Validate by inspecting the error message.
+            let err = outcome.expect_err("malformed PDF must surface the pdftotext error");
+            let msg = err.to_string();
+            assert!(
+                msg.contains("pdftotext"),
+                "error message must reference pdftotext; got {msg}"
+            );
+        } else {
+            let result = outcome.expect("binary_unavailable is a structured success, not an Err");
+            assert!(result.success);
+            assert!(result.content.contains("binary_unavailable"));
+            assert!(result.content.contains("pdftotext"));
+            assert!(
+                result.content.contains("prefer_external_pdftotext"),
+                "hint must reference the opt-in flag the user set"
+            );
        }
-        let tmp = tempdir().expect("tempdir");
-        let path = tmp.path().join("doc.pdf");
-        fs::write(&path, b"%PDF-1.7\n%%EOF").unwrap();
-        let ctx = ToolContext::new(tmp.path().to_path_buf());
-        let result = ReadFileTool
-            .execute(json!({"path": "doc.pdf"}), &ctx)
-            .await
-            .expect("structured response, not error");
-        assert!(result.success);
-        assert!(result.content.contains("binary_unavailable"));
-        assert!(result.content.contains("pdftotext"));
    }

    #[tokio::test]