From f94bedf1eaae3b6df13bc9dbdbabe7c8e31d64f1 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Tue, 12 May 2026 00:21:24 -0500 Subject: [PATCH] feat(tools): swap read_file PDF path to bundled pdf-extract; pdftotext now opt-in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before v0.8.32 the PDF branch in `read_file` shelled out to `pdftotext` (Poppler), which meant first-time users on hosts without it saw the tool return a `binary_unavailable` sentinel and had to install Poppler before the model could open a PDF. The `pdf-extract` crate already powered URL-fetched PDFs in `web_run`, so this change reuses it for the local PDF path too — extraction is now zero-dependency on every supported host. The `pages` parameter still filters by 1-indexed inclusive page range; both whole-file and per-page variants run with no system dependency. Users with column-heavy or complex-table PDFs (academic papers, financial filings) where `pdftotext -layout` still wins can opt into the external path with `prefer_external_pdftotext = true` in `~/.config/deepseek/settings.toml`. When set, the previous Poppler dispatch returns — including the `binary_unavailable` install hint when the binary is missing on PATH. `deepseek doctor` reframes `pdftotext` as optional and explains the opt-in instead of treating it as a missing dependency. Tests round-trip a checked-in academic PDF through the new path and verify the `pages` window still slices correctly. The pre-v0.8.32 "binary_unavailable on missing pdftotext" branch is exercised against the new opt-in setting via a serialised env-var guard. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 28 ++++ crates/tui/src/main.rs | 67 ++++++--- crates/tui/src/settings.rs | 23 +++ crates/tui/src/tools/file.rs | 272 ++++++++++++++++++++++++++++++----- 4 files changed, 338 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 170ca76a..7422c991 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +A "more useful tools" release in progress. v0.8.31 made the tool +surface reliable on every host; v0.8.32 expands it. Anchor is the +question every new contributor asks: "what does the model actually +have to work with?" — and the answer is now closer to "everything +you'd reach for from a shell, including the document formats the +real world uses." + +### Changed + +- **`read_file` now extracts PDFs in pure Rust by default — no + Poppler install required.** Before v0.8.32 the PDF path shelled + out to `pdftotext` (Poppler), so first-time users on hosts without + it saw `read_file` return a `binary_unavailable` sentinel and had + to `brew install poppler` / `apt install poppler-utils` before + the model could open a PDF. The bundled `pdf-extract` crate + (which already powered URL-fetched PDFs in `web_run`) now drives + the local `read_file` path too. The `pages` parameter still + filters by 1-indexed inclusive page range; both the whole-file + and per-page variants run with no system dependency. Users with + column-heavy or complex-table PDFs (academic papers, financial + filings) where `pdftotext -layout` still wins can opt into the + external path with `prefer_external_pdftotext = true` in + `~/.config/deepseek/settings.toml` — when set, the previous + Poppler dispatch (and the `binary_unavailable` install hint when + the binary is missing) returns. `deepseek doctor` now reports + `pdftotext` as optional and explains how to opt in instead of + framing it as a missing dependency. + ## [0.8.31] - 2026-05-12 A "tools that actually work" release. `code_execution` no longer diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index 409690de..5b4917a0 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -2127,26 +2127,59 @@ async fn run_doctor(config: &Config, workspace: &Path, config_path_override: Opt } } + // PDF reader: pure-Rust `pdf-extract` is the v0.8.32 default, so + // `pdftotext` is no longer required for `read_file` to handle PDFs. + // We still surface its presence (a) so users with column-heavy PDFs + // know they can opt in via `prefer_external_pdftotext = true`, and + // (b) so users who *did* opt in get a clean signal when the binary + // is missing rather than discovering it on the next PDF read. + let prefer_external = crate::settings::Settings::load() + .map(|s| s.prefer_external_pdftotext) + .unwrap_or(false); match crate::dependencies::resolve_pdftotext() { - Some(_) => println!( - " {} pdftotext: available → read_file extracts PDF text", - "✓".truecolor(aqua_r, aqua_g, aqua_b), - ), + Some(_) => { + if prefer_external { + println!( + " {} pdftotext: available → read_file routes PDFs through Poppler (prefer_external_pdftotext = true)", + "✓".truecolor(aqua_r, aqua_g, aqua_b), + ); + } else { + println!( + " {} pdftotext: available (optional — pure-Rust extractor is the default in v0.8.32)", + "✓".truecolor(aqua_r, aqua_g, aqua_b), + ); + println!( + " Set `prefer_external_pdftotext = true` in settings.toml for column-heavy PDFs." + ); + } + } None => { - println!( - " {} pdftotext: not found → read_file falls back to a not-supported error for .pdf files", - "!".truecolor(sky_r, sky_g, sky_b), - ); - println!(" (Text files still work without pdftotext; this only affects PDF reads.)"); - match std::env::consts::OS { - "macos" => println!(" Install via: brew install poppler"), - "linux" => { - println!(" Install via: sudo apt install poppler-utils (Debian/Ubuntu)") + if prefer_external { + println!( + " {} pdftotext: not found, but `prefer_external_pdftotext = true` is set → PDF reads will return `binary_unavailable`", + "✗".truecolor(red_r, red_g, red_b), + ); + println!( + " Either install Poppler or unset `prefer_external_pdftotext` to fall back to the bundled pure-Rust extractor." + ); + match std::env::consts::OS { + "macos" => println!(" Install via: brew install poppler"), + "linux" => println!( + " Install via: sudo apt install poppler-utils (Debian/Ubuntu)" + ), + "windows" => println!( + " Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/" + ), + _ => {} } - "windows" => println!( - " Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/" - ), - _ => {} + } else { + println!( + " {} pdftotext: not found (optional — pure-Rust extractor is the default in v0.8.32)", + "·".dimmed(), + ); + println!( + " Install Poppler only if you want to opt into pdftotext for column-heavy PDFs." + ); } } } diff --git a/crates/tui/src/settings.rs b/crates/tui/src/settings.rs index abefc0b4..18fec2ed 100644 --- a/crates/tui/src/settings.rs +++ b/crates/tui/src/settings.rs @@ -253,6 +253,17 @@ pub struct Settings { /// *do* support DEC 2026; it is purely a rendering-quality knob, /// not a correctness one. pub synchronized_output: String, + /// Prefer the external `pdftotext` binary (Poppler) over the bundled + /// pure-Rust `pdf-extract` extractor for PDF reads in `read_file`. + /// Pure-Rust extraction is the v0.8.32 default because it removes the + /// install-poppler-first hurdle most users hit, but `pdftotext -layout` + /// still wins for column-heavy or complex-table PDFs (academic papers + /// laid out in two columns, financial filings, etc.). Set to `true` to + /// route every PDF read through `pdftotext` instead — when the binary + /// is missing in that mode the tool returns the structured + /// `binary_unavailable` response with an install hint, matching the + /// pre-v0.8.32 behavior. + pub prefer_external_pdftotext: bool, } impl Default for Settings { @@ -292,6 +303,7 @@ impl Default for Settings { provider_models: None, status_indicator: "whale".to_string(), synchronized_output: "auto".to_string(), + prefer_external_pdftotext: false, } } } @@ -513,6 +525,9 @@ impl Settings { } self.synchronized_output = normalized.to_string(); } + "prefer_external_pdftotext" | "external_pdftotext" | "pdftotext" => { + self.prefer_external_pdftotext = parse_bool(value)?; + } "default_mode" | "mode" => { let normalized = normalize_mode(value); if !["agent", "plan", "yolo"].contains(&normalized) { @@ -629,6 +644,10 @@ impl Settings { " synchronized_output: {}", self.synchronized_output )); + lines.push(format!( + " prefer_external_pdftotext: {}", + self.prefer_external_pdftotext + )); lines.push(format!(" default_mode: {}", self.default_mode)); lines.push(format!( " sidebar_width: {}%", @@ -705,6 +724,10 @@ impl Settings { "synchronized_output", "DEC 2026 synchronized output: auto, on, off (set off if your terminal flickers)", ), + ( + "prefer_external_pdftotext", + "Route PDF reads through Poppler's pdftotext instead of the bundled pure-Rust extractor: on/off (default off)", + ), ("default_mode", "Default mode: agent, plan, yolo"), ("sidebar_width", "Sidebar width percentage: 10-50"), ( diff --git a/crates/tui/src/tools/file.rs b/crates/tui/src/tools/file.rs index d0d9ad55..9e4b51bd 100644 --- a/crates/tui/src/tools/file.rs +++ b/crates/tui/src/tools/file.rs @@ -26,7 +26,7 @@ impl ToolSpec for ReadFileTool { } fn description(&self) -> &'static str { - "Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available. Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files." + "Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via the bundled pure-Rust extractor (no Poppler install required). Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files." } fn input_schema(&self) -> Value { @@ -234,23 +234,87 @@ fn parse_pages_arg(spec: &str) -> Option<(u32, u32)> { } fn read_pdf(path: &Path, pages: Option<&str>) -> Result { - // Try pdftotext (from the poppler suite). Other extractors (mutool, - // pdfminer) could be added later behind the same dispatch. - let mut cmd = Command::new("pdftotext"); - cmd.arg("-layout"); - - if let Some(spec) = pages { - match parse_pages_arg(spec) { - Some((start, end)) => { - cmd.arg("-f").arg(start.to_string()); - cmd.arg("-l").arg(end.to_string()); - } + // Validate the `pages` spec once, up front, so both extractor paths + // surface the same error shape on bad input. + let page_range = match pages { + Some(spec) => match parse_pages_arg(spec) { + Some((start, end)) => Some((start, end)), None => { return Err(ToolError::invalid_input(format!( "invalid `pages` value `{spec}` (expected `N` or `N-M`, e.g. `1-5`)" ))); } + }, + None => None, + }; + + // Default to the bundled pure-Rust `pdf-extract` reader: it removes + // the install-poppler prerequisite that bit every new user, and the + // crate is already a workspace dep (used by `web_run`'s URL fetch + // path). Users with column-heavy / complex-table PDFs (academic + // papers, financial filings) can opt into the historical + // `pdftotext -layout` route by setting + // `prefer_external_pdftotext = true` in `~/.config/deepseek/settings.toml`. + let prefer_external = crate::settings::Settings::load() + .map(|s| s.prefer_external_pdftotext) + .unwrap_or(false); + + if prefer_external { + read_pdf_via_pdftotext(path, page_range) + } else { + read_pdf_via_pdf_extract(path, page_range) + } +} + +fn read_pdf_via_pdf_extract( + path: &Path, + page_range: Option<(u32, u32)>, +) -> Result { + let text = if let Some((start, end)) = page_range { + // Page-by-page extraction so we can slice the requested window + // without dragging every page through the caller's context. + // pdf-extract returns pages in document order; `start`/`end` are + // 1-indexed inclusive (validated above), so we convert to a + // 0-indexed half-open slice with bounds clamping. + let pages = pdf_extract::extract_text_by_pages(path).map_err(|e| { + ToolError::execution_failed(format!( + "pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)", + path.display() + )) + })?; + let total = pages.len(); + if total == 0 { + String::new() + } else { + let start_idx = (start as usize).saturating_sub(1).min(total); + let end_idx = (end as usize).min(total); + if start_idx >= end_idx { + String::new() + } else { + pages[start_idx..end_idx].join("\n") + } } + } else { + pdf_extract::extract_text(path).map_err(|e| { + ToolError::execution_failed(format!( + "pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)", + path.display() + )) + })? + }; + Ok(ToolResult::success(text)) +} + +fn read_pdf_via_pdftotext( + path: &Path, + page_range: Option<(u32, u32)>, +) -> Result { + let mut cmd = Command::new("pdftotext"); + cmd.arg("-layout"); + + if let Some((start, end)) = page_range { + cmd.arg("-f").arg(start.to_string()); + cmd.arg("-l").arg(end.to_string()); } cmd.arg(path).arg("-"); // output to stdout @@ -261,13 +325,15 @@ fn read_pdf(path: &Path, pages: Option<&str>) -> Result { let child = match cmd.spawn() { Ok(c) => c, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // Structured "binary unavailable" — caller knows what to suggest. + // Structured "binary unavailable" — only reachable when the + // user explicitly opted into the external path. Hints back at + // both the install command and the in-tree default. return ToolResult::json(&json!({ "type": "binary_unavailable", "path": path.display().to_string(), "kind": "pdf", - "reason": "pdftotext not installed", - "hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`)" + "reason": "pdftotext not installed (prefer_external_pdftotext = true in settings)", + "hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`) — or unset `prefer_external_pdftotext` to use the bundled pure-Rust extractor" })) .map_err(|e| { ToolError::execution_failed(format!("failed to serialize response: {e}")) @@ -850,32 +916,168 @@ mod tests { assert_eq!(parse_pages_arg("abc"), None); } + /// Sample PDF shipped with the repo for parity tests against the + /// pure-Rust extractor. 38 pages, born-digital LaTeX (arXiv 2512.24601). + /// Path is workspace-root-relative because the fixture lives outside + /// the tui crate. + const SAMPLE_PDF_PATH: &str = "../../docs/2512.24601v2.pdf"; + + fn sample_pdf_present() -> bool { + std::path::Path::new(SAMPLE_PDF_PATH).exists() + } + + #[test] + fn read_pdf_via_pdf_extract_finds_known_title() { + // Skip when the fixture isn't checked out (sparse clones, shallow + // worktrees). Local dev + CI both have it. + if !sample_pdf_present() { + // Fixture not present (sparse / shallow checkout). Silent + // skip — `cargo test` reports the same `ok` either way. + return; + } + let path = std::path::PathBuf::from(SAMPLE_PDF_PATH); + let result = read_pdf_via_pdf_extract(&path, None).expect("extract whole PDF"); + assert!(result.success); + assert!( + result.content.contains("Recursive Language Models"), + "pdf-extract should recover the document title; got prefix {:?}", + &result.content.chars().take(200).collect::() + ); + } + + #[test] + fn read_pdf_via_pdf_extract_respects_pages_window() { + if !sample_pdf_present() { + // Fixture not present (sparse / shallow checkout). Silent + // skip — `cargo test` reports the same `ok` either way. + return; + } + let path = std::path::PathBuf::from(SAMPLE_PDF_PATH); + let single = read_pdf_via_pdf_extract(&path, Some((1, 1))).expect("single page"); + let two = read_pdf_via_pdf_extract(&path, Some((1, 2))).expect("two pages"); + assert!(single.success); + assert!(two.success); + // A two-page slice must be at least as long as the one-page slice + // (most documents have non-trivial body text past page 1). + assert!( + two.content.len() >= single.content.len(), + "expected pages 1-2 ({} bytes) >= page 1 ({} bytes)", + two.content.len(), + single.content.len() + ); + // Title text lives on page 1 — must survive the window crop. + assert!(single.content.contains("Recursive Language Models")); + } + #[tokio::test] - async fn read_file_returns_binary_unavailable_when_pdftotext_missing() { - // We can't reliably remove pdftotext from $PATH in a test, but if - // it's missing on the runner this test exercises that branch. If - // it's installed, the test exits early — covered by the parse_pages - // and is_pdf tests above. - if Command::new("pdftotext") + async fn read_file_pdf_path_uses_pdf_extract_by_default() { + if !sample_pdf_present() { + // Fixture not present (sparse / shallow checkout). Silent + // skip — `cargo test` reports the same `ok` either way. + return; + } + // The fixture lives outside the tui crate, so we point ToolContext + // at the workspace root and read by relative path. This exercises + // the full ReadFileTool::execute → is_pdf → read_pdf dispatch on + // the bundled extractor (no pdftotext required on the test host). + let workspace = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../"); + let ctx = ToolContext::new(workspace); + let result = ReadFileTool + .execute(json!({"path": "docs/2512.24601v2.pdf", "pages": "1"}), &ctx) + .await + .expect("execute"); + assert!(result.success); + assert!( + result.content.contains("Recursive Language Models"), + "page-1 extraction must surface the title" + ); + } + + /// Serialises tests that mutate `DEEPSEEK_CONFIG_PATH` so they don't + /// race against each other — env vars are process-global and the + /// settings loader inspects this var on every call. + static DS_CONFIG_PATH_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + struct ConfigPathEnvGuard { + prior: Option, + } + impl ConfigPathEnvGuard { + fn capture() -> Self { + Self { + prior: std::env::var_os("DEEPSEEK_CONFIG_PATH"), + } + } + } + impl Drop for ConfigPathEnvGuard { + fn drop(&mut self) { + // Safety: scoped to test process; reverts to the captured value. + match &self.prior { + Some(v) => unsafe { std::env::set_var("DEEPSEEK_CONFIG_PATH", v) }, + None => unsafe { std::env::remove_var("DEEPSEEK_CONFIG_PATH") }, + } + } + } + + #[test] + fn read_pdf_routes_to_pdftotext_when_setting_opted_in() { + // Two concerns in one test: with `prefer_external_pdftotext = true` + // the dispatch must (a) call pdftotext when present, and (b) return + // the structured `binary_unavailable` response when pdftotext is + // missing — both branches were covered by the pre-v0.8.32 default. + // Sync test (calls `read_pdf` directly, not the async ReadFileTool + // wrapper) so the env-var lock is never held across an `.await`. + let _lock = DS_CONFIG_PATH_LOCK.lock().unwrap(); + let _guard = ConfigPathEnvGuard::capture(); + + let tmp = tempdir().expect("tempdir"); + let config_dir = tmp.path().join("cfg"); + fs::create_dir_all(&config_dir).unwrap(); + let config_path = config_dir.join("config.toml"); + fs::write(&config_path, "").unwrap(); + // The sibling settings.toml is what Settings::load() reads. + fs::write( + config_dir.join("settings.toml"), + "prefer_external_pdftotext = true\n", + ) + .unwrap(); + // Safety: serialised by DS_CONFIG_PATH_LOCK; reverted by guard. + unsafe { + std::env::set_var("DEEPSEEK_CONFIG_PATH", &config_path); + } + + let pdf_path = tmp.path().join("doc.pdf"); + fs::write(&pdf_path, b"%PDF-1.7\n%%EOF").unwrap(); + let outcome = read_pdf(&pdf_path, None); + + let pdftotext_present = Command::new("pdftotext") .arg("-v") .stdout(Stdio::null()) .stderr(Stdio::null()) .status() - .is_ok() - { - return; + .is_ok(); + + if pdftotext_present { + // pdftotext on a stub `%PDF-1.7\n%%EOF` cannot find a real + // trailer/xref table and fails with `exit 1`. That failure + // text mentions pdftotext explicitly — proof we routed + // through Poppler rather than falling back to the bundled + // extractor. Validate by inspecting the error message. + let err = outcome.expect_err("malformed PDF must surface the pdftotext error"); + let msg = err.to_string(); + assert!( + msg.contains("pdftotext"), + "error message must reference pdftotext; got {msg}" + ); + } else { + let result = outcome.expect("binary_unavailable is a structured success, not an Err"); + assert!(result.success); + assert!(result.content.contains("binary_unavailable")); + assert!(result.content.contains("pdftotext")); + assert!( + result.content.contains("prefer_external_pdftotext"), + "hint must reference the opt-in flag the user set" + ); } - let tmp = tempdir().expect("tempdir"); - let path = tmp.path().join("doc.pdf"); - fs::write(&path, b"%PDF-1.7\n%%EOF").unwrap(); - let ctx = ToolContext::new(tmp.path().to_path_buf()); - let result = ReadFileTool - .execute(json!({"path": "doc.pdf"}), &ctx) - .await - .expect("structured response, not error"); - assert!(result.success); - assert!(result.content.contains("binary_unavailable")); - assert!(result.content.contains("pdftotext")); } #[tokio::test]