feat(tools): swap read_file PDF path to bundled pdf-extract; pdftotext now opt-in

Before v0.8.32 the PDF branch in `read_file` shelled out to `pdftotext`
(Poppler), which meant first-time users on hosts without it saw the
tool return a `binary_unavailable` sentinel and had to install Poppler
before the model could open a PDF. The `pdf-extract` crate already
powered URL-fetched PDFs in `web_run`, so this change reuses it for
the local PDF path too — extraction is now zero-dependency on every
supported host. The `pages` parameter still filters by 1-indexed
inclusive page range; both whole-file and per-page variants run with
no system dependency.

Users with column-heavy or complex-table PDFs (academic papers,
financial filings) where `pdftotext -layout` still wins can opt into
the external path with `prefer_external_pdftotext = true` in
`~/.config/deepseek/settings.toml`. When set, the previous Poppler
dispatch returns — including the `binary_unavailable` install hint
when the binary is missing on PATH. `deepseek doctor` reframes
`pdftotext` as optional and explains the opt-in instead of treating
it as a missing dependency.

Tests round-trip a checked-in academic PDF through the new path and
verify the `pages` window still slices correctly. The pre-v0.8.32
"binary_unavailable on missing pdftotext" branch is exercised against
the new opt-in setting via a serialised env-var guard.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hunter Bown
2026-05-12 00:21:24 -05:00
parent c188cade88
commit f94bedf1ea
4 changed files with 338 additions and 52 deletions
+50 -17
View File
@@ -2127,26 +2127,59 @@ async fn run_doctor(config: &Config, workspace: &Path, config_path_override: Opt
}
}
// PDF reader: pure-Rust `pdf-extract` is the v0.8.32 default, so
// `pdftotext` is no longer required for `read_file` to handle PDFs.
// We still surface its presence (a) so users with column-heavy PDFs
// know they can opt in via `prefer_external_pdftotext = true`, and
// (b) so users who *did* opt in get a clean signal when the binary
// is missing rather than discovering it on the next PDF read.
let prefer_external = crate::settings::Settings::load()
.map(|s| s.prefer_external_pdftotext)
.unwrap_or(false);
match crate::dependencies::resolve_pdftotext() {
Some(_) => println!(
" {} pdftotext: available → read_file extracts PDF text",
"".truecolor(aqua_r, aqua_g, aqua_b),
),
Some(_) => {
if prefer_external {
println!(
" {} pdftotext: available → read_file routes PDFs through Poppler (prefer_external_pdftotext = true)",
"".truecolor(aqua_r, aqua_g, aqua_b),
);
} else {
println!(
" {} pdftotext: available (optional — pure-Rust extractor is the default in v0.8.32)",
"".truecolor(aqua_r, aqua_g, aqua_b),
);
println!(
" Set `prefer_external_pdftotext = true` in settings.toml for column-heavy PDFs."
);
}
}
None => {
println!(
" {} pdftotext: not found → read_file falls back to a not-supported error for .pdf files",
"!".truecolor(sky_r, sky_g, sky_b),
);
println!(" (Text files still work without pdftotext; this only affects PDF reads.)");
match std::env::consts::OS {
"macos" => println!(" Install via: brew install poppler"),
"linux" => {
println!(" Install via: sudo apt install poppler-utils (Debian/Ubuntu)")
if prefer_external {
println!(
" {} pdftotext: not found, but `prefer_external_pdftotext = true` is set → PDF reads will return `binary_unavailable`",
"".truecolor(red_r, red_g, red_b),
);
println!(
" Either install Poppler or unset `prefer_external_pdftotext` to fall back to the bundled pure-Rust extractor."
);
match std::env::consts::OS {
"macos" => println!(" Install via: brew install poppler"),
"linux" => println!(
" Install via: sudo apt install poppler-utils (Debian/Ubuntu)"
),
"windows" => println!(
" Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/"
),
_ => {}
}
"windows" => println!(
" Install Poppler for Windows from https://blog.alivate.com.au/poppler-windows/"
),
_ => {}
} else {
println!(
" {} pdftotext: not found (optional — pure-Rust extractor is the default in v0.8.32)",
"·".dimmed(),
);
println!(
" Install Poppler only if you want to opt into pdftotext for column-heavy PDFs."
);
}
}
}
+23
View File
@@ -253,6 +253,17 @@ pub struct Settings {
/// *do* support DEC 2026; it is purely a rendering-quality knob,
/// not a correctness one.
pub synchronized_output: String,
/// Prefer the external `pdftotext` binary (Poppler) over the bundled
/// pure-Rust `pdf-extract` extractor for PDF reads in `read_file`.
/// Pure-Rust extraction is the v0.8.32 default because it removes the
/// install-poppler-first hurdle most users hit, but `pdftotext -layout`
/// still wins for column-heavy or complex-table PDFs (academic papers
/// laid out in two columns, financial filings, etc.). Set to `true` to
/// route every PDF read through `pdftotext` instead — when the binary
/// is missing in that mode the tool returns the structured
/// `binary_unavailable` response with an install hint, matching the
/// pre-v0.8.32 behavior.
pub prefer_external_pdftotext: bool,
}
impl Default for Settings {
@@ -292,6 +303,7 @@ impl Default for Settings {
provider_models: None,
status_indicator: "whale".to_string(),
synchronized_output: "auto".to_string(),
prefer_external_pdftotext: false,
}
}
}
@@ -513,6 +525,9 @@ impl Settings {
}
self.synchronized_output = normalized.to_string();
}
"prefer_external_pdftotext" | "external_pdftotext" | "pdftotext" => {
self.prefer_external_pdftotext = parse_bool(value)?;
}
"default_mode" | "mode" => {
let normalized = normalize_mode(value);
if !["agent", "plan", "yolo"].contains(&normalized) {
@@ -629,6 +644,10 @@ impl Settings {
" synchronized_output: {}",
self.synchronized_output
));
lines.push(format!(
" prefer_external_pdftotext: {}",
self.prefer_external_pdftotext
));
lines.push(format!(" default_mode: {}", self.default_mode));
lines.push(format!(
" sidebar_width: {}%",
@@ -705,6 +724,10 @@ impl Settings {
"synchronized_output",
"DEC 2026 synchronized output: auto, on, off (set off if your terminal flickers)",
),
(
"prefer_external_pdftotext",
"Route PDF reads through Poppler's pdftotext instead of the bundled pure-Rust extractor: on/off (default off)",
),
("default_mode", "Default mode: agent, plan, yolo"),
("sidebar_width", "Sidebar width percentage: 10-50"),
(
+237 -35
View File
@@ -26,7 +26,7 @@ impl ToolSpec for ReadFileTool {
}
fn description(&self) -> &'static str {
"Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available. Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files."
"Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via the bundled pure-Rust extractor (no Poppler install required). Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files."
}
fn input_schema(&self) -> Value {
@@ -234,23 +234,87 @@ fn parse_pages_arg(spec: &str) -> Option<(u32, u32)> {
}
fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
// Try pdftotext (from the poppler suite). Other extractors (mutool,
// pdfminer) could be added later behind the same dispatch.
let mut cmd = Command::new("pdftotext");
cmd.arg("-layout");
if let Some(spec) = pages {
match parse_pages_arg(spec) {
Some((start, end)) => {
cmd.arg("-f").arg(start.to_string());
cmd.arg("-l").arg(end.to_string());
}
// Validate the `pages` spec once, up front, so both extractor paths
// surface the same error shape on bad input.
let page_range = match pages {
Some(spec) => match parse_pages_arg(spec) {
Some((start, end)) => Some((start, end)),
None => {
return Err(ToolError::invalid_input(format!(
"invalid `pages` value `{spec}` (expected `N` or `N-M`, e.g. `1-5`)"
)));
}
},
None => None,
};
// Default to the bundled pure-Rust `pdf-extract` reader: it removes
// the install-poppler prerequisite that bit every new user, and the
// crate is already a workspace dep (used by `web_run`'s URL fetch
// path). Users with column-heavy / complex-table PDFs (academic
// papers, financial filings) can opt into the historical
// `pdftotext -layout` route by setting
// `prefer_external_pdftotext = true` in `~/.config/deepseek/settings.toml`.
let prefer_external = crate::settings::Settings::load()
.map(|s| s.prefer_external_pdftotext)
.unwrap_or(false);
if prefer_external {
read_pdf_via_pdftotext(path, page_range)
} else {
read_pdf_via_pdf_extract(path, page_range)
}
}
fn read_pdf_via_pdf_extract(
path: &Path,
page_range: Option<(u32, u32)>,
) -> Result<ToolResult, ToolError> {
let text = if let Some((start, end)) = page_range {
// Page-by-page extraction so we can slice the requested window
// without dragging every page through the caller's context.
// pdf-extract returns pages in document order; `start`/`end` are
// 1-indexed inclusive (validated above), so we convert to a
// 0-indexed half-open slice with bounds clamping.
let pages = pdf_extract::extract_text_by_pages(path).map_err(|e| {
ToolError::execution_failed(format!(
"pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
path.display()
))
})?;
let total = pages.len();
if total == 0 {
String::new()
} else {
let start_idx = (start as usize).saturating_sub(1).min(total);
let end_idx = (end as usize).min(total);
if start_idx >= end_idx {
String::new()
} else {
pages[start_idx..end_idx].join("\n")
}
}
} else {
pdf_extract::extract_text(path).map_err(|e| {
ToolError::execution_failed(format!(
"pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
path.display()
))
})?
};
Ok(ToolResult::success(text))
}
fn read_pdf_via_pdftotext(
path: &Path,
page_range: Option<(u32, u32)>,
) -> Result<ToolResult, ToolError> {
let mut cmd = Command::new("pdftotext");
cmd.arg("-layout");
if let Some((start, end)) = page_range {
cmd.arg("-f").arg(start.to_string());
cmd.arg("-l").arg(end.to_string());
}
cmd.arg(path).arg("-"); // output to stdout
@@ -261,13 +325,15 @@ fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
let child = match cmd.spawn() {
Ok(c) => c,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// Structured "binary unavailable" — caller knows what to suggest.
// Structured "binary unavailable" — only reachable when the
// user explicitly opted into the external path. Hints back at
// both the install command and the in-tree default.
return ToolResult::json(&json!({
"type": "binary_unavailable",
"path": path.display().to_string(),
"kind": "pdf",
"reason": "pdftotext not installed",
"hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`)"
"reason": "pdftotext not installed (prefer_external_pdftotext = true in settings)",
"hint": "install poppler (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`) — or unset `prefer_external_pdftotext` to use the bundled pure-Rust extractor"
}))
.map_err(|e| {
ToolError::execution_failed(format!("failed to serialize response: {e}"))
@@ -850,32 +916,168 @@ mod tests {
assert_eq!(parse_pages_arg("abc"), None);
}
/// Sample PDF shipped with the repo for parity tests against the
/// pure-Rust extractor. 38 pages, born-digital LaTeX (arXiv 2512.24601).
/// Path is workspace-root-relative because the fixture lives outside
/// the tui crate.
const SAMPLE_PDF_PATH: &str = "../../docs/2512.24601v2.pdf";
fn sample_pdf_present() -> bool {
std::path::Path::new(SAMPLE_PDF_PATH).exists()
}
#[test]
fn read_pdf_via_pdf_extract_finds_known_title() {
// Skip when the fixture isn't checked out (sparse clones, shallow
// worktrees). Local dev + CI both have it.
if !sample_pdf_present() {
// Fixture not present (sparse / shallow checkout). Silent
// skip — `cargo test` reports the same `ok` either way.
return;
}
let path = std::path::PathBuf::from(SAMPLE_PDF_PATH);
let result = read_pdf_via_pdf_extract(&path, None).expect("extract whole PDF");
assert!(result.success);
assert!(
result.content.contains("Recursive Language Models"),
"pdf-extract should recover the document title; got prefix {:?}",
&result.content.chars().take(200).collect::<String>()
);
}
#[test]
fn read_pdf_via_pdf_extract_respects_pages_window() {
if !sample_pdf_present() {
// Fixture not present (sparse / shallow checkout). Silent
// skip — `cargo test` reports the same `ok` either way.
return;
}
let path = std::path::PathBuf::from(SAMPLE_PDF_PATH);
let single = read_pdf_via_pdf_extract(&path, Some((1, 1))).expect("single page");
let two = read_pdf_via_pdf_extract(&path, Some((1, 2))).expect("two pages");
assert!(single.success);
assert!(two.success);
// A two-page slice must be at least as long as the one-page slice
// (most documents have non-trivial body text past page 1).
assert!(
two.content.len() >= single.content.len(),
"expected pages 1-2 ({} bytes) >= page 1 ({} bytes)",
two.content.len(),
single.content.len()
);
// Title text lives on page 1 — must survive the window crop.
assert!(single.content.contains("Recursive Language Models"));
}
#[tokio::test]
async fn read_file_returns_binary_unavailable_when_pdftotext_missing() {
// We can't reliably remove pdftotext from $PATH in a test, but if
// it's missing on the runner this test exercises that branch. If
// it's installed, the test exits early — covered by the parse_pages
// and is_pdf tests above.
if Command::new("pdftotext")
async fn read_file_pdf_path_uses_pdf_extract_by_default() {
if !sample_pdf_present() {
// Fixture not present (sparse / shallow checkout). Silent
// skip — `cargo test` reports the same `ok` either way.
return;
}
// The fixture lives outside the tui crate, so we point ToolContext
// at the workspace root and read by relative path. This exercises
// the full ReadFileTool::execute → is_pdf → read_pdf dispatch on
// the bundled extractor (no pdftotext required on the test host).
let workspace = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../");
let ctx = ToolContext::new(workspace);
let result = ReadFileTool
.execute(json!({"path": "docs/2512.24601v2.pdf", "pages": "1"}), &ctx)
.await
.expect("execute");
assert!(result.success);
assert!(
result.content.contains("Recursive Language Models"),
"page-1 extraction must surface the title"
);
}
/// Serialises tests that mutate `DEEPSEEK_CONFIG_PATH` so they don't
/// race against each other — env vars are process-global and the
/// settings loader inspects this var on every call.
static DS_CONFIG_PATH_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
struct ConfigPathEnvGuard {
prior: Option<std::ffi::OsString>,
}
impl ConfigPathEnvGuard {
fn capture() -> Self {
Self {
prior: std::env::var_os("DEEPSEEK_CONFIG_PATH"),
}
}
}
impl Drop for ConfigPathEnvGuard {
fn drop(&mut self) {
// Safety: scoped to test process; reverts to the captured value.
match &self.prior {
Some(v) => unsafe { std::env::set_var("DEEPSEEK_CONFIG_PATH", v) },
None => unsafe { std::env::remove_var("DEEPSEEK_CONFIG_PATH") },
}
}
}
#[test]
fn read_pdf_routes_to_pdftotext_when_setting_opted_in() {
// Two concerns in one test: with `prefer_external_pdftotext = true`
// the dispatch must (a) call pdftotext when present, and (b) return
// the structured `binary_unavailable` response when pdftotext is
// missing — both branches were covered by the pre-v0.8.32 default.
// Sync test (calls `read_pdf` directly, not the async ReadFileTool
// wrapper) so the env-var lock is never held across an `.await`.
let _lock = DS_CONFIG_PATH_LOCK.lock().unwrap();
let _guard = ConfigPathEnvGuard::capture();
let tmp = tempdir().expect("tempdir");
let config_dir = tmp.path().join("cfg");
fs::create_dir_all(&config_dir).unwrap();
let config_path = config_dir.join("config.toml");
fs::write(&config_path, "").unwrap();
// The sibling settings.toml is what Settings::load() reads.
fs::write(
config_dir.join("settings.toml"),
"prefer_external_pdftotext = true\n",
)
.unwrap();
// Safety: serialised by DS_CONFIG_PATH_LOCK; reverted by guard.
unsafe {
std::env::set_var("DEEPSEEK_CONFIG_PATH", &config_path);
}
let pdf_path = tmp.path().join("doc.pdf");
fs::write(&pdf_path, b"%PDF-1.7\n%%EOF").unwrap();
let outcome = read_pdf(&pdf_path, None);
let pdftotext_present = Command::new("pdftotext")
.arg("-v")
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
.is_ok()
{
return;
.is_ok();
if pdftotext_present {
// pdftotext on a stub `%PDF-1.7\n%%EOF` cannot find a real
// trailer/xref table and fails with `exit 1`. That failure
// text mentions pdftotext explicitly — proof we routed
// through Poppler rather than falling back to the bundled
// extractor. Validate by inspecting the error message.
let err = outcome.expect_err("malformed PDF must surface the pdftotext error");
let msg = err.to_string();
assert!(
msg.contains("pdftotext"),
"error message must reference pdftotext; got {msg}"
);
} else {
let result = outcome.expect("binary_unavailable is a structured success, not an Err");
assert!(result.success);
assert!(result.content.contains("binary_unavailable"));
assert!(result.content.contains("pdftotext"));
assert!(
result.content.contains("prefer_external_pdftotext"),
"hint must reference the opt-in flag the user set"
);
}
let tmp = tempdir().expect("tempdir");
let path = tmp.path().join("doc.pdf");
fs::write(&path, b"%PDF-1.7\n%%EOF").unwrap();
let ctx = ToolContext::new(tmp.path().to_path_buf());
let result = ReadFileTool
.execute(json!({"path": "doc.pdf"}), &ctx)
.await
.expect("structured response, not error");
assert!(result.success);
assert!(result.content.contains("binary_unavailable"));
assert!(result.content.contains("pdftotext"));
}
#[tokio::test]