fix(pdf): use extract_text_by_pages to avoid hang on full-PDF reads
`pdf_extract::extract_text` goes through an internal code path that can hang on PDFs with certain cross-reference tables or font encodings. The per-page `extract_text_by_pages` path does not trigger this hang, and its pages produce identical output once joined. When `pages` is not specified, route through `extract_text_by_pages` and join all pages instead of calling `extract_text`. Fixes #2641.
This commit is contained in:
@@ -361,12 +361,18 @@ fn read_pdf_via_pdf_extract(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
pdf_extract::extract_text(path).map_err(|e| {
|
||||
ToolError::execution_failed(format!(
|
||||
"pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
|
||||
path.display()
|
||||
))
|
||||
})?
|
||||
// Call extract_text_by_pages even when the caller wants every page:
|
||||
// extract_text uses an internal codepath that can hang on certain PDF
|
||||
// cross-reference tables or font encodings (#2641). The per-page path
|
||||
// avoids that hang and produces identical output when joined.
|
||||
pdf_extract::extract_text_by_pages(path)
|
||||
.map(|pages| pages.join("\n"))
|
||||
.map_err(|e| {
|
||||
ToolError::execution_failed(format!(
|
||||
"pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
|
||||
path.display()
|
||||
))
|
||||
})?
|
||||
};
|
||||
Ok(ToolResult::success(clean_pdf_text(&text)))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user