From bd603a271cf0927113e23e5e381671c4f8bd86d8 Mon Sep 17 00:00:00 2001 From: Hunter Bown Date: Tue, 12 May 2026 00:58:48 -0500 Subject: [PATCH] =?UTF-8?q?feat(tools):=20add=20image=5Focr=20tool=20?= =?UTF-8?q?=E2=80=94=20extract=20text=20from=20images=20via=20tesseract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets the model OCR a screenshot, scanned receipt, whiteboard photo, or image-only PDF the user drops into the workspace, without bouncing through `exec_shell` (which would mean an approval prompt plus the model having to remember tesseract's CLI surface). The tool spawns `tesseract <image> -` and returns the recognised text inline — no file is written. Capability is ReadOnly + parallel since OCR is a side-effect-free read. Registration is gated on `crate::dependencies::resolve_tesseract()` via the new `ToolRegistryBuilder::with_image_ocr_tools()` builder, hooked into `with_agent_tools` alongside `pandoc_convert`. When tesseract is missing the tool isn't advertised — same probe-then-decide pattern v0.8.31 introduced for Python. The execute path also late-resolves so a concurrent uninstall surfaces the install-tesseract hint rather than the raw spawn failure. `deepseek doctor`'s "Tool Dependencies" section reports tesseract status next to pandoc / node / python with platform-aware install hints. For non-default language packs or PSM modes the user can still drop into `exec_shell` with the full tesseract CLI surface. Tests check the metadata (ReadOnly + parallel, not WritesFiles), the missing-path rejection, and the happy-path OCR round-trip against `crates/tui/tests/fixtures/ocr_hello.png` — a 2 KB 300×100 grayscale PNG generated with ImageMagick rendering "HELLO OCR" in Helvetica. The happy-path test skips silently on hosts without tesseract (matching the catalog-build behaviour) and on hosts where the fixture isn't checked out (sparse / shallow clones). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 15 ++ crates/tui/src/dependencies.rs | 26 ++++ crates/tui/src/main.rs | 23 +++ crates/tui/src/tools/image_ocr.rs | 194 ++++++++++++++++++++++++ crates/tui/src/tools/mod.rs | 1 + crates/tui/src/tools/registry.rs | 18 ++- crates/tui/tests/fixtures/ocr_hello.png | Bin 0 -> 2023 bytes 7 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 crates/tui/src/tools/image_ocr.rs create mode 100644 crates/tui/tests/fixtures/ocr_hello.png diff --git a/CHANGELOG.md b/CHANGELOG.md index 90bf29dc..8ff56835 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,21 @@ real world uses." ### Added +- **`image_ocr` tool — extract text from images via local + tesseract.** Lets the model OCR a screenshot, scanned receipt, + whiteboard photo, or image-only PDF the user drops into the + workspace, without bouncing through `exec_shell`. Spawns + `tesseract <image> -` and returns the recognised text inline; + no file is written. PNG / JPEG / TIFF inputs supported. + Registration is gated on `dependencies::resolve_tesseract()`; + when tesseract is missing the tool isn't advertised, so the + model never tries to call an OCR engine the host can't run. + `deepseek doctor` reports tesseract status alongside the other + external-binary dependencies with platform-aware install hints + (`brew install tesseract` / `apt install tesseract-ocr` / + `winget install UB-Mannheim.TesseractOCR`). For non-default + language packs or PSM modes, users can still drop into + `exec_shell` with the full tesseract CLI surface. 
- **`pandoc_convert` tool — convert documents between formats via the local pandoc binary.** Pandoc is the Swiss Army knife the real world uses for moving prose around — Markdown to HTML, diff --git a/crates/tui/src/dependencies.rs b/crates/tui/src/dependencies.rs index d37b5f66..266bbd9f 100644 --- a/crates/tui/src/dependencies.rs +++ b/crates/tui/src/dependencies.rs @@ -117,6 +117,32 @@ pub fn resolve_pdftotext() -> Option { .clone() } +/// Resolve `tesseract` (OCR engine) once per process. Used by +/// the `image_ocr` tool to decide whether to register itself with +/// the model. Tesseract is the de-facto open-source OCR engine and +/// ships as a single binary on every platform we support, so the +/// candidate list is just `tesseract`. +pub fn resolve_tesseract() -> Option { + static CACHE: OnceLock> = OnceLock::new(); + CACHE + .get_or_init(|| { + if probe_executable("tesseract") { + tracing::info!( + target: "tool_dependencies", + "Resolved tesseract binary for image_ocr", + ); + Some("tesseract".to_string()) + } else { + tracing::warn!( + target: "tool_dependencies", + "tesseract binary not found; image_ocr tool will not be registered", + ); + None + } + }) + .clone() +} + /// Resolve `pandoc` (universal document converter) once per /// process. Used by the `pandoc_convert` tool to decide whether /// to register itself with the model. Pandoc is a single-binary diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index cf122ad2..cf004a0b 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -2180,6 +2180,29 @@ async fn run_doctor(config: &Config, workspace: &Path, config_path_override: Opt } } + match crate::dependencies::resolve_tesseract() { + Some(_) => println!( + " {} tesseract: present → image_ocr tool registered", + "✓".truecolor(aqua_r, aqua_g, aqua_b), + ), + None => { + println!(" {} tesseract: not found (optional)", "·".dimmed(),); + println!( + " image_ocr tool is NOT advertised to the model. 
Install tesseract to enable:" + ); + match std::env::consts::OS { + "macos" => println!(" brew install tesseract"), + "linux" => println!( + " sudo apt install tesseract-ocr (Debian/Ubuntu) — or your distro's equivalent" + ), + "windows" => println!(" winget install UB-Mannheim.TesseractOCR"), + other => { + println!(" install tesseract for {other} from tesseract-ocr.github.io") + } + } + } + } + // PDF reader: pure-Rust `pdf-extract` is the v0.8.32 default, so // `pdftotext` is no longer required for `read_file` to handle PDFs. // We still surface its presence (a) so users with column-heavy PDFs diff --git a/crates/tui/src/tools/image_ocr.rs b/crates/tui/src/tools/image_ocr.rs new file mode 100644 index 00000000..efcab3e6 --- /dev/null +++ b/crates/tui/src/tools/image_ocr.rs @@ -0,0 +1,194 @@ +//! `image_ocr` tool — extract text from an image via the local +//! `tesseract` OCR engine. +//! +//! Tesseract is the open-source workhorse for "convert this image +//! to text" — covers screenshots, scanned PDFs that arrived as +//! image-only blobs, handwriting-free documents in 100+ languages, +//! receipts, whiteboard photos, etc. Surfacing it as a +//! model-callable tool means the model can OCR an asset the user +//! drops into the workspace without bouncing through `exec_shell`. +//! +//! Registration is gated by [`crate::dependencies::resolve_tesseract`] +//! (see [`crate::tools::registry::ToolRegistryBuilder::with_image_ocr_tools`]). +//! When tesseract isn't installed the tool simply doesn't appear in +//! the catalog, so the model never sees a binary it can't actually +//! use. + +use std::process::{Command, Stdio}; + +use async_trait::async_trait; +use serde_json::{Value, json}; + +use super::spec::{ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, required_str}; + +/// Tool implementing `image_ocr`. Spawns `tesseract -` and +/// returns the extracted text on success. 
+pub struct ImageOcrTool; + +#[async_trait] +impl ToolSpec for ImageOcrTool { + fn name(&self) -> &'static str { + "image_ocr" + } + + fn description(&self) -> &'static str { + "Extract text from an image (PNG, JPEG, or TIFF) via local tesseract OCR. Use this for screenshots, scanned receipts/whiteboards, image-only PDFs, or any visual that contains text the model needs to read. Returns the extracted text inline; no file is written. Use `exec_shell` only when you need a non-default OCR language pack or PSM mode." + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path to the image file (relative to workspace or absolute). PNG / JPEG / TIFF supported." + } + }, + "required": ["path"] + }) + } + + fn capabilities(&self) -> Vec { + vec![ToolCapability::ReadOnly, ToolCapability::Sandboxable] + } + + fn supports_parallel(&self) -> bool { + true + } + + async fn execute(&self, input: Value, context: &ToolContext) -> Result { + let path_str = required_str(&input, "path")?; + let image_path = context.resolve_path(path_str)?; + if !image_path.exists() { + return Err(ToolError::execution_failed(format!( + "image_ocr: source path does not exist: {}", + image_path.display() + ))); + } + + // Late-resolve tesseract too. Registration gated on + // resolve_tesseract(), but a concurrent uninstall between + // catalog build and the model's call should surface a clear + // error rather than the raw spawn failure. + let tesseract = crate::dependencies::resolve_tesseract().ok_or_else(|| { + ToolError::execution_failed( + "image_ocr: tesseract binary not found on PATH. \ + Install tesseract (macOS: `brew install tesseract`; \ + Debian/Ubuntu: `apt install tesseract-ocr`; \ + Windows: `winget install UB-Mannheim.TesseractOCR`) \ + and restart deepseek-tui.", + ) + })?; + + // `tesseract -` writes the recognised text to stdout. 
+ // The trailing `-` is documented and produces text mode by + // default (no `.txt` file written to disk). + let mut cmd = Command::new(&tesseract); + cmd.arg(&image_path); + cmd.arg("-"); + cmd.stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd + .output() + .map_err(|e| ToolError::execution_failed(format!("failed to launch tesseract: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + return Err(ToolError::execution_failed(format!( + "tesseract failed (exit {:?}): {stderr}", + output.status.code() + ))); + } + + // Tesseract appends a trailing form-feed on some platforms; + // trim trailing whitespace so the result reads cleanly inline. + let text = String::from_utf8_lossy(&output.stdout) + .trim_end() + .to_string(); + Ok(ToolResult::success(text)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::tempdir; + + /// Tesseract availability — happy-path tests skip when missing so + /// CI environments without OCR still pass the suite. + fn tesseract_present() -> bool { + crate::dependencies::resolve_tesseract().is_some() + } + + /// Resolve the checked-in OCR fixture path. The image lives at + /// `crates/tui/tests/fixtures/ocr_hello.png` (300x100 grayscale, + /// "HELLO OCR" rendered in Helvetica) and is committed for the + /// happy-path round-trip below. 
+ fn ocr_fixture_path() -> std::path::PathBuf { + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/ocr_hello.png") + } + + #[test] + fn tool_metadata_marks_image_ocr_read_only_and_parallel() { + let tool = ImageOcrTool; + assert_eq!(tool.name(), "image_ocr"); + assert!(tool.supports_parallel()); + let caps = tool.capabilities(); + assert!(caps.contains(&ToolCapability::ReadOnly)); + assert!(!caps.contains(&ToolCapability::WritesFiles)); + } + + #[tokio::test] + async fn image_ocr_rejects_missing_path() { + let tmp = tempdir().expect("tempdir"); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let err = ImageOcrTool + .execute(json!({"path": "definitely-not-here.png"}), &ctx) + .await + .expect_err("nonexistent path must reject before tesseract spawn"); + let msg = err.to_string(); + assert!( + msg.contains("does not exist"), + "error must call out missing path; got {msg}" + ); + } + + #[tokio::test] + async fn image_ocr_recovers_hello_from_fixture_image() { + if !tesseract_present() { + // Tool wouldn't be registered without tesseract — mirror + // that here so the suite stays green on CI images that + // intentionally omit OCR tooling. + return; + } + let fixture = ocr_fixture_path(); + if !fixture.exists() { + // Fixture not committed (sparse / shallow checkout). Skip + // silently rather than failing the suite. + return; + } + let tmp = tempdir().expect("tempdir"); + // Stage the fixture under the workspace so the path resolver + // accepts the relative input — keeps the test independent of + // the workspace boundary check inside `resolve_path`. + let staged = tmp.path().join("ocr_hello.png"); + fs::copy(&fixture, &staged).unwrap(); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let result = ImageOcrTool + .execute(json!({"path": "ocr_hello.png"}), &ctx) + .await + .expect("execute"); + assert!(result.success); + // Tesseract reliably recovers "HELLO OCR" from the rendered + // PNG; allow either spacing variant. 
+ let normalised = result.content.to_uppercase(); + assert!( + normalised.contains("HELLO") && normalised.contains("OCR"), + "expected OCR to recover HELLO OCR; got {:?}", + result.content + ); + } +} diff --git a/crates/tui/src/tools/mod.rs b/crates/tui/src/tools/mod.rs index c5e510d1..ce2ac86e 100644 --- a/crates/tui/src/tools/mod.rs +++ b/crates/tui/src/tools/mod.rs @@ -23,6 +23,7 @@ pub mod fim; pub mod git; pub mod git_history; pub mod github; +pub mod image_ocr; pub mod js_execution; pub mod large_output_router; pub mod notify; diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index 707a557c..d25a0383 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -490,6 +490,21 @@ impl ToolRegistryBuilder { } } + /// Include the `image_ocr` tool only when the `tesseract` + /// binary is present on this host. Probe-then-decide mirroring + /// `with_pandoc_tools` — when tesseract is missing the tool + /// stays out of the catalog, so the model never tries to call + /// an OCR engine the host can't actually run. 
+ #[must_use] + pub fn with_image_ocr_tools(self) -> Self { + if crate::dependencies::resolve_tesseract().is_some() { + use super::image_ocr::ImageOcrTool; + self.with_tool(Arc::new(ImageOcrTool)) + } else { + self + } + } + /// Include the `load_skill` tool (#434) so the model can pull a /// SKILL.md body + companion file list into context with one /// call instead of `read_file` + `list_dir` against the path @@ -748,7 +763,8 @@ impl ToolRegistryBuilder { .with_tool_result_retrieval_tool() .with_runtime_task_tools() .with_revert_turn_tool() - .with_pandoc_tools(); + .with_pandoc_tools() + .with_image_ocr_tools(); if allow_shell { builder.with_shell_tools() diff --git a/crates/tui/tests/fixtures/ocr_hello.png b/crates/tui/tests/fixtures/ocr_hello.png new file mode 100644 index 0000000000000000000000000000000000000000..277ac8bd179674e02be143b2ff1ba2ee8b36b0a6 GIT binary patch literal 2023 zcmbuAdpHyNAIHC=nfr=zFVvZ%Hf?0nnM;zz+(T*;xlJ`^O_`8l9Yss0QPOgY6hg!1 zawzN+#>t73TOpQexty^G#|ppw^?RP*ALsc!=Q;m;F7MCh^?d&Oyjeb8E*h!^ssI2D zqN}qn01%YCj!>2dpilKRk|&dJS6@#6&YJ zamvnk=eBuQpFR%#gId?H_2a|JlnGQ0mmCv7okHx+ z5sqD2g%K(~ybb0gg)9`rEgK-*ng2+3EUjn;dp@S6_XMfWWoK}VroF8mTwZsbS}A$` z+F@sYKH9Ud1}=*I-4!Qb33QuBa?2h2tUUu^8C9gvzNYR>-J5~p^p<@MWHmQe*2TUF zG@GTD?^imbvBxNNXlS|MPAiSaX%FIkgX@=EC;5k0?RwC5$x>$d87Yat#`qPiR^#!TZ9`V^4FSapGn6Fh?g&4%qNrx4W zU5HO4yJ+!`07i~O0tZ%@G-8{`oa5ta^B*;|F2)`pOx0s=3p!%i`YlC#dnA7~sBNmC zPK7!7QRDGL?Yp2YZ_;rh2F%*^<4aDF^aMiMa^+a+BkwT?rqd_(=gRT|&S+TPN?idqr=v{@g1l4e+IfiJ1cg=MG5I%tuTm0JB?)|SjbB7hztu0yKii0 z0J(o`=S-TFi6o$6Fa|uyL)^Fnie-1xJfvS&*NV;|@zCg7NQwj+-^;_68~XHrFt34$ zPP-swI#NAGlXC!sxk$)~GxxdJe`vymx%qpwT`yt2?-vP3(t`a#4`U;8G0O}uNGg00 zGBdW3dFzJOL(pJ*w5O(KapO{I(X=$bu87YfZp~bTZ5)3&K=x+xZ)lw`kv5tYmc;uF z(uB+_aR&I=UoyBAisx(lt~XwooOj;RTj#E>_VYM*fPGe&kZq%B-W>6E6y7_2Xd+c7R;x)fCvdZ3-g=C2Z;%9$3Gr^MZLd` zj*hzJLOtgd5VKjgm86d-Q9^`{>8*W@&$*RTbc}Q`aq)ZHbrb7VpExZDaUiP zH86sX8#?JQaQpUhd{m0(Ep4Zi1Q4{RkmcOu7-Ko>arQKUK9Xajy_^w*_ 
zaO?BV53AOu1w~ayevV2?jHuJBzR+22q41Z}@t91M#IwTQbad7`%gOD>t5)o;)9PkX z2*Kt%4w*`kG#h(MiqHP@F524ZN`|S<$sFx36S4s2Q#b?rykKCDHYRyV{jt*am4?RU z-o&}Jf%=TSpNMp!Hu1{Loo~Z3ivN=P8QO@SZ_$%e;!O(%DxcUf|oPb IDdeaB0EaDuK>z>% literal 0 HcmV?d00001