feat(vision): add image_analyze tool gated behind vision_model feature flag

`image_analyze` sends an image file to an OpenAI-compatible vision endpoint and returns the model's natural-language description. Complements `image_ocr` (which uses local tesseract for "what text is on this image"); `image_analyze` is for "what is this image about" — visual reasoning the local OCR engine can't do. Trust-boundary scope: **two-step opt-in only**. 1. The feature is gated by `[features] vision_model = true` — default `false`. 2. The tool needs a `[vision_model]` config block specifying `model` (with optional `api_key` / `base_url` — falls back to the main config api_key + the OpenAI base URL). Without both, the tool isn't registered, so no install fires a vision API call without explicit user setup. Workspace boundary: the tool rejects absolute paths and any `..` parent-dir traversal before any base64 encoding or HTTP call. Stateless — each call sends only the requested image + optional prompt; no session, no conversation history attached. Supports PNG, JPEG, GIF, WebP, and BMP inputs. **Billing**: each call hits the configured vision endpoint (OpenAI by default — `gpt-4o-mini` / `gpt-4o` family commonly configured). Users with their own deployments (Gemini, Claude Vision via OpenAI shim, local llama.cpp) can point `base_url` / `api_key` at the alternative. Tests cover the tool metadata (read-only capability, correct name), MIME-type detection across the supported formats and the unsupported-format rejection path, and the workspace-boundary checks (absolute paths and `..` traversal both reject before any API call). Skipped from the upstream PR: the `.github/workflows/sync-cnb.yml` rewrite, which v0.8.31 already addressed with the concurrency/scoped-push refactor; landing the older form would regress that commit. Resolved a clippy::collapsible_if in tool_setup.rs (the `if feature && let Some(cfg) = ...` form) to satisfy the workspace -D warnings gate. 
Harvested from PR #1467 by @MMMarcinho Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 01:03:19 -05:00
parent bd603a271c
commit 3aaf0ad95e
12 changed files with 392 additions and 0 deletions
@@ -39,6 +39,24 @@ real world uses."

 ### Added

+- **`image_analyze` tool — vision-model image understanding**
+  (harvested from PR #1467 by **@MMMarcinho**). Sends an image
+  file to an OpenAI-compatible vision endpoint and returns the
+  model's natural-language description. Complements `image_ocr`:
+  use `image_ocr` for "what text is on this image", `image_analyze`
+  for "what is this image about". **Opt-in only** — gated by both
+  the `[features] vision_model = true` flag and a `[vision_model]`
+  config block specifying `model` (and optionally `api_key` /
+  `base_url`). Default configuration ships the feature flag at
+  `false`, so no install sees vision API calls fire without an
+  explicit two-step opt-in. **Billing**: each call hits the
+  configured vision endpoint (OpenAI by default), so usage is
+  billed by the third-party provider; calls are stateless (no
+  conversation context attached). Workspace-boundary check: the
+  tool rejects absolute paths and any `..` parent-dir traversal
+  before any base64 encoding or API call. To disable later: set
+  `[features] vision_model = false` (or omit `[vision_model]`).
+  Supports PNG, JPEG, GIF, WebP, and BMP inputs.
 - **`image_ocr` tool — extract text from images via local
  tesseract.** Lets the model OCR a screenshot, scanned receipt,
  whiteboard photo, or image-only PDF the user drops into the
@@ -304,6 +304,18 @@ web_search = true # enables canonical web.run plus the compatibility web_search
 apply_patch = true
 mcp = true
 exec_policy = true
+# vision_model = false  # set to true (and add a [vision_model] block) to enable the image_analyze tool
+
+# ─────────────────────────────────────────────────────────────────────────────────
+# Vision Model Configuration (optional)
+# ─────────────────────────────────────────────────────────────────────────────────
+# Uses an OpenAI-compatible vision model API for the `image_analyze` tool.
+# api_key inherits from the main config if not specified; base_url defaults to https://api.openai.com/v1.
+#
+# [vision_model]
+# model = "gemini-3.1-flash-lite-preview"  # Required: vision-capable model ID
+# api_key = "YOUR_API_KEY"                 # Optional: defaults to main api_key
+# base_url = "https://generativelanguage.googleapis.com/v1beta/openai"  # Optional; "/chat/completions" is appended

 # ─────────────────────────────────────────────────────────────────────────────────
 # Retry Configuration
@@ -905,6 +905,24 @@ pub struct Config {
    /// default threshold of 4 096 tokens applies and routing is active.
    #[serde(default)]
    pub workshop: Option<crate::tools::large_output_router::WorkshopConfig>,
+
+    /// Vision model configuration for the `image_analyze` tool.
+    #[serde(default)]
+    pub vision_model: Option<VisionModelConfig>,
+}
+
+/// Settings for the OpenAI-compatible vision endpoint used by the `image_analyze` tool.
+/// `model` is the only required key; `api_key` and `base_url` may be omitted.
+/// NOTE(review): the derived `Debug` output includes `api_key`; avoid logging this struct verbatim.
+#[derive(Debug, Clone, Deserialize)]
+pub struct VisionModelConfig {
+    /// Vision-capable model identifier (e.g., "gemini-3.1-flash-lite-preview").
+    pub model: String,
+    /// API key for the vision endpoint. When absent, `Config::vision_model_config`
+    /// fills it in from the main config's `api_key`.
+    #[serde(default)]
+    pub api_key: Option<String>,
+    /// Base URL of the vision API; the tool appends `/chat/completions` to it.
+    /// When absent, the tool uses `https://api.openai.com/v1`.
+    #[serde(default)]
+    pub base_url: Option<String>,
 }

 /// `[runtime_api]` table — knobs for the local HTTP/SSE daemon.
@@ -1608,6 +1626,16 @@ impl Config {
            .unwrap_or(false)
    }

+    /// Resolve the vision-model settings, if any were configured.
+    ///
+    /// Returns `None` when `vision_model` is unset. Otherwise returns a copy of
+    /// the settings in which a missing `api_key` has been filled in from the
+    /// main config's `api_key` (which may itself be unset).
+    #[must_use]
+    pub fn vision_model_config(&self) -> Option<VisionModelConfig> {
+        self.vision_model.clone().map(|mut vision| {
+            vision.api_key = vision.api_key.take().or_else(|| self.api_key.clone());
+            vision
+        })
+    }
+
    #[must_use]
    pub fn project_context_pack_enabled(&self) -> bool {
        self.context.project_pack.unwrap_or(true)
@@ -2605,6 +2633,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config {
        mcp_config_path: override_cfg.mcp_config_path.or(base.mcp_config_path),
        notes_path: override_cfg.notes_path.or(base.notes_path),
        memory_path: override_cfg.memory_path.or(base.memory_path),
+        vision_model: override_cfg.vision_model.or(base.vision_model),
        // #454: project's instructions array replaces user's array
        // wholesale. The typical "merge" pattern is for users who want
        // both — they list `~/global.md` inside the project array.
@@ -146,6 +146,7 @@ pub struct EngineConfig {
    /// Path to the user memory file (#489). Always populated; only
    /// consulted when `memory_enabled` is `true`.
    pub memory_path: PathBuf,
+    pub vision_config: Option<crate::config::VisionModelConfig>,
    pub goal_objective: Option<String>,
    /// Resolved BCP-47 locale tag (e.g. `"en"`, `"zh-Hans"`, `"ja"`)
    /// for the `## Environment` block in the system prompt. The
@@ -192,6 +193,7 @@ impl Default for EngineConfig {
            subagent_model_overrides: HashMap::new(),
            memory_enabled: false,
            memory_path: PathBuf::from("./memory.md"),
+            vision_config: None,
            strict_tool_mode: false,
            goal_objective: None,
            locale_tag: "en".to_string(),
@@ -92,6 +92,13 @@ impl Engine {
            builder = builder.with_remember_tool();
        }

+        // Register image_analyze tool when vision_model is configured and feature enabled.
+        if self.config.features.enabled(Feature::VisionModel)
+            && let Some(ref vision_config) = self.config.vision_config
+        {
+            builder = builder.with_vision_tools(vision_config.clone());
+        }
+
        // Register the `notify` tool unconditionally (#1322). It has no
        // side effects beyond a single terminal escape write and respects
        // the user's `[notifications].method` config (including `off`),
@@ -44,6 +44,8 @@ pub enum Feature {
    Mcp,
    /// Enable execpolicy integration/tooling.
    ExecPolicy,
+    /// Enable vision model for image analysis.
+    VisionModel,
 }

 impl fmt::Display for Stage {
@@ -207,6 +209,12 @@ pub const FEATURES: &[FeatureSpec] = &[
        stage: Stage::Experimental,
        default_enabled: true,
    },
+    FeatureSpec {
+        id: Feature::VisionModel,
+        key: "vision_model",
+        stage: Stage::Experimental,
+        default_enabled: false,
+    },
 ];

 #[cfg(test)]
@@ -71,6 +71,7 @@ mod test_support;
 mod tools;
 mod tui;
 mod utils;
+mod vision;
 mod working_set;
 mod workspace_trust;

@@ -4442,6 +4443,7 @@ async fn run_exec_agent(
        subagent_model_overrides: config.subagent_model_overrides(),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
+        vision_config: config.vision_model_config(),
        strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
        goal_objective: None,
        locale_tag: crate::localization::resolve_locale(
@@ -1959,6 +1959,7 @@ impl RuntimeThreadManager {
            subagent_model_overrides: self.config.subagent_model_overrides(),
            memory_enabled: self.config.memory_enabled(),
            memory_path: self.config.memory_path(),
+            vision_config: self.config.vision_model_config(),
            strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false),
            goal_objective: None,
            locale_tag: crate::localization::resolve_locale(
@@ -616,6 +616,14 @@ impl ToolRegistryBuilder {
            .with_tool(Arc::new(WebRunTool))
    }

+    /// Register the `image_analyze` vision tool, backed by the given settings.
+    ///
+    /// This method registers unconditionally; deciding *whether* to call it
+    /// (feature flag, presence of a vision-model config) is up to the caller.
+    #[must_use]
+    pub fn with_vision_tools(self, config: crate::config::VisionModelConfig) -> Self {
+        let tool = crate::vision::tools::ImageAnalyzeTool::new(config);
+        self.with_tool(Arc::new(tool))
+    }
+
    /// Previously registered the OpenAI-style `multi_tool_use.parallel`
    /// meta-tool. DeepSeek-V4 has native parallel tool calls (multiple
    /// `tool_calls` entries in one assistant turn) and the meta-tool name
@@ -612,6 +612,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
        subagent_model_overrides: config.subagent_model_overrides(),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
+        vision_config: config.vision_model_config(),
        strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
        goal_objective: app.goal.goal_objective.clone(),
        locale_tag: app.ui_locale.tag().to_string(),
@@ -0,0 +1,6 @@
+//! Vision model tool for image analysis.
+//!
+//! Provides the `image_analyze` tool that sends images to an
+//! OpenAI-compatible vision model API and returns text descriptions.
+
+pub mod tools;
@@ -0,0 +1,298 @@
+//! `image_analyze` tool — analyze images using a dedicated vision model.
+
+use std::path::Path;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
+use serde_json::{Value, json};
+
+use crate::config::VisionModelConfig;
+use crate::llm_client::{LlmError, RetryConfig, with_retry};
+use crate::tools::spec::{
+    ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, required_str,
+};
+
+/// Tool that forwards an image file to an OpenAI-compatible vision model.
+///
+/// Holds the vision-model settings and an HTTP client that is reused across calls.
+pub struct ImageAnalyzeTool {
+    config: VisionModelConfig,
+    client: reqwest::Client,
+}
+
+impl ImageAnalyzeTool {
+    /// Build the tool from vision-model settings.
+    ///
+    /// The HTTP client is created with a 120-second request timeout.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the HTTP client cannot be constructed.
+    #[must_use]
+    pub fn new(config: VisionModelConfig) -> Self {
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(120))
+            .build()
+            .expect("Failed to build HTTP client");
+        Self { config, client }
+    }
+
+    /// Read `path` and return `(base64_payload, mime_type)`.
+    ///
+    /// The MIME type is resolved from the extension *before* the file is read,
+    /// so an unsupported format is rejected without loading the file.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the extension is unsupported or the file cannot be read.
+    async fn read_image_file(path: &Path) -> Result<(String, String), ToolError> {
+        let mime_type = Self::detect_mime_type(path)?;
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| ToolError::execution_failed(format!("Failed to read image file: {e}")))?;
+        Ok((BASE64.encode(&bytes), mime_type))
+    }
+
+    /// Map the file extension (compared case-insensitively) to an image MIME type.
+    ///
+    /// # Errors
+    ///
+    /// Fails with "Unsupported image format" for any extension other than
+    /// png, jpg/jpeg, gif, webp, or bmp (including a missing extension).
+    fn detect_mime_type(path: &Path) -> Result<String, ToolError> {
+        let extension = path
+            .extension()
+            .and_then(|e| e.to_str())
+            .unwrap_or("")
+            .to_lowercase();
+
+        let mime = match extension.as_str() {
+            "png" => "image/png",
+            "jpg" | "jpeg" => "image/jpeg",
+            "gif" => "image/gif",
+            "webp" => "image/webp",
+            "bmp" => "image/bmp",
+            _ => {
+                return Err(ToolError::execution_failed(format!(
+                    "Unsupported image format: {extension}"
+                )));
+            }
+        };
+        Ok(mime.to_string())
+    }
+
+    /// Base URL of the vision API, without trailing slashes.
+    ///
+    /// Falls back to the OpenAI endpoint when none is configured. Trailing
+    /// slashes are stripped because callers append `/chat/completions`; a
+    /// configured value ending in `/` would otherwise yield `//chat/completions`.
+    fn base_url(&self) -> String {
+        self.config
+            .base_url
+            .as_deref()
+            .unwrap_or("https://api.openai.com/v1")
+            .trim_end_matches('/')
+            .to_string()
+    }
+
+    /// Configured API key, or an empty string when none is set.
+    fn api_key(&self) -> String {
+        self.config.api_key.clone().unwrap_or_default()
+    }
+}
+
+#[async_trait]
+impl ToolSpec for ImageAnalyzeTool {
+    /// Tool name exposed to the model.
+    fn name(&self) -> &str {
+        "image_analyze"
+    }
+
+    /// Short description of the tool and the formats it accepts.
+    fn description(&self) -> &str {
+        "Analyze an image using the configured vision model. \
+         Supports PNG, JPEG, GIF, WebP, and BMP formats."
+    }
+
+    /// JSON schema of the input: a required `image_path` and an optional `prompt`.
+    fn input_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "image_path": {
+                    "type": "string",
+                    "description": "Path to the image file to analyze"
+                },
+                "prompt": {
+                    "type": "string",
+                    "description": "Optional prompt to guide the analysis."
+                }
+            },
+            "required": ["image_path"]
+        })
+    }
+
+    /// The tool declares itself read-only.
+    fn capabilities(&self) -> Vec<ToolCapability> {
+        vec![ToolCapability::ReadOnly]
+    }
+
+    /// Send one image (plus prompt) to the vision endpoint and return its answer.
+    ///
+    /// Steps: check that `image_path` cannot leave `context.workspace`, read and
+    /// base64-encode the file, POST a chat-completions request (retried through
+    /// `with_retry`), and return `{"analysis": ..., "model": ...}`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `image_path` is missing, rooted, or contains `..`; when the
+    /// file is unreadable or has an unsupported extension; when the request
+    /// fails after retries; or when the response is not JSON or carries no
+    /// text content.
+    async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
+        let image_path = required_str(&input, "image_path")?;
+        let prompt = input
+            .get("prompt")
+            .and_then(|v| v.as_str())
+            .unwrap_or("Describe this image in detail.");
+
+        // Only plain relative components are allowed. `is_absolute()` alone is
+        // not enough: on Windows a rooted path (`\Windows`) or a drive-relative
+        // path (`C:foo`) is not "absolute", yet `join` would still replace the
+        // workspace prefix with it.
+        let relative = Path::new(image_path);
+        let escapes_workspace = relative.is_absolute()
+            || relative.components().any(|part| {
+                matches!(
+                    part,
+                    std::path::Component::ParentDir
+                        | std::path::Component::RootDir
+                        | std::path::Component::Prefix(_)
+                )
+            });
+        if escapes_workspace {
+            return Err(ToolError::execution_failed(
+                "image_path must be a relative path within the workspace and cannot escape it.",
+            ));
+        }
+        let resolved_path = context.workspace.join(relative);
+        let (image_data, mime_type) = Self::read_image_file(&resolved_path).await?;
+
+        let payload = json!({
+            "model": self.config.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": format!("data:{};base64,{}", mime_type, image_data)
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 4096,
+            "temperature": 0.7
+        });
+
+        let url = format!("{}/chat/completions", self.base_url());
+        let api_key = self.api_key();
+
+        let retry_config = RetryConfig {
+            max_retries: 3,
+            initial_delay: 1.0,
+            max_delay: 30.0,
+            enabled: true,
+            ..Default::default()
+        };
+
+        let response = with_retry(
+            &retry_config,
+            || {
+                // Each attempt gets its own owned copies so the future does not
+                // borrow from the enclosing scope.
+                let client = self.client.clone();
+                let url = url.clone();
+                let api_key = api_key.clone();
+                let payload = payload.clone();
+                async move {
+                    let response = client
+                        .post(&url)
+                        .header("Content-Type", "application/json")
+                        .header("Authorization", format!("Bearer {}", api_key))
+                        .json(&payload)
+                        .send()
+                        .await
+                        .map_err(|e| LlmError::from_reqwest(&e))?;
+
+                    let status = response.status();
+                    if !status.is_success() {
+                        let error_text = response
+                            .text()
+                            .await
+                            .unwrap_or_else(|_| "Unknown error".to_string());
+                        return Err(LlmError::from_http_response(status.as_u16(), &error_text));
+                    }
+                    Ok(response)
+                }
+            },
+            None,
+        )
+        .await
+        .map_err(|e| ToolError::execution_failed(format!("Vision API request failed: {e}")))?;
+
+        let body: Value = response
+            .json()
+            .await
+            .map_err(|e| ToolError::execution_failed(format!("Failed to parse response: {e}")))?;
+
+        // A reply without a text `content` is reported as an error rather than
+        // passed on as an empty analysis. A genuine empty string still passes.
+        let content = body
+            .get("choices")
+            .and_then(|c| c.get(0))
+            .and_then(|c| c.get("message"))
+            .and_then(|m| m.get("content"))
+            .and_then(|c| c.as_str())
+            .ok_or_else(|| {
+                ToolError::execution_failed(
+                    "Vision API response did not contain any text content",
+                )
+            })?
+            .to_string();
+
+        // Prefer the model name echoed by the endpoint; fall back to the configured one.
+        let model = body
+            .get("model")
+            .and_then(|m| m.as_str())
+            .unwrap_or(&self.config.model)
+            .to_string();
+
+        let result = json!({
+            "analysis": content,
+            "model": model,
+        });
+
+        ToolResult::json(&result)
+            .map_err(|e| ToolError::execution_failed(format!("Failed to serialize result: {e}")))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::tempdir;
+
+    /// Settings pointing at an unreachable host so no test can hit a real API.
+    fn fake_config() -> VisionModelConfig {
+        VisionModelConfig {
+            model: "test-vision-model".to_string(),
+            api_key: Some("test-key".to_string()),
+            base_url: Some("https://example.invalid/v1".to_string()),
+        }
+    }
+
+    /// Run the tool against a fresh temp workspace and assert that `image_path`
+    /// is rejected with the workspace-boundary error.
+    async fn assert_rejected_as_outside_workspace(image_path: &str) {
+        let tmp = tempdir().expect("tempdir");
+        let ctx = ToolContext::new(tmp.path().to_path_buf());
+        let tool = ImageAnalyzeTool::new(fake_config());
+        let err = tool
+            .execute(json!({"image_path": image_path}), &ctx)
+            .await
+            .expect_err("path outside the workspace must reject");
+        assert!(
+            err.to_string()
+                .contains("relative path within the workspace"),
+            "error must call out the workspace boundary; got {err}"
+        );
+    }
+
+    #[test]
+    fn tool_metadata_is_read_only_and_named_image_analyze() {
+        let tool = ImageAnalyzeTool::new(fake_config());
+        assert_eq!(tool.name(), "image_analyze");
+        assert!(tool.capabilities().contains(&ToolCapability::ReadOnly));
+    }
+
+    #[test]
+    fn mime_type_detection_covers_common_formats() {
+        for (ext, expected) in [
+            ("png", "image/png"),
+            ("PNG", "image/png"),
+            ("jpg", "image/jpeg"),
+            ("jpeg", "image/jpeg"),
+            ("gif", "image/gif"),
+            ("webp", "image/webp"),
+            ("bmp", "image/bmp"),
+        ] {
+            let path = std::path::PathBuf::from(format!("test.{ext}"));
+            let mime = ImageAnalyzeTool::detect_mime_type(&path)
+                .unwrap_or_else(|_| panic!("must detect {ext}"));
+            assert_eq!(mime, expected);
+        }
+    }
+
+    #[test]
+    fn mime_type_detection_rejects_unsupported_extension() {
+        let path = std::path::PathBuf::from("test.svg");
+        let err = ImageAnalyzeTool::detect_mime_type(&path)
+            .expect_err("svg is intentionally out of scope for vision tool");
+        assert!(err.to_string().contains("Unsupported image format"));
+    }
+
+    #[test]
+    fn mime_type_detection_rejects_missing_extension() {
+        let path = std::path::PathBuf::from("screenshot");
+        let err = ImageAnalyzeTool::detect_mime_type(&path)
+            .expect_err("a file without an extension has no known image format");
+        assert!(err.to_string().contains("Unsupported image format"));
+    }
+
+    #[tokio::test]
+    async fn execute_rejects_absolute_path() {
+        // Trust-boundary pin: image_path must stay inside the workspace
+        // — an absolute path or a `..`-traversing path must reject
+        // before any base64 / API call.
+        assert_rejected_as_outside_workspace("/etc/hosts").await;
+    }
+
+    #[tokio::test]
+    async fn execute_rejects_parent_dir_traversal() {
+        assert_rejected_as_outside_workspace("../escape.png").await;
+    }
+
+    #[tokio::test]
+    async fn execute_rejects_embedded_parent_dir_traversal() {
+        // `..` hidden behind a normal leading component must be caught too.
+        assert_rejected_as_outside_workspace("images/../../escape.png").await;
+    }
+}