diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff56835..3101f3b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,24 @@ real world uses." ### Added +- **`image_analyze` tool — vision-model image understanding** + (harvested from PR #1467 by **@MMMarcinho**). Sends an image + file to an OpenAI-compatible vision endpoint and returns the + model's natural-language description. Complements `image_ocr`: + use `image_ocr` for "what text is on this image", `image_analyze` + for "what is this image about". **Opt-in only** — gated by both + the `[features] vision_model = true` flag and a `[vision_model]` + config block specifying `model` (and optionally `api_key` / + `base_url`). Default configuration ships the feature flag at + `false`, so no install sees vision API calls fire without an + explicit two-step opt-in. **Billing**: each call hits the + configured vision endpoint (OpenAI by default), so usage is + billed by the third-party provider; calls are stateless (no + conversation context attached). Workspace-boundary check: the + tool rejects absolute paths and any `..` parent-dir traversal + before any base64 encoding or API call. To disable later: set + `[features] vision_model = false` (or omit `[vision_model]`). + Supports PNG, JPEG, GIF, WebP, and BMP inputs. - **`image_ocr` tool — extract text from images via local tesseract.** Lets the model OCR a screenshot, scanned receipt, whiteboard photo, or image-only PDF the user drops into the diff --git a/config.example.toml b/config.example.toml index 92829ef4..ebe710cb 100644 --- a/config.example.toml +++ b/config.example.toml @@ -304,6 +304,18 @@ web_search = true # enables canonical web.run plus the compatibility web_search apply_patch = true mcp = true exec_policy = true +# vision_model = false # enable vision model for image_analyze tool + +# ───────────────────────────────────────────────────────────────────────────────── +# Vision Model Configuration (optional) +# ───────────────────────────────────────────────────────────────────────────────── +# Uses an OpenAI-compatible vision model API for the `image_analyze` tool. +# api_key inherits from the main config if not specified. +# +# [vision_model] +# model = "gemini-3.1-flash-lite-preview" # Required: vision-capable model ID +# api_key = "YOUR_API_KEY" # Optional: defaults to main api_key +# base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" # Optional # ───────────────────────────────────────────────────────────────────────────────── # Retry Configuration diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 7ad1c00c..c007149f 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -905,6 +905,24 @@ pub struct Config { /// default threshold of 4 096 tokens applies and routing is active. #[serde(default)] pub workshop: Option, + + /// Vision model configuration for the `image_analyze` tool. + #[serde(default)] + pub vision_model: Option, +} + +/// Vision model configuration for the `image_analyze` tool. +/// Uses an OpenAI-compatible vision model API. +#[derive(Debug, Clone, Deserialize)] +pub struct VisionModelConfig { + /// Model identifier (e.g., "gemini-3.1-flash-lite-preview"). + pub model: String, + /// API key for the vision model. Inherits from main config if not specified. + #[serde(default)] + pub api_key: Option, + /// Base URL for the vision model API. Defaults to OpenAI. + #[serde(default)] + pub base_url: Option, } /// `[runtime_api]` table — knobs for the local HTTP/SSE daemon. @@ -1608,6 +1626,16 @@ impl Config { .unwrap_or(false) } + /// Return the configured vision model config, inheriting api_key from main config. + #[must_use] + pub fn vision_model_config(&self) -> Option { + let mut config = self.vision_model.clone()?; + if config.api_key.is_none() { + config.api_key = self.api_key.clone(); + } + Some(config) + } + #[must_use] pub fn project_context_pack_enabled(&self) -> bool { self.context.project_pack.unwrap_or(true) @@ -2605,6 +2633,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config { mcp_config_path: override_cfg.mcp_config_path.or(base.mcp_config_path), notes_path: override_cfg.notes_path.or(base.notes_path), memory_path: override_cfg.memory_path.or(base.memory_path), + vision_model: override_cfg.vision_model.or(base.vision_model), // #454: project's instructions array replaces user's array // wholesale. The typical "merge" pattern is for users who want // both — they list `~/global.md` inside the project array. diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index fd3414da..9d1509ad 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -146,6 +146,7 @@ pub struct EngineConfig { /// Path to the user memory file (#489). Always populated; only /// consulted when `memory_enabled` is `true`. pub memory_path: PathBuf, + pub vision_config: Option, pub goal_objective: Option, /// Resolved BCP-47 locale tag (e.g. `"en"`, `"zh-Hans"`, `"ja"`) /// for the `## Environment` block in the system prompt. The @@ -192,6 +193,7 @@ impl Default for EngineConfig { subagent_model_overrides: HashMap::new(), memory_enabled: false, memory_path: PathBuf::from("./memory.md"), + vision_config: None, strict_tool_mode: false, goal_objective: None, locale_tag: "en".to_string(), diff --git a/crates/tui/src/core/engine/tool_setup.rs b/crates/tui/src/core/engine/tool_setup.rs index fb6e40fd..cc1bfdf0 100644 --- a/crates/tui/src/core/engine/tool_setup.rs +++ b/crates/tui/src/core/engine/tool_setup.rs @@ -92,6 +92,13 @@ impl Engine { builder = builder.with_remember_tool(); } + // Register image_analyze tool when vision_model is configured and feature enabled. + if self.config.features.enabled(Feature::VisionModel) + && let Some(ref vision_config) = self.config.vision_config + { + builder = builder.with_vision_tools(vision_config.clone()); + } + // Register the `notify` tool unconditionally (#1322). It has no // side effects beyond a single terminal escape write and respects // the user's `[notifications].method` config (including `off`), diff --git a/crates/tui/src/features.rs b/crates/tui/src/features.rs index 0dbc4f81..24633cd1 100644 --- a/crates/tui/src/features.rs +++ b/crates/tui/src/features.rs @@ -44,6 +44,8 @@ pub enum Feature { Mcp, /// Enable execpolicy integration/tooling. ExecPolicy, + /// Enable vision model for image analysis. + VisionModel, } impl fmt::Display for Stage { @@ -207,6 +209,12 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::Experimental, default_enabled: true, }, + FeatureSpec { + id: Feature::VisionModel, + key: "vision_model", + stage: Stage::Experimental, + default_enabled: false, + }, ]; #[cfg(test)] diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index cf004a0b..b8579c3d 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -71,6 +71,7 @@ mod test_support; mod tools; mod tui; mod utils; +mod vision; mod working_set; mod workspace_trust; @@ -4442,6 +4443,7 @@ async fn run_exec_agent( subagent_model_overrides: config.subagent_model_overrides(), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), + vision_config: config.vision_model_config(), strict_tool_mode: config.strict_tool_mode.unwrap_or(false), goal_objective: None, locale_tag: crate::localization::resolve_locale( diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index dc1e7c0e..de659163 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -1959,6 +1959,7 @@ impl RuntimeThreadManager { subagent_model_overrides: self.config.subagent_model_overrides(), memory_enabled: self.config.memory_enabled(), memory_path: self.config.memory_path(), + vision_config: self.config.vision_model_config(), strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false), goal_objective: None, locale_tag: crate::localization::resolve_locale( diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index d25a0383..e355a9ef 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -616,6 +616,14 @@ impl ToolRegistryBuilder { .with_tool(Arc::new(WebRunTool)) } + /// Register the `image_analyze` vision tool. + /// Only registered when `[vision_model]` is configured in config.toml. + #[must_use] + pub fn with_vision_tools(self, config: crate::config::VisionModelConfig) -> Self { + use crate::vision::tools::ImageAnalyzeTool; + self.with_tool(Arc::new(ImageAnalyzeTool::new(config))) + } + /// Previously registered the OpenAI-style `multi_tool_use.parallel` /// meta-tool. DeepSeek-V4 has native parallel tool calls (multiple /// `tool_calls` entries in one assistant turn) and the meta-tool name diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index e73fc96e..a66b0a91 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -612,6 +612,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig { subagent_model_overrides: config.subagent_model_overrides(), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), + vision_config: config.vision_model_config(), strict_tool_mode: config.strict_tool_mode.unwrap_or(false), goal_objective: app.goal.goal_objective.clone(), locale_tag: app.ui_locale.tag().to_string(), diff --git a/crates/tui/src/vision/mod.rs b/crates/tui/src/vision/mod.rs new file mode 100644 index 00000000..a7d4bba1 --- /dev/null +++ b/crates/tui/src/vision/mod.rs @@ -0,0 +1,6 @@ +//! Vision model tool for image analysis. +//! +//! Provides the `image_analyze` tool that sends images to an +//! OpenAI-compatible vision model API and returns text descriptions. + +pub mod tools; diff --git a/crates/tui/src/vision/tools.rs b/crates/tui/src/vision/tools.rs new file mode 100644 index 00000000..808b5cbe --- /dev/null +++ b/crates/tui/src/vision/tools.rs @@ -0,0 +1,298 @@ +//! `image_analyze` tool — analyze images using a dedicated vision model. + +use std::path::Path; +use std::time::Duration; + +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64}; +use serde_json::{Value, json}; + +use crate::config::VisionModelConfig; +use crate::llm_client::{LlmError, RetryConfig, with_retry}; +use crate::tools::spec::{ + ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, required_str, +}; + +pub struct ImageAnalyzeTool { + config: VisionModelConfig, + client: reqwest::Client, +} + +impl ImageAnalyzeTool { + #[must_use] + pub fn new(config: VisionModelConfig) -> Self { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(120)) + .build() + .expect("Failed to build HTTP client"); + Self { config, client } + } + + async fn read_image_file(path: &Path) -> Result<(String, String), ToolError> { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| ToolError::execution_failed(format!("Failed to read image file: {e}")))?; + + let mime_type = Self::detect_mime_type(path)?; + let base64_data = BASE64.encode(&bytes); + Ok((base64_data, mime_type)) + } + + fn detect_mime_type(path: &Path) -> Result { + let extension = path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + + match extension.as_str() { + "png" => Ok("image/png".to_string()), + "jpg" | "jpeg" => Ok("image/jpeg".to_string()), + "gif" => Ok("image/gif".to_string()), + "webp" => Ok("image/webp".to_string()), + "bmp" => Ok("image/bmp".to_string()), + _ => Err(ToolError::execution_failed(format!( + "Unsupported image format: {extension}" + ))), + } + } + + fn base_url(&self) -> String { + self.config + .base_url + .clone() + .unwrap_or_else(|| "https://api.openai.com/v1".to_string()) + } + + fn api_key(&self) -> String { + self.config.api_key.clone().unwrap_or_default() + } +} + +#[async_trait] +impl ToolSpec for ImageAnalyzeTool { + fn name(&self) -> &str { + "image_analyze" + } + + fn description(&self) -> &str { + "Analyze an image using the configured vision model. \ + Supports PNG, JPEG, GIF, WebP, and BMP formats." + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "image_path": { + "type": "string", + "description": "Path to the image file to analyze" + }, + "prompt": { + "type": "string", + "description": "Optional prompt to guide the analysis." + } + }, + "required": ["image_path"] + }) + } + + fn capabilities(&self) -> Vec { + vec![ToolCapability::ReadOnly] + } + + async fn execute(&self, input: Value, context: &ToolContext) -> Result { + let image_path = required_str(&input, "image_path")?; + let prompt = input + .get("prompt") + .and_then(|v| v.as_str()) + .unwrap_or("Describe this image in detail."); + + let image_path_buf = Path::new(image_path); + if image_path_buf.is_absolute() + || image_path_buf + .components() + .any(|c| matches!(c, std::path::Component::ParentDir)) + { + return Err(ToolError::execution_failed( + "image_path must be a relative path within the workspace and cannot escape it.", + )); + } + let resolved_path = context.workspace.join(image_path_buf); + let (image_data, mime_type) = Self::read_image_file(&resolved_path).await?; + + let payload = json!({ + "model": self.config.model, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": format!("data:{};base64,{}", mime_type, image_data) + } + } + ] + } + ], + "max_tokens": 4096, + "temperature": 0.7 + }); + + let url = format!("{}/chat/completions", self.base_url()); + let api_key = self.api_key(); + + let retry_config = RetryConfig { + max_retries: 3, + initial_delay: 1.0, + max_delay: 30.0, + enabled: true, + ..Default::default() + }; + + let response = with_retry( + &retry_config, + || { + let client = self.client.clone(); + let url = url.clone(); + let api_key = api_key.clone(); + let payload = payload.clone(); + async move { + let response = client + .post(&url) + .header("Content-Type", "application/json") + .header("Authorization", format!("Bearer {}", api_key)) + .json(&payload) + .send() + .await + .map_err(|e| LlmError::from_reqwest(&e))?; + + let status = response.status(); + if !status.is_success() { + let error_text = response + .text() + .await + .unwrap_or_else(|_| "Unknown error".to_string()); + return Err(LlmError::from_http_response(status.as_u16(), &error_text)); + } + Ok(response) + } + }, + None, + ) + .await + .map_err(|e| ToolError::execution_failed(format!("Vision API request failed: {e}")))?; + + let json: Value = response + .json() + .await + .map_err(|e| ToolError::execution_failed(format!("Failed to parse response: {e}")))?; + + let content = json + .get("choices") + .and_then(|c| c.get(0)) + .and_then(|c| c.get("message")) + .and_then(|m| m.get("content")) + .and_then(|c| c.as_str()) + .unwrap_or("") + .to_string(); + + let model = json + .get("model") + .and_then(|m| m.as_str()) + .unwrap_or(&self.config.model) + .to_string(); + + let result = json!({ + "analysis": content, + "model": model, + }); + + ToolResult::json(&result) + .map_err(|e| ToolError::execution_failed(format!("Failed to serialize result: {e}"))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn fake_config() -> VisionModelConfig { + VisionModelConfig { + model: "test-vision-model".to_string(), + api_key: Some("test-key".to_string()), + base_url: Some("https://example.invalid/v1".to_string()), + } + } + + #[test] + fn tool_metadata_is_read_only_and_named_image_analyze() { + let tool = ImageAnalyzeTool::new(fake_config()); + assert_eq!(tool.name(), "image_analyze"); + assert!(tool.capabilities().contains(&ToolCapability::ReadOnly)); + } + + #[test] + fn mime_type_detection_covers_common_formats() { + for (ext, expected) in [ + ("png", "image/png"), + ("PNG", "image/png"), + ("jpg", "image/jpeg"), + ("jpeg", "image/jpeg"), + ("gif", "image/gif"), + ("webp", "image/webp"), + ("bmp", "image/bmp"), + ] { + let path = std::path::PathBuf::from(format!("test.{ext}")); + let mime = ImageAnalyzeTool::detect_mime_type(&path) + .unwrap_or_else(|_| panic!("must detect {ext}")); + assert_eq!(mime, expected); + } + } + + #[test] + fn mime_type_detection_rejects_unsupported_extension() { + let path = std::path::PathBuf::from("test.svg"); + let err = ImageAnalyzeTool::detect_mime_type(&path) + .expect_err("svg is intentionally out of scope for vision tool"); + assert!(err.to_string().contains("Unsupported image format")); + } + + #[tokio::test] + async fn execute_rejects_absolute_path() { + // Trust-boundary pin: image_path must stay inside the workspace + // — an absolute path or a `..`-traversing path must reject + // before any base64 / API call. + let tmp = tempdir().expect("tempdir"); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let tool = ImageAnalyzeTool::new(fake_config()); + let err = tool + .execute(json!({"image_path": "/etc/hosts"}), &ctx) + .await + .expect_err("absolute path must reject"); + assert!( + err.to_string() + .contains("relative path within the workspace"), + "error must call out the workspace boundary; got {err}" + ); + } + + #[tokio::test] + async fn execute_rejects_parent_dir_traversal() { + let tmp = tempdir().expect("tempdir"); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let tool = ImageAnalyzeTool::new(fake_config()); + let err = tool + .execute(json!({"image_path": "../escape.png"}), &ctx) + .await + .expect_err("`..`-traversal must reject"); + assert!( + err.to_string() + .contains("relative path within the workspace"), + "error must call out the workspace boundary; got {err}" + ); + } +}