feat: add Xiaomi MiMo speech support

2026-06-02 09:00:14 +08:00
parent 139b542d3f
commit 8532dcc49e
19 changed files with 1397 additions and 12 deletions
@@ -323,6 +323,7 @@ codewhale --provider openrouter --model minimax/minimax-m3
 # Xiaomi MiMo
 codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_KEY"
 codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
+codewhale --provider xiaomi-mimo speech "Hello from MiMo" --model tts -o hello.wav

 # Novita
 codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
@@ -269,6 +269,7 @@ codewhale --provider openrouter --model qwen/qwen3.7-max
 # Xiaomi MiMo
 codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_MIMO_API_KEY"
 codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
+codewhale --provider xiaomi-mimo speech "你好，MiMo" --model tts -o hello.wav

 # Novita
 codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
@@ -45,6 +45,9 @@ base_url = "https://api.deepseek.com/beta"
 #   deepseek-ai/deepseek-v4-flash   — default AtlasCloud model ID
 #   deepseek-reasoner               — default Wanjie Ark model ID
 #   mimo-v2.5-pro                   — default Xiaomi MiMo model ID
+#   mimo-v2.5-tts                   — Xiaomi MiMo speech/TTS model ID
+#   mimo-v2.5-tts-voicedesign       — Xiaomi MiMo voice-design TTS model ID
+#   mimo-v2.5-tts-voiceclone        — Xiaomi MiMo voice-clone TTS model ID
 #   accounts/fireworks/models/deepseek-v4-pro — Fireworks AI Pro model ID
 #   deepseek-ai/DeepSeek-V4-Pro    — SiliconFlow hosted Pro model ID
 #   deepseek-ai/DeepSeek-V4-Flash  — SiliconFlow hosted Flash model ID
@@ -120,6 +123,11 @@ memory_path = "~/.codewhale/memory.md"
 # Parsed but currently unused (reserved for future versions):
 # tools_file = "./tools.json"

+# Xiaomi MiMo speech/TTS defaults. The output directory can also be set with
+# XIAOMI_MIMO_SPEECH_OUTPUT_DIR / MIMO_SPEECH_OUTPUT_DIR / XIAOMIMIMO_SPEECH_OUTPUT_DIR.
+[speech]
+# output_dir = "./speech"
+
 # Native tool catalog controls (#2076). By default only the core tool surface
 # is loaded into the model context; less common native tools are discoverable
 # through ToolSearch and loaded on first use.
@@ -301,7 +309,9 @@ max_subagents = 10 # optional (1-20)
 [providers.xiaomi_mimo]
 # api_key = "YOUR_XIAOMI_KEY"
 # base_url = "https://api.xiaomimimo.com/v1"
-# model = "mimo-v2.5-pro"
+# model = "mimo-v2.5-pro"              # chat/reasoning
+# TTS aliases are also accepted by `codewhale speech`: tts, voice-design, voice-clone
+# TTS model IDs: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone, mimo-v2-tts

 # Novita AI-hosted inference (https://novita.ai)
 [providers.novita]
@@ -307,6 +307,46 @@ impl Default for ModelRegistry {
                supports_tools: true,
                supports_reasoning: true,
            },
+            ModelInfo {
+                id: "mimo-v2.5-tts".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "tts".to_string(),
+                    "speech".to_string(),
+                    "mimo-tts".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2.5-tts-voicedesign".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "voicedesign".to_string(),
+                    "voice-design".to_string(),
+                    "mimo-voice-design".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2.5-tts-voiceclone".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "voiceclone".to_string(),
+                    "voice-clone".to_string(),
+                    "mimo-voice-clone".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2-tts".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec!["mimo-v2-speech".to_string()],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
            ModelInfo {
                id: "deepseek/deepseek-v4-pro".to_string(),
                provider: ProviderKind::Novita,
@@ -707,6 +747,22 @@ mod tests {
        assert!(resolved.resolved.supports_reasoning);
    }

+    // With the Xiaomi MiMo provider hinted, the short TTS aliases must resolve
+    // to the canonical TTS model IDs, and the resolved TTS entry must report
+    // neither tool nor reasoning support.
+    #[test]
+    fn xiaomi_mimo_tts_aliases_resolve_when_provider_hinted() {
+        let registry = ModelRegistry::default();
+        let resolved = registry.resolve(Some("tts"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.provider, ProviderKind::XiaomiMimo);
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts");
+        assert!(!resolved.resolved.supports_tools);
+        assert!(!resolved.resolved.supports_reasoning);
+
+        // Hyphenated and unhyphenated spellings are separate registry aliases.
+        let resolved = registry.resolve(Some("voice-design"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voicedesign");
+
+        let resolved = registry.resolve(Some("voiceclone"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voiceclone");
+    }
+
    #[test]
    fn wanjie_ark_default_uses_reasoner_model_id() {
        let registry = ModelRegistry::default();
@@ -133,6 +133,9 @@ enum Commands {
    Doctor(TuiPassthroughArgs),
    /// List live DeepSeek API models via the TUI binary.
    Models(TuiPassthroughArgs),
+    /// Generate speech audio with Xiaomi MiMo TTS models via the TUI binary.
+    #[command(visible_alias = "tts")]
+    Speech(TuiPassthroughArgs),
    /// List saved TUI sessions.
    Sessions(TuiPassthroughArgs),
    /// Resume a saved TUI session.
@@ -510,6 +513,10 @@ fn run() -> Result<()> {
            let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
            delegate_to_tui(&cli, &resolved_runtime, tui_args("models", args))
        }
+        Some(Commands::Speech(args)) => {
+            let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
+            delegate_to_tui(&cli, &resolved_runtime, tui_args("speech", args))
+        }
        Some(Commands::Sessions(args)) => {
            let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
            delegate_to_tui(&cli, &resolved_runtime, tui_args("sessions", args))
@@ -44,6 +44,10 @@ const OPENROUTER_TENCENT_HY3_PREVIEW_MODEL: &str = "tencent/hy3-preview";
 const OPENROUTER_XIAOMI_MIMO_V2_5_PRO_MODEL: &str = "xiaomi/mimo-v2.5-pro";
 const OPENROUTER_XIAOMI_MIMO_V2_5_MODEL: &str = "xiaomi/mimo-v2.5";
 const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
+const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
+const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
+const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
+const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
 const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
 const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
 const DEFAULT_FIREWORKS_MODEL: &str = "accounts/fireworks/models/deepseek-v4-pro";
@@ -1447,6 +1451,12 @@ pub fn load_project_config(workspace: &Path) -> Option<ConfigToml> {
 }

 fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
+    if matches!(provider, ProviderKind::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return canonical.to_string();
+    }
+
    if matches!(
        provider,
        ProviderKind::Atlascloud
@@ -1521,6 +1531,38 @@ fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
    }
 }

+/// Map a Xiaomi MiMo model name or shorthand to its canonical model ID.
+///
+/// Matching ignores surrounding whitespace and ASCII case, and treats `_` and
+/// spaces as `-`. Returns `None` for any name that is not in the table below.
+///
+/// Because `_` is folded to `-`, the `v2-5` spelling (e.g. `mimo_v2_5_tts`) is
+/// accepted for the TTS models just as it is for the chat models.
+/// `mimo-v2-speech` is accepted because the model registry lists it as an
+/// alias of `mimo-v2-tts`.
+fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
+    let normalized = model.trim().to_ascii_lowercase();
+    let normalized = normalized.replace(['_', ' '], "-");
+    match normalized.as_str() {
+        "mimo"
+        | DEFAULT_XIAOMI_MIMO_MODEL
+        | "mimo-v2-5-pro"
+        | "xiaomi-mimo-v2.5-pro"
+        | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
+        "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
+            Some("mimo-v2.5")
+        }
+        "mimo-tts" | "mimo-v25-tts" | "mimo-v2-5-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
+            Some(XIAOMI_MIMO_TTS_MODEL)
+        }
+        "mimo-tts-voicedesign"
+        | "mimo-voice-design"
+        | "mimo-v25-tts-voicedesign"
+        | "mimo-v2-5-tts-voicedesign"
+        | "mimo-v2.5-tts-voicedesign"
+        | "voicedesign"
+        | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
+        "mimo-tts-voiceclone"
+        | "mimo-voice-clone"
+        | "mimo-v25-tts-voiceclone"
+        | "mimo-v2-5-tts-voiceclone"
+        | "mimo-v2.5-tts-voiceclone"
+        | "voiceclone"
+        | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
+        "mimo-v2-tts" | "mimo-v2-speech" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
+        _ => None,
+    }
+}
+
 fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
    let normalized = model.trim().to_ascii_lowercase();
    let normalized = normalized.replace(['_', ' '], "-");
@@ -3571,6 +3613,26 @@ unix_socket_path = "/tmp/cw-hooks.sock"
        assert_eq!(resolved.model, DEFAULT_XIAOMI_MIMO_MODEL);
    }

+    // TTS shorthands normalize to canonical Xiaomi MiMo model IDs, while a
+    // name that is not a known alias is returned unchanged.
+    #[test]
+    fn xiaomi_mimo_tts_aliases_resolve_to_canonical_models() {
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "tts"),
+            "mimo-v2.5-tts"
+        );
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "voice-design"),
+            "mimo-v2.5-tts-voicedesign"
+        );
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "voiceclone"),
+            "mimo-v2.5-tts-voiceclone"
+        );
+        // Unknown IDs must pass through untouched.
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "custom-mimo-model"),
+            "custom-mimo-model"
+        );
+    }
+
    #[test]
    fn novita_provider_defaults_to_canonical_endpoint_and_model() {
        let _lock = env_lock();
@@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex as StdMutex, OnceLock};
 use std::time::{Duration, Instant};

 use anyhow::{Context, Result};
+use base64::{Engine as _, engine::general_purpose};
 use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderName, HeaderValue};
 use serde::{Deserialize, Serialize};
 use serde_json::{Value, json};
@@ -119,6 +120,31 @@ pub struct AvailableModel {
    pub created: Option<u64>,
 }

+/// Request payload for Xiaomi MiMo speech synthesis models.
+///
+/// MiMo-V2.5-TTS / MiMo-V2-TTS use the OpenAI-compatible
+/// `/v1/chat/completions` endpoint: the optional style/voice instruction is
+/// sent as a `user` message, while the text to synthesize is sent as an
+/// `assistant` message.
+#[derive(Debug, Clone)]
+pub struct SpeechSynthesisRequest {
+    /// Model ID or alias. Trimmed and mapped to its wire ID before sending;
+    /// must not be blank.
+    pub model: String,
+    /// Text to speak. Trimmed before sending; must not be blank.
+    pub text: String,
+    /// Optional style / voice-design prompt. A blank value is treated as
+    /// absent.
+    pub instruction: Option<String>,
+    /// Requested audio format. Trimmed and lowercased; a blank value falls
+    /// back to `wav`.
+    pub audio_format: String,
+    /// Optional voice value forwarded as `audio.voice`. A blank value is
+    /// treated as absent.
+    pub voice: Option<String>,
+}
+
+/// Decoded speech synthesis result.
+#[derive(Debug, Clone)]
+pub struct SpeechSynthesisResponse {
+    /// Wire model ID that was sent in the request.
+    pub model: String,
+    /// Normalized audio format that was requested (not read back from the
+    /// response).
+    pub audio_format: String,
+    /// Audio payload after base64 decoding.
+    pub audio_bytes: Vec<u8>,
+    /// Transcript string from the response's audio object, when present.
+    pub transcript: Option<String>,
+    /// Voice value that was sent in the request, if any.
+    pub voice: Option<String>,
+}
+
 /// Client for DeepSeek's OpenAI-compatible APIs.
 #[must_use]
 pub struct DeepSeekClient {
@@ -407,6 +433,49 @@ pub(super) fn api_url(base_url: &str, path: &str) -> String {
    format!("{}/{}", versioned.trim_end_matches('/'), path)
 }

+/// Canonicalize a requested audio format name.
+///
+/// Surrounding whitespace is dropped and the name is ASCII-lowercased. A blank
+/// request falls back to `"wav"`.
+fn normalize_audio_format(format: &str) -> String {
+    match format.trim() {
+        "" => String::from("wav"),
+        requested => requested.to_ascii_lowercase(),
+    }
+}
+
+/// Extract the decoded audio bytes and optional transcript from a speech
+/// synthesis response.
+///
+/// The audio object is looked up at `choices[0].message.audio`, then
+/// `choices[0].delta.audio`, then top-level `audio`. Its `data` string may be
+/// bare base64 or carry a prefix ending in `,` (for example a `data:` URI);
+/// only the part after the first comma is decoded.
+///
+/// # Errors
+///
+/// Fails when no audio object is found, when `audio.data` is missing or not a
+/// string, or when the payload is not valid standard base64.
+fn parse_speech_audio_response(payload: &Value) -> Result<(Vec<u8>, Option<String>)> {
+    let first_choice = payload
+        .get("choices")
+        .and_then(Value::as_array)
+        .and_then(|entries| entries.first());
+    let choice_audio = first_choice.and_then(|entry| {
+        let from_message = entry.get("message").and_then(|msg| msg.get("audio"));
+        from_message.or_else(|| entry.get("delta").and_then(|chunk| chunk.get("audio")))
+    });
+    let Some(audio) = choice_audio.or_else(|| payload.get("audio")) else {
+        anyhow::bail!("Speech synthesis response did not include choices[0].message.audio");
+    };
+
+    let Some(raw) = audio.get("data").and_then(Value::as_str) else {
+        anyhow::bail!("Speech synthesis response did not include audio.data");
+    };
+    let raw = raw.trim();
+    // Drop everything up to and including the first comma, if there is one.
+    let encoded = match raw.split_once(',') {
+        Some((_, rest)) => rest.trim(),
+        None => raw,
+    };
+    let audio_bytes = general_purpose::STANDARD
+        .decode(encoded)
+        .context("Failed to decode speech audio base64 data")?;
+
+    let transcript = audio
+        .get("transcript")
+        .and_then(Value::as_str)
+        .map(str::to_owned);
+
+    Ok((audio_bytes, transcript))
+}
+
 // === DeepSeekClient ===

 /// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value
@@ -645,6 +714,104 @@ impl DeepSeekClient {
        parse_models_response(&response_text)
    }

+    /// Generate speech with Xiaomi MiMo TTS models.
+    ///
+    /// The spoken text is placed in an `assistant` message because Xiaomi
+    /// MiMo's TTS chat-completions surface expects that shape. The optional
+    /// `instruction` is a `user` message that controls style, voice design, or
+    /// voice-clone performance and is not spoken verbatim.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the active provider is not Xiaomi MiMo, when the model or
+    /// text is blank, when a `voicedesign` model has no instruction, when a
+    /// `voiceclone` model has no voice, when the HTTP request fails or returns
+    /// a non-success status, when the response body cannot be read, or when
+    /// the body is not JSON containing decodable audio.
+    pub async fn synthesize_speech(
+        &self,
+        request: SpeechSynthesisRequest,
+    ) -> Result<SpeechSynthesisResponse> {
+        if self.api_provider != crate::config::ApiProvider::XiaomiMimo {
+            anyhow::bail!(
+                "speech synthesis requires provider 'xiaomi-mimo' (current: {})",
+                self.api_provider.as_str()
+            );
+        }
+
+        let model = request.model.trim().to_string();
+        if model.is_empty() {
+            anyhow::bail!("Speech model cannot be empty");
+        }
+        let text = request.text.trim().to_string();
+        if text.is_empty() {
+            anyhow::bail!("Speech text cannot be empty");
+        }
+
+        let audio_format = normalize_audio_format(&request.audio_format);
+        // Resolve aliases first so the capability checks below see the wire ID.
+        let model = wire_model_for_provider(self.api_provider, &model);
+        let model_lower = model.to_ascii_lowercase();
+        // Blank instruction / voice values are treated as not provided.
+        let instruction = request
+            .instruction
+            .as_deref()
+            .map(str::trim)
+            .filter(|value| !value.is_empty());
+        let voice = request
+            .voice
+            .as_deref()
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+
+        if model_lower.contains("voicedesign") && instruction.is_none() {
+            anyhow::bail!(
+                "Model '{model}' requires a voice design prompt. Pass --voice-prompt or --instruction."
+            );
+        }
+        if model_lower.contains("voiceclone") && voice.is_none() {
+            anyhow::bail!(
+                "Model '{model}' requires cloned voice data. Pass --clone-voice <mp3|wav> or --voice <data-uri>."
+            );
+        }
+
+        let mut audio = json!({
+            "format": audio_format.clone(),
+        });
+        if let Some(voice) = voice.as_deref() {
+            audio["voice"] = json!(voice);
+        }
+
+        let body = json!({
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": instruction.unwrap_or(""),
+                },
+                {
+                    "role": "assistant",
+                    "content": text,
+                }
+            ],
+            "audio": audio,
+        });
+
+        let url = api_url(&self.base_url, "chat/completions");
+        let response = self
+            .send_with_retry(|| self.http_client.post(&url).json(&body))
+            .await?;
+        let status = response.status();
+        if !status.is_success() {
+            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            anyhow::bail!("Speech synthesis failed: HTTP {status}: {error_text}");
+        }
+
+        // Surface a body-read failure as such instead of letting an empty
+        // string reach the JSON parser and produce a misleading parse error.
+        let response_text = response
+            .text()
+            .await
+            .context("Failed to read speech synthesis response body")?;
+        let payload: Value = serde_json::from_str(&response_text)
+            .context("Failed to parse speech synthesis response JSON")?;
+        let (audio_bytes, transcript) = parse_speech_audio_response(&payload)?;
+
+        Ok(SpeechSynthesisResponse {
+            model,
+            audio_format,
+            audio_bytes,
+            transcript,
+            voice,
+        })
+    }
+
    async fn wait_for_rate_limit(&self) {
        let maybe_delay = {
            let mut limiter = self.rate_limiter.lock().await;
@@ -1166,6 +1333,39 @@ mod tests {
        }
    }

+    // Audio nested under `choices[0].message.audio` is decoded from bare
+    // base64, and its transcript string is returned alongside it.
+    #[test]
+    fn parse_speech_audio_response_accepts_message_audio() {
+        let encoded = general_purpose::STANDARD.encode(b"hi");
+        let payload = json!({
+            "choices": [{
+                "message": {
+                    "audio": {
+                        "data": encoded,
+                        "transcript": "hi"
+                    }
+                }
+            }]
+        });
+
+        let (audio, transcript) = parse_speech_audio_response(&payload).unwrap();
+        assert_eq!(audio, b"hi");
+        assert_eq!(transcript.as_deref(), Some("hi"));
+    }
+
+    // A top-level `audio` object is accepted when `choices` is absent, a
+    // `data:` URI prefix is stripped before decoding, and a missing
+    // transcript yields `None`.
+    #[test]
+    fn parse_speech_audio_response_accepts_data_uri() {
+        let encoded = general_purpose::STANDARD.encode(b"wav");
+        let payload = json!({
+            "audio": {
+                "data": format!("data:audio/wav;base64,{encoded}")
+            }
+        });
+
+        let (audio, transcript) = parse_speech_audio_response(&payload).unwrap();
+        assert_eq!(audio, b"wav");
+        assert_eq!(transcript, None);
+    }
+
    #[test]
    fn tool_name_roundtrip_dot() {
        let original = "multi_tool_use.parallel";
@@ -36,9 +36,13 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {

    let model = match model_arg {
        None => None,
+        Some(raw) if matches!(target, ApiProvider::XiaomiMimo) => {
+            let expanded = expand_model_alias_for_provider(target, raw);
+            Some(normalize_model_name_for_provider(target, &expanded).unwrap_or(expanded))
+        }
        Some(raw) if provider_passes_model_through(target) => Some(raw.trim().to_string()),
        Some(raw) => {
-            let expanded = expand_model_alias(raw);
+            let expanded = expand_model_alias_for_provider(target, raw);
            let normalized = if matches!(target, ApiProvider::Deepseek | ApiProvider::DeepseekCN) {
                normalize_model_name_for_provider(target, &expanded)
            } else {
@@ -48,7 +52,7 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
                Some(normalized) => Some(normalized),
                None => {
                    return CommandResult::error(format!(
-                        "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro."
+                        "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro, or xiaomi-mimo tts."
                    ));
                }
            }
@@ -65,8 +69,24 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
    })
 }

-fn expand_model_alias(name: &str) -> String {
-    match name.trim().to_ascii_lowercase().as_str() {
+fn expand_model_alias_for_provider(provider: ApiProvider, name: &str) -> String {
+    let lower = name.trim().to_ascii_lowercase();
+    if matches!(provider, ApiProvider::XiaomiMimo) {
+        return match lower.as_str() {
+            "pro" | "mimo" => "mimo-v2.5-pro".to_string(),
+            "text" => "mimo-v2.5".to_string(),
+            "tts" | "speech" | "mimo-tts" => "mimo-v2.5-tts".to_string(),
+            "voicedesign" | "voice-design" | "mimo-voice-design" => {
+                "mimo-v2.5-tts-voicedesign".to_string()
+            }
+            "voiceclone" | "voice-clone" | "mimo-voice-clone" => {
+                "mimo-v2.5-tts-voiceclone".to_string()
+            }
+            other => other.to_string(),
+        };
+    }
+
+    match lower.as_str() {
        "pro" | "v4-pro" => "deepseek-v4-pro".to_string(),
        "flash" | "v4-flash" => "deepseek-v4-flash".to_string(),
        other => other.to_string(),
@@ -154,6 +174,28 @@ mod tests {
        }
    }

+    // Switching to the Xiaomi MiMo provider with a TTS shorthand must emit a
+    // `SwitchProvider` action carrying the canonical TTS model ID.
+    #[test]
+    fn switch_to_xiaomi_mimo_accepts_tts_shorthands() {
+        let mut app = create_test_app();
+        let result = provider(&mut app, Some("xiaomi-mimo tts"));
+        match result.action {
+            Some(AppAction::SwitchProvider { provider, model }) => {
+                assert_eq!(provider, ApiProvider::XiaomiMimo);
+                assert_eq!(model.as_deref(), Some("mimo-v2.5-tts"));
+            }
+            other => panic!("expected SwitchProvider, got {other:?}"),
+        }
+
+        // Same check for the unhyphenated voice-clone shorthand.
+        let result = provider(&mut app, Some("xiaomi-mimo voiceclone"));
+        match result.action {
+            Some(AppAction::SwitchProvider { provider, model }) => {
+                assert_eq!(provider, ApiProvider::XiaomiMimo);
+                assert_eq!(model.as_deref(), Some("mimo-v2.5-tts-voiceclone"));
+            }
+            other => panic!("expected SwitchProvider, got {other:?}"),
+        }
+    }
+
    #[test]
    fn switch_to_atlascloud_emits_action() {
        let mut app = create_test_app();
@@ -78,6 +78,10 @@ pub const RECENT_OPENROUTER_LARGE_MODELS: &[&str] = &[
 pub const DEFAULT_OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
 pub const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
 pub const DEFAULT_XIAOMI_MIMO_BASE_URL: &str = "https://api.xiaomimimo.com/v1";
+pub const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
+pub const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
+pub const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
+pub const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
 pub const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
 pub const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
 pub const DEFAULT_NOVITA_BASE_URL: &str = "https://api.novita.ai/v1";
@@ -538,6 +542,38 @@ fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
    }
 }

+/// Map a Xiaomi MiMo model name or shorthand to its canonical model ID.
+///
+/// Matching ignores surrounding whitespace and ASCII case, and treats `_` and
+/// spaces as `-`. Returns `None` for any name that is not in the table below.
+///
+/// Because `_` is folded to `-`, the `v2-5` spelling (e.g. `mimo_v2_5_tts`) is
+/// accepted for the TTS models just as it is for the chat models.
+/// `mimo-v2-speech` is accepted because the model registry lists it as an
+/// alias of `mimo-v2-tts`.
+fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
+    let normalized = model.trim().to_ascii_lowercase();
+    let normalized = normalized.replace(['_', ' '], "-");
+    match normalized.as_str() {
+        "mimo"
+        | DEFAULT_XIAOMI_MIMO_MODEL
+        | "mimo-v2-5-pro"
+        | "xiaomi-mimo-v2.5-pro"
+        | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
+        "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
+            Some("mimo-v2.5")
+        }
+        "mimo-tts" | "mimo-v25-tts" | "mimo-v2-5-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
+            Some(XIAOMI_MIMO_TTS_MODEL)
+        }
+        "mimo-tts-voicedesign"
+        | "mimo-voice-design"
+        | "mimo-v25-tts-voicedesign"
+        | "mimo-v2-5-tts-voicedesign"
+        | "mimo-v2.5-tts-voicedesign"
+        | "voicedesign"
+        | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
+        "mimo-tts-voiceclone"
+        | "mimo-voice-clone"
+        | "mimo-v25-tts-voiceclone"
+        | "mimo-v2-5-tts-voiceclone"
+        | "mimo-v2.5-tts-voiceclone"
+        | "voiceclone"
+        | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
+        "mimo-v2-tts" | "mimo-v2-speech" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
+        _ => None,
+    }
+}
+
 /// Normalize a model selected through the TUI for the active provider.
 ///
 /// Official DeepSeek endpoints require bare model IDs. Provider-prefixed
@@ -556,6 +592,12 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
        return Some(canonical.to_string());
    }

+    if matches!(provider, ApiProvider::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return Some(canonical.to_string());
+    }
+
    let normalized = normalize_model_name(model)?;
    if matches!(provider, ApiProvider::Deepseek | ApiProvider::DeepseekCN)
        && let Some(canonical) = canonical_official_deepseek_model_id(&normalized)
@@ -585,7 +627,14 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
 #[must_use]
 pub fn wire_model_for_provider(provider: ApiProvider, model: &str) -> String {
    let trimmed = model.trim();
-    if trimmed.is_empty() || provider_passes_model_through(provider) {
+    if trimmed.is_empty() {
+        return trimmed.to_string();
+    }
+    if matches!(provider, ApiProvider::XiaomiMimo) {
+        return normalize_model_name_for_provider(provider, trimmed)
+            .unwrap_or_else(|| trimmed.to_string());
+    }
+    if provider_passes_model_through(provider) {
        return trimmed.to_string();
    }
    normalize_model_name_for_provider(provider, trimmed).unwrap_or_else(|| trimmed.to_string())
@@ -601,7 +650,14 @@ pub fn model_completion_names_for_provider(provider: ApiProvider) -> Vec<&'stati
            models.extend_from_slice(RECENT_OPENROUTER_LARGE_MODELS);
            models
        }
-        ApiProvider::XiaomiMimo => vec![DEFAULT_XIAOMI_MIMO_MODEL, "mimo-v2.5"],
+        ApiProvider::XiaomiMimo => vec![
+            DEFAULT_XIAOMI_MIMO_MODEL,
+            "mimo-v2.5",
+            XIAOMI_MIMO_TTS_MODEL,
+            XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL,
+            XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL,
+            XIAOMI_MIMO_V2_TTS_MODEL,
+        ],
        ApiProvider::Novita => vec![DEFAULT_NOVITA_MODEL, DEFAULT_NOVITA_FLASH_MODEL],
        ApiProvider::Fireworks => vec![DEFAULT_FIREWORKS_MODEL],
        ApiProvider::Siliconflow => {
@@ -822,6 +878,15 @@ pub struct MemoryConfig {
    pub enabled: Option<bool>,
 }

+/// Xiaomi MiMo speech/TTS output configuration.
+///
+/// Deserialized into the optional `speech` field of `Config`.
+#[derive(Debug, Clone, Default, Deserialize)]
+pub struct SpeechConfig {
+    /// Default directory for generated speech/TTS files when no explicit
+    /// output path is provided. `Config::speech_output_dir` ignores a blank
+    /// value and prefers the speech output-dir environment variables.
+    #[serde(default)]
+    pub output_dir: Option<String>,
+}
+
 impl SnapshotsConfig {
    #[must_use]
    pub fn max_age(&self) -> std::time::Duration {
@@ -1429,6 +1494,10 @@ pub struct Config {
    #[serde(default)]
    pub memory: Option<MemoryConfig>,

+    /// Xiaomi MiMo speech/TTS defaults.
+    #[serde(default)]
+    pub speech: Option<SpeechConfig>,
+
    /// Tunables for `--model auto` (#1207). When absent, the auto router
    /// keeps its existing balanced behaviour.
    #[serde(default)]
@@ -2353,6 +2422,26 @@ impl Config {
            .unwrap_or_else(|| PathBuf::from("./memory.md"))
    }

+    /// Resolve the default speech/TTS output directory, if configured.
+    ///
+    /// Environment variables take precedence over `[speech].output_dir` and
+    /// are consulted in this order: `XIAOMI_MIMO_SPEECH_OUTPUT_DIR`,
+    /// `MIMO_SPEECH_OUTPUT_DIR`, `XIAOMIMIMO_SPEECH_OUTPUT_DIR`. A variable
+    /// that is unset, unreadable, or blank after trimming is skipped, so a
+    /// blank higher-priority variable does not hide a later one.
+    ///
+    /// Returns `None` when neither the environment nor the config supplies a
+    /// non-blank value.
+    #[must_use]
+    pub fn speech_output_dir(&self) -> Option<PathBuf> {
+        const ENV_KEYS: [&str; 3] = [
+            "XIAOMI_MIMO_SPEECH_OUTPUT_DIR",
+            "MIMO_SPEECH_OUTPUT_DIR",
+            "XIAOMIMIMO_SPEECH_OUTPUT_DIR",
+        ];
+
+        ENV_KEYS
+            .iter()
+            .filter_map(|key| std::env::var(key).ok())
+            .map(|value| value.trim().to_string())
+            .find(|value| !value.is_empty())
+            .map(|value| expand_path(&value))
+            .or_else(|| {
+                self.speech
+                    .as_ref()
+                    .and_then(|speech| speech.output_dir.as_deref())
+                    .map(str::trim)
+                    .filter(|value| !value.is_empty())
+                    .map(expand_path)
+            })
+    }
+
    /// Resolve the configured `instructions = [...]` array (#454)
    /// to absolute paths, in declared order. Empty when unset or
    /// when every entry is empty after trimming. Each entry runs
@@ -3540,6 +3629,11 @@ fn normalize_model_config(config: &mut Config) {
 }

 fn normalize_model_for_provider(provider: ApiProvider, model: &str) -> Option<String> {
+    if matches!(provider, ApiProvider::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return Some(canonical.to_string());
+    }
    if provider_passes_model_through(provider) {
        return None;
    }
@@ -3788,6 +3882,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config {
        snapshots: override_cfg.snapshots.or(base.snapshots),
        search: override_cfg.search.or(base.search),
        memory: override_cfg.memory.or(base.memory),
+        speech: override_cfg.speech.or(base.speech),
        auto: override_cfg.auto.or(base.auto),
        update: override_cfg.update.or(base.update),
        lsp: override_cfg.lsp.or(base.lsp),
@@ -6510,6 +6605,37 @@ api_key = "old-openrouter-key"
        }
    }

+    // TTS shorthands must map to canonical Xiaomi MiMo model IDs both when
+    // normalizing a selected model and when computing the wire model name.
+    #[test]
+    fn normalize_xiaomi_mimo_tts_aliases_for_provider() {
+        assert_eq!(
+            normalize_model_name_for_provider(ApiProvider::XiaomiMimo, "tts").as_deref(),
+            Some("mimo-v2.5-tts")
+        );
+        assert_eq!(
+            normalize_model_name_for_provider(ApiProvider::XiaomiMimo, "voice-design").as_deref(),
+            Some("mimo-v2.5-tts-voicedesign")
+        );
+        assert_eq!(
+            wire_model_for_provider(ApiProvider::XiaomiMimo, "voiceclone"),
+            "mimo-v2.5-tts-voiceclone"
+        );
+    }
+
+    // The Xiaomi MiMo completion list must offer the chat models and every
+    // TTS model ID.
+    #[test]
+    fn model_completion_names_for_xiaomi_mimo_include_tts_models() {
+        let models = model_completion_names_for_provider(ApiProvider::XiaomiMimo);
+        for expected in [
+            "mimo-v2.5-pro",
+            "mimo-v2.5",
+            "mimo-v2.5-tts",
+            "mimo-v2.5-tts-voicedesign",
+            "mimo-v2.5-tts-voiceclone",
+            "mimo-v2-tts",
+        ] {
+            assert!(models.contains(&expected), "missing {expected}");
+        }
+    }
+
    #[test]
    fn model_completion_names_for_deepseek_api_are_deduplicated_bare_ids() {
        assert_eq!(
@@ -161,6 +161,8 @@ pub struct EngineConfig {
    /// Path to the user memory file (#489). Always populated; only
    /// consulted when `memory_enabled` is `true`.
    pub memory_path: PathBuf,
+    /// Default directory for Xiaomi MiMo speech/TTS tool outputs.
+    pub speech_output_dir: Option<PathBuf>,
    pub vision_config: Option<crate::config::VisionModelConfig>,
    pub goal_objective: Option<String>,
    /// Tool restriction from custom slash command frontmatter.
@@ -236,6 +238,7 @@ impl Default for EngineConfig {
            subagent_model_overrides: HashMap::new(),
            memory_enabled: false,
            memory_path: PathBuf::from("./memory.md"),
+            speech_output_dir: None,
            vision_config: None,
            strict_tool_mode: false,
            goal_objective: None,
@@ -78,7 +78,11 @@ impl Engine {
        if mode != AppMode::Plan {
            builder = builder
                .with_rlm_tool(self.deepseek_client.clone(), self.session.model.clone())
-                .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone());
+                .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone())
+                .with_speech_tools(
+                    self.deepseek_client.clone(),
+                    self.config.speech_output_dir.clone(),
+                );
        }

        if self.config.features.enabled(Feature::ApplyPatch) && mode != AppMode::Plan {
@@ -6,6 +6,7 @@ use std::process::{Command, Stdio};
 use std::time::Duration;

 use anyhow::{Context, Result, anyhow, bail};
+use base64::{Engine as _, engine::general_purpose};
 use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
 use clap_complete::{Shell, generate};
 use dotenvy::dotenv;
@@ -225,6 +226,9 @@ enum Commands {
    Logout,
    /// List available models from the configured API endpoint
    Models(ModelsArgs),
+    /// Generate speech audio with Xiaomi MiMo TTS models
+    #[command(visible_alias = "tts")]
+    Speech(SpeechArgs),
    /// Run a non-interactive prompt. Use --auto for tool-backed agent mode.
    Exec(ExecArgs),
    /// Generate SWE-bench prediction rows from CodeWhale runs
@@ -531,6 +535,50 @@ struct ModelsArgs {
    json: bool,
 }

+// Arguments for the `speech` subcommand (visible alias: `tts`).
+//
+// The `///` comments on the fields below are doc comments that clap's derive
+// turns into `--help` text, so they are user-facing strings: edit them with
+// the same care as any other CLI output.
+#[derive(Args, Debug, Clone)]
+struct SpeechArgs {
+    /// Text to synthesize. This is sent as the assistant message content.
+    #[arg(value_name = "TEXT")]
+    text: String,
+
+    /// Output audio path. Defaults to speech.<format> in --output-dir,
+    /// [speech].output_dir, or the current directory.
+    #[arg(short, long, value_name = "FILE")]
+    output: Option<PathBuf>,
+
+    /// Directory for the default speech.<format> output file when -o/--output is omitted.
+    #[arg(long = "output-dir", value_name = "DIR")]
+    output_dir: Option<PathBuf>,
+
+    /// TTS model. Defaults to built-in voices, or is inferred from --voice-prompt/--clone-voice.
+    #[arg(long)]
+    model: Option<String>,
+
+    /// Built-in voice ID, or a data:audio/...;base64,... URI for voice clone.
+    #[arg(long)]
+    voice: Option<String>,
+
+    /// Natural language style instruction; not spoken verbatim.
+    #[arg(long)]
+    instruction: Option<String>,
+
+    /// Voice design prompt. Implies mimo-v2.5-tts-voicedesign when --model is omitted.
+    #[arg(long = "voice-prompt")]
+    voice_prompt: Option<String>,
+
+    /// MP3/WAV sample used for voice cloning. Implies mimo-v2.5-tts-voiceclone when --model is omitted.
+    #[arg(long = "clone-voice", value_name = "FILE")]
+    clone_voice: Option<PathBuf>,
+
+    /// Output audio format requested from the API
+    #[arg(long, default_value = "wav")]
+    format: String,
+
+    /// Emit machine-readable JSON output
+    #[arg(long, default_value_t = false)]
+    json: bool,
+}
+
 #[derive(Args, Debug, Default, Clone)]
 struct FeatureToggles {
    /// Enable a feature (repeatable). Equivalent to `features.<name>=true`.
@@ -896,6 +944,10 @@ async fn main() -> Result<()> {
                let config = load_config_from_cli(&cli)?;
                run_models(&config, args).await
            }
+            Commands::Speech(args) => {
+                let config = load_config_from_cli(&cli)?;
+                run_speech(&config, args).await
+            }
            Commands::Exec(args) => {
                let config = load_config_from_cli(&cli)?;
                let workspace = cli.workspace.clone().unwrap_or_else(|| {
@@ -3514,6 +3566,258 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> {
    Ok(())
 }

+/// Run the `speech` / `tts` subcommand.
+///
+/// Validates the arguments, picks a TTS model (the explicit `--model`, or one
+/// inferred from the voice-clone / voice-design options), requests synthesis,
+/// writes the returned audio to the resolved output path, and prints either a
+/// JSON summary or a one-line report.
+///
+/// # Errors
+///
+/// Returns an error when the provider is not Xiaomi MiMo, the text is blank,
+/// the voice options conflict or are missing for the selected model, the
+/// format is unsupported, synthesis fails, or the audio cannot be written.
+async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> {
+    use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
+    use crate::config::{ApiProvider, normalize_model_name_for_provider};
+
+    let SpeechArgs {
+        text,
+        output,
+        output_dir,
+        model,
+        voice,
+        instruction,
+        voice_prompt,
+        clone_voice,
+        format,
+        json: json_output,
+    } = args;
+
+    if config.api_provider() != ApiProvider::XiaomiMimo {
+        bail!(
+            "`speech` requires provider = \"xiaomi-mimo\" (current: {}). Run with `--provider xiaomi-mimo` or set it in config.",
+            config.api_provider().as_str()
+        );
+    }
+
+    if text.trim().is_empty() {
+        bail!("Speech text cannot be empty");
+    }
+    // Trim the voice once and treat a blank value as absent, so the data-URI
+    // check, the conflict check, and the request sent to the API all see the
+    // same value (this matches how the model-visible speech tool reads it).
+    let voice = voice
+        .map(|value| value.trim().to_string())
+        .filter(|value| !value.is_empty());
+    let voice_is_data_uri = voice
+        .as_deref()
+        .is_some_and(|value| value.starts_with("data:audio/"));
+    if clone_voice.is_some() && voice.is_some() {
+        bail!("Use either --clone-voice or --voice for cloned voice data, not both");
+    }
+    let model = match model {
+        Some(value) => {
+            normalize_model_name_for_provider(ApiProvider::XiaomiMimo, &value).unwrap_or(value)
+        }
+        None => {
+            if clone_voice.is_some() || voice_is_data_uri {
+                "mimo-v2.5-tts-voiceclone".to_string()
+            } else if voice_prompt.is_some() {
+                "mimo-v2.5-tts-voicedesign".to_string()
+            } else {
+                "mimo-v2.5-tts".to_string()
+            }
+        }
+    };
+    let model_lower = model.to_ascii_lowercase();
+    if !model_lower.contains("tts") {
+        bail!(
+            "speech requires a TTS model (examples: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone); got {model}"
+        );
+    }
+    let is_voice_design = model_lower.contains("voicedesign");
+    let is_voice_clone = model_lower.contains("voiceclone");
+
+    let instruction = combine_speech_instructions(instruction, voice_prompt);
+    if is_voice_design
+        && instruction
+            .as_deref()
+            .is_none_or(|value| value.trim().is_empty())
+    {
+        bail!(
+            "mimo-v2.5-tts-voicedesign requires --voice-prompt or --instruction to describe the voice"
+        );
+    }
+
+    // Voice precedence: a clone sample file, then no voice at all for voice
+    // design, then the explicit --voice value, then the built-in default.
+    let voice = if let Some(clone_path) = clone_voice {
+        Some(encode_voice_clone_data_uri(&clone_path)?)
+    } else if is_voice_design {
+        None
+    } else if let Some(value) = voice {
+        Some(value)
+    } else if is_voice_clone {
+        bail!("mimo-v2.5-tts-voiceclone requires --clone-voice <mp3|wav> or --voice <data-uri>");
+    } else {
+        Some("mimo_default".to_string())
+    };
+    let format = normalize_speech_format(&format).with_context(|| {
+        format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)")
+    })?;
+    let output = resolve_speech_output_path(
+        output,
+        output_dir.or_else(|| config.speech_output_dir()),
+        &format,
+    );
+
+    let client = DeepSeekClient::new(config)?;
+    let response = client
+        .synthesize_speech(SpeechSynthesisRequest {
+            model: model.clone(),
+            text,
+            instruction,
+            audio_format: format.clone(),
+            voice,
+        })
+        .await?;
+
+    // A bare file name has an empty parent; there is nothing to create then.
+    if let Some(parent) = output.parent().filter(|path| !path.as_os_str().is_empty()) {
+        std::fs::create_dir_all(parent)
+            .with_context(|| format!("Failed to create output directory {}", parent.display()))?;
+    }
+    std::fs::write(&output, &response.audio_bytes)
+        .with_context(|| format!("Failed to write audio file {}", output.display()))?;
+
+    if json_output {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&serde_json::json!({
+                "mode": "speech",
+                "success": true,
+                "model": response.model,
+                "format": response.audio_format,
+                "output": output.display().to_string(),
+                "bytes": response.audio_bytes.len(),
+                "voice": response.voice.as_deref().map(describe_speech_voice),
+                "transcript": response.transcript,
+            }))?
+        );
+    } else {
+        println!(
+            "Generated speech: {} ({} bytes, model: {}, format: {})",
+            output.display(),
+            response.audio_bytes.len(),
+            response.model,
+            response.audio_format
+        );
+    }
+
+    Ok(())
+}
+
+/// Merge the style instruction and the voice-design prompt into one string.
+///
+/// Both inputs are trimmed and blank values are dropped. When both remain the
+/// voice prompt comes first, separated from the instruction by a blank line.
+/// Returns `None` when nothing is left.
+fn combine_speech_instructions(
+    instruction: Option<String>,
+    voice_prompt: Option<String>,
+) -> Option<String> {
+    let non_blank = |raw: Option<String>| {
+        raw.map(|text| text.trim().to_string())
+            .filter(|text| !text.is_empty())
+    };
+
+    match (non_blank(voice_prompt), non_blank(instruction)) {
+        (Some(prompt), Some(style)) => Some(format!("{prompt}\n\n{style}")),
+        (prompt, style) => prompt.or(style),
+    }
+}
+
+/// Largest accepted voice clone sample, measured as the length of its
+/// base64-encoded form (10 * 1024 * 1024 bytes).
+const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
+
+/// Map a user-supplied format name to its canonical spelling.
+///
+/// Matching ignores surrounding whitespace and ASCII case; `pcm` is accepted
+/// as an alias for `pcm16`. Returns `None` for any other value.
+fn normalize_speech_format(format: &str) -> Option<String> {
+    let canonical = match format.trim().to_ascii_lowercase().as_str() {
+        "wav" => "wav",
+        "mp3" => "mp3",
+        "pcm" | "pcm16" => "pcm16",
+        _ => return None,
+    };
+    Some(canonical.to_string())
+}
+
+/// Build the default output file name, `speech.<format>`, using the canonical
+/// format name as the extension and `wav` when the format is not recognized.
+fn default_speech_output_name(format: &str) -> String {
+    let extension = normalize_speech_format(format).unwrap_or_else(|| "wav".to_string());
+    format!("speech.{extension}")
+}
+
+/// Choose the output file path for the CLI command.
+///
+/// An explicit `output` wins. Otherwise the default file name is placed in
+/// `output_dir`, or used as a bare relative path when no directory is given.
+fn resolve_speech_output_path(
+    output: Option<PathBuf>,
+    output_dir: Option<PathBuf>,
+    format: &str,
+) -> PathBuf {
+    match output {
+        Some(explicit) => explicit,
+        None => {
+            let directory = output_dir.unwrap_or_default();
+            directory.join(default_speech_output_name(format))
+        }
+    }
+}
+
+/// Read a voice clone sample and embed it as a `data:<mime>;base64,...` URI.
+///
+/// The extension is validated before any I/O so an unsupported file is
+/// rejected without being read, and the size limit is checked against the
+/// predicted encoded length so an oversized sample is never encoded.
+///
+/// # Errors
+///
+/// Fails when the extension is not `.mp3` or `.wav`, the file cannot be read,
+/// or the encoded sample would exceed `VOICE_CLONE_BASE64_MAX_BYTES`.
+fn encode_voice_clone_data_uri(path: &Path) -> Result<String> {
+    let extension = path
+        .extension()
+        .and_then(|value| value.to_str())
+        .unwrap_or_default()
+        .to_ascii_lowercase();
+    let mime = match extension.as_str() {
+        "mp3" => "audio/mpeg",
+        "wav" => "audio/wav",
+        other => bail!(
+            "Unsupported voice clone sample extension '{}'. Use .mp3 or .wav.",
+            other
+        ),
+    };
+
+    let bytes = std::fs::read(path)
+        .with_context(|| format!("Failed to read voice clone sample {}", path.display()))?;
+    // Padded base64 emits four characters for every started group of three
+    // input bytes, so the encoded length is known without encoding.
+    let encoded_len = bytes.len().div_ceil(3).saturating_mul(4);
+    if encoded_len > VOICE_CLONE_BASE64_MAX_BYTES {
+        bail!(
+            "Voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
+            encoded_len
+        );
+    }
+    let base64_audio = general_purpose::STANDARD.encode(bytes);
+
+    Ok(format!("data:{mime};base64,{base64_audio}"))
+}
+
+/// Produce a printable label for a voice value: data URIs are replaced by a
+/// fixed placeholder, any other value is returned as-is.
+fn describe_speech_voice(voice: &str) -> String {
+    match voice.strip_prefix("data:") {
+        Some(_) => "embedded voice clone sample".to_string(),
+        None => voice.to_string(),
+    }
+}
+
+#[cfg(test)]
+mod speech_cli_tests {
+    use super::*;
+
+    /// Format names are case-insensitive, `pcm` aliases `pcm16`, and unknown
+    /// formats are rejected.
+    #[test]
+    fn normalizes_documented_speech_formats() {
+        let cases = [
+            ("WAV", Some("wav")),
+            ("pcm16", Some("pcm16")),
+            ("pcm", Some("pcm16")),
+            ("flac", None),
+        ];
+        for (raw, expected) in cases {
+            assert_eq!(normalize_speech_format(raw).as_deref(), expected);
+        }
+    }
+
+    /// The default file name follows the requested format, honours the output
+    /// directory, and is ignored when an explicit path is given.
+    #[test]
+    fn default_speech_output_tracks_requested_format() {
+        let bare = resolve_speech_output_path(None, None, "mp3");
+        assert_eq!(bare, PathBuf::from("speech.mp3"));
+
+        let in_dir = resolve_speech_output_path(None, Some(PathBuf::from("audio")), "pcm");
+        assert_eq!(in_dir, PathBuf::from("audio").join("speech.pcm16"));
+
+        let explicit = resolve_speech_output_path(Some(PathBuf::from("custom.wav")), None, "mp3");
+        assert_eq!(explicit, PathBuf::from("custom.wav"));
+    }
+}
+
 /// Test API connectivity by making a minimal request
 async fn test_api_connectivity(config: &Config) -> Result<()> {
    use crate::client::DeepSeekClient;
@@ -5462,6 +5766,7 @@ async fn run_exec_agent(
        prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
+        speech_output_dir: config.speech_output_dir(),
        vision_config: config.vision_model_config(),
        strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
        goal_objective: None,
@@ -2017,6 +2017,7 @@ impl RuntimeThreadManager {
            prefer_bwrap: self.config.prefer_bwrap.unwrap_or(false),
            memory_enabled: self.config.memory_enabled(),
            memory_path: self.config.memory_path(),
+            speech_output_dir: self.config.speech_output_dir(),
            vision_config: self.config.vision_model_config(),
            strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false),
            goal_objective: None,
@@ -48,6 +48,7 @@ pub mod shell;
 mod shell_output;
 pub mod skill;
 pub mod spec;
+pub mod speech;
 pub mod subagent;
 pub mod tasks;
 pub mod test_runner;
@@ -9,7 +9,7 @@
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};

-use std::path::Path;
+use std::path::{Path, PathBuf};

 use serde_json::Value;

@@ -776,6 +776,22 @@ impl ToolRegistryBuilder {
        self.with_tool(Arc::new(RevertTurnTool))
    }

+    /// Register the Xiaomi MiMo speech tool under both of its names, `speech`
+    /// and `tts`. Each registration gets its own copy of the client and the
+    /// default output directory.
+    #[must_use]
+    pub fn with_speech_tools(
+        self,
+        client: Option<DeepSeekClient>,
+        output_dir: Option<PathBuf>,
+    ) -> Self {
+        use super::speech::SpeechTool;
+
+        let primary = SpeechTool::new("speech", client.clone(), output_dir.clone());
+        let alias = SpeechTool::new("tts", client, output_dir);
+        self.with_tool(Arc::new(primary))
+            .with_tool(Arc::new(alias))
+    }
+
    /// Include persistent RLM session tools.
    #[must_use]
    pub fn with_rlm_tool(self, client: Option<DeepSeekClient>, _root_model: String) -> Self {
@@ -958,11 +974,13 @@ impl ToolRegistryBuilder {
        todo_list: super::todo::SharedTodoList,
        plan_state: super::plan::SharedPlanState,
    ) -> Self {
+        let speech_client = client.clone();
        self.with_agent_tools(allow_shell)
            .with_todo_tool(todo_list)
            .with_plan_tool(plan_state)
            .with_review_tool(client.clone(), model.clone())
            .with_rlm_tool(client, model)
+            .with_speech_tools(speech_client, None)
            .with_recall_archive_tool()
            .with_subagent_tools(manager, runtime)
    }
@@ -1218,6 +1236,18 @@ mod tests {
        assert!(registry.contains("list_dir"));
    }

+    /// Both names of the speech tool must be present after registration.
+    #[test]
+    fn builder_registers_speech_alias_tools() {
+        let workspace = tempdir().expect("tempdir");
+        let registry = ToolRegistryBuilder::new()
+            .with_speech_tools(None, None)
+            .build(ToolContext::new(workspace.path().to_path_buf()));
+
+        for tool_name in ["speech", "tts"] {
+            assert!(registry.contains(tool_name));
+        }
+    }
+
    #[test]
    fn test_registry_names() {
        let tmp = tempdir().expect("tempdir");
@@ -0,0 +1,528 @@
+//! Model-visible Xiaomi MiMo speech/TTS generation tool.
+//!
+//! This mirrors the CLI `speech` / `tts` command as a first-class API tool so
+//! the TUI model can generate narrated audio without shelling out to a nested
+//! CodeWhale process.
+
+use std::path::{Path, PathBuf};
+
+use async_trait::async_trait;
+use base64::{Engine as _, engine::general_purpose};
+use serde_json::{Value, json};
+
+use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
+use crate::config::{ApiProvider, normalize_model_name_for_provider};
+use crate::network_policy::{Decision, host_from_url};
+
+use super::spec::{
+    ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
+    optional_bool, optional_str, required_str,
+};
+
+/// Audio format requested when the `format` input is missing or blank.
+const DEFAULT_FORMAT: &str = "wav";
+/// Voice sent when no voice is supplied and the selected model is neither a
+/// voice-design nor a voice-clone model.
+const DEFAULT_VOICE: &str = "mimo_default";
+/// Largest accepted voice clone sample, measured as the length of its
+/// base64-encoded form.
+const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
+/// Canonical format names; used as the schema enum for `format`, in the
+/// unsupported-format error, and echoed in the tool result.
+const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"];
+
+/// Model IDs echoed as `supported_xiaomi_mimo_models` in the tool result.
+///
+/// NOTE(review): several entries do not contain "tts", while `execute` rejects
+/// any model whose name lacks "tts" — confirm this list is meant to describe
+/// the provider's models in general rather than the ones this tool accepts.
+pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[
+    "mimo-v2.5-pro",
+    "mimo-v2.5",
+    "mimo-v2.5-tts-voiceclone",
+    "mimo-v2.5-tts-voicedesign",
+    "mimo-v2.5-tts",
+    "mimo-v2-pro",
+    "mimo-v2-omni",
+    "mimo-v2-tts",
+];
+
+/// TTS model IDs offered as the schema enum for `model` and listed in the
+/// "requires a TTS model" error message.
+const SPEECH_MODEL_EXAMPLES: &[&str] = &[
+    "mimo-v2.5-tts",
+    "mimo-v2.5-tts-voicedesign",
+    "mimo-v2.5-tts-voiceclone",
+    "mimo-v2-tts",
+];
+
+/// Speech/TTS tool instance. The same implementation can be registered under
+/// several names; each instance carries the name it reports.
+pub struct SpeechTool {
+    // Name returned by `ToolSpec::name`.
+    name: &'static str,
+    // API client used for synthesis; `execute` fails with "not available"
+    // when this is `None`.
+    client: Option<DeepSeekClient>,
+    // Configured default output directory, consulted only when the call
+    // supplies neither `output` nor `output_dir`.
+    output_dir: Option<PathBuf>,
+}
+
+impl SpeechTool {
+    /// Create a speech tool that reports `name`, synthesizes through `client`
+    /// (if any), and falls back to `output_dir` for default output files.
+    #[must_use]
+    pub fn new(
+        name: &'static str,
+        client: Option<DeepSeekClient>,
+        output_dir: Option<PathBuf>,
+    ) -> Self {
+        Self {
+            name,
+            client,
+            output_dir,
+        }
+    }
+}
+
+#[async_trait]
+impl ToolSpec for SpeechTool {
+    /// Returns the name this instance was constructed with.
+    fn name(&self) -> &str {
+        self.name
+    }
+
+    /// Fixed description of what the tool does and when to use it; identical
+    /// for every registered name.
+    fn description(&self) -> &str {
+        "Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning."
+    }
+
+    /// JSON schema for the tool input. Only `text` is required; `model` and
+    /// `format` are constrained to `SPEECH_MODEL_EXAMPLES` and
+    /// `SUPPORTED_SPEECH_FORMATS` respectively.
+    fn input_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "text": {
+                    "type": "string",
+                    "description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here."
+                },
+                "output": {
+                    "type": "string",
+                    "description": "Audio file path to write, relative to the workspace unless absolute. Default: speech.<format> in output_dir, configured [speech].output_dir, or the workspace."
+                },
+                "output_dir": {
+                    "type": "string",
+                    "description": "Directory for the default speech.<format> output file when output is omitted. Relative paths stay inside the workspace."
+                },
+                "model": {
+                    "type": "string",
+                    "description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.",
+                    "enum": SPEECH_MODEL_EXAMPLES
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone."
+                },
+                "instruction": {
+                    "type": "string",
+                    "description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim."
+                },
+                "voice_prompt": {
+                    "type": "string",
+                    "description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign."
+                },
+                "clone_voice": {
+                    "type": "string",
+                    "description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone."
+                },
+                "format": {
+                    "type": "string",
+                    "description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.",
+                    "enum": SUPPORTED_SPEECH_FORMATS
+                },
+                "stream": {
+                    "type": "boolean",
+                    "description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false."
+                }
+            },
+            "required": ["text"]
+        })
+    }
+
+    /// Declares the capabilities of this tool: it writes files, uses the
+    /// network, and is marked sandboxable.
+    fn capabilities(&self) -> Vec<ToolCapability> {
+        vec![
+            ToolCapability::WritesFiles,
+            ToolCapability::Network,
+            ToolCapability::Sandboxable,
+        ]
+    }
+
+    /// The tool is auto-approved; see the rationale below.
+    fn approval_requirement(&self) -> ApprovalRequirement {
+        // Speech generation is an explicit user-facing generation action.
+        // Path resolution still enforces workspace/trusted-root boundaries.
+        ApprovalRequirement::Auto
+    }
+
+    /// Synthesize `text` and write the audio to a file.
+    ///
+    /// Steps, in order: validate the text, require a client, normalize the
+    /// format, reject `stream = true`, resolve the output path, read the voice
+    /// options, pick the model, build the instruction and voice, check the
+    /// network policy, call the API, then create the parent directory and
+    /// write the file. The order determines which error is reported when
+    /// several inputs are invalid.
+    ///
+    /// Returns a JSON result describing the model, format, output path, byte
+    /// count, voice, and transcript.
+    ///
+    /// # Errors
+    ///
+    /// `invalid_input` for bad or conflicting arguments, `not_available` when
+    /// no client is configured, `permission_denied` when the network policy
+    /// does not allow the call, and `execution_failed` for synthesis, file
+    /// system, or serialization failures. Path resolution errors from the
+    /// tool context are propagated unchanged.
+    async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
+        let text = required_str(&input, "text")?.trim().to_string();
+        if text.is_empty() {
+            return Err(ToolError::invalid_input("speech text cannot be empty"));
+        }
+
+        let client = self.client.clone().ok_or_else(|| {
+            ToolError::not_available(
+                "speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first",
+            )
+        })?;
+
+        // A missing or blank `format` falls back to the default.
+        let requested_format_raw = optional_str(&input, "format")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .unwrap_or(DEFAULT_FORMAT);
+        let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| {
+            ToolError::invalid_input(format!(
+                "unsupported speech format '{requested_format_raw}' (allowed: {})",
+                SUPPORTED_SPEECH_FORMATS.join(", ")
+            ))
+        })?;
+        if optional_bool(&input, "stream", false) {
+            return Err(ToolError::invalid_input(
+                "stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file",
+            ));
+        }
+        let output_raw = optional_str(&input, "output")
+            .map(str::trim)
+            .filter(|value| !value.is_empty());
+        let output_path = resolve_speech_output_path(
+            &input,
+            context,
+            output_raw,
+            &requested_format,
+            self.output_dir.as_ref(),
+        )?;
+        // Report the path as the caller wrote it when one was given,
+        // otherwise the resolved default path.
+        let output_label = output_raw
+            .map(str::to_string)
+            .unwrap_or_else(|| output_path.display().to_string());
+
+        // Optional string inputs are trimmed; blank values count as absent.
+        let raw_voice = optional_str(&input, "voice")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let raw_instruction = optional_str(&input, "instruction")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let voice_prompt = optional_str(&input, "voice_prompt")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let clone_voice = optional_str(&input, "clone_voice")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+
+        let voice_is_data_uri = raw_voice
+            .as_deref()
+            .is_some_and(|value| value.starts_with("data:audio/"));
+        if clone_voice.is_some() && raw_voice.is_some() {
+            return Err(ToolError::invalid_input(
+                "use either clone_voice or voice for cloned voice data, not both",
+            ));
+        }
+        let model = infer_speech_model(
+            optional_str(&input, "model"),
+            clone_voice.is_some() || voice_is_data_uri,
+            voice_prompt.is_some(),
+        );
+        // Model capabilities are detected from substrings of the model name.
+        let model_lower = model.to_ascii_lowercase();
+        if !model_lower.contains("tts") {
+            return Err(ToolError::invalid_input(format!(
+                "speech tool requires a TTS model (examples: {}), got '{model}'",
+                SPEECH_MODEL_EXAMPLES.join(", ")
+            )));
+        }
+
+        let is_voice_design = model_lower.contains("voicedesign");
+        let is_voice_clone = model_lower.contains("voiceclone");
+        let instruction = combine_speech_instructions(raw_instruction, voice_prompt);
+        if is_voice_design
+            && instruction
+                .as_deref()
+                .is_none_or(|value| value.trim().is_empty())
+        {
+            return Err(ToolError::invalid_input(
+                "mimo-v2.5-tts-voicedesign requires voice_prompt or instruction",
+            ));
+        }
+
+        // Voice precedence: a clone sample file, then no voice at all for
+        // voice design, then the explicit voice, then the built-in default.
+        let voice = if let Some(clone_path) = clone_voice {
+            let clone_path = context.resolve_path(&clone_path)?;
+            Some(encode_voice_clone_data_uri(&clone_path).await?)
+        } else if is_voice_design {
+            None
+        } else if let Some(value) = raw_voice {
+            Some(value)
+        } else if is_voice_clone {
+            return Err(ToolError::invalid_input(
+                "mimo-v2.5-tts-voiceclone requires clone_voice <mp3|wav> or voice <data-uri>",
+            ));
+        } else {
+            Some(DEFAULT_VOICE.to_string())
+        };
+
+        // The policy check runs only once every input has been validated.
+        check_network_policy(context, client.base_url())?;
+
+        let response = client
+            .synthesize_speech(SpeechSynthesisRequest {
+                model: model.clone(),
+                text,
+                instruction,
+                audio_format: requested_format,
+                voice,
+            })
+            .await
+            .map_err(|err| {
+                ToolError::execution_failed(format!("speech synthesis failed: {err}"))
+            })?;
+
+        // A bare file name has an empty parent; nothing to create then.
+        if let Some(parent) = output_path
+            .parent()
+            .filter(|path| !path.as_os_str().is_empty())
+        {
+            tokio::fs::create_dir_all(parent).await.map_err(|err| {
+                ToolError::execution_failed(format!(
+                    "failed to create output directory {}: {err}",
+                    parent.display()
+                ))
+            })?;
+        }
+        tokio::fs::write(&output_path, &response.audio_bytes)
+            .await
+            .map_err(|err| {
+                ToolError::execution_failed(format!(
+                    "failed to write audio file {}: {err}",
+                    output_path.display()
+                ))
+            })?;
+
+        // Voice data URIs are replaced by a short label in the result.
+        let result = json!({
+            "mode": "speech",
+            "success": true,
+            "api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis",
+            "base_url": openai_compatible_base_url(client.base_url()),
+            "model": response.model,
+            "format": response.audio_format,
+            "stream": false,
+            "output": output_label,
+            "absolute_output": output_path.display().to_string(),
+            "bytes": response.audio_bytes.len(),
+            "voice": response.voice.as_deref().map(describe_speech_voice),
+            "transcript": response.transcript,
+            "supported_formats": SUPPORTED_SPEECH_FORMATS,
+            "supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS,
+        });
+        ToolResult::json(&result).map_err(|err| {
+            ToolError::execution_failed(format!("failed to serialize result: {err}"))
+        })
+    }
+}
+
+/// Pick the TTS model for a request.
+///
+/// A non-blank explicit model is normalized for the Xiaomi MiMo provider, or
+/// used as given when normalization yields nothing. Without one, voice cloning
+/// takes priority over voice design, and plain TTS is the fallback.
+fn infer_speech_model(
+    model: Option<&str>,
+    has_clone_voice: bool,
+    has_voice_prompt: bool,
+) -> String {
+    if let Some(requested) = model.map(str::trim).filter(|value| !value.is_empty()) {
+        return normalize_model_name_for_provider(ApiProvider::XiaomiMimo, requested)
+            .unwrap_or_else(|| requested.into());
+    }
+
+    let inferred = if has_clone_voice {
+        "mimo-v2.5-tts-voiceclone"
+    } else if has_voice_prompt {
+        "mimo-v2.5-tts-voicedesign"
+    } else {
+        "mimo-v2.5-tts"
+    };
+    inferred.to_string()
+}
+
+/// Merge the style instruction and the voice-design prompt into one string.
+///
+/// Both inputs are trimmed and blank values are dropped. When both remain the
+/// voice prompt comes first, separated from the instruction by a blank line.
+/// Returns `None` when nothing is left.
+fn combine_speech_instructions(
+    instruction: Option<String>,
+    voice_prompt: Option<String>,
+) -> Option<String> {
+    let non_blank = |raw: Option<String>| {
+        raw.map(|text| text.trim().to_string())
+            .filter(|text| !text.is_empty())
+    };
+    let prompt = non_blank(voice_prompt);
+    let style = non_blank(instruction);
+
+    if let (Some(prompt), Some(style)) = (&prompt, &style) {
+        return Some(format!("{prompt}\n\n{style}"));
+    }
+    prompt.or(style)
+}
+
+/// Map a requested format name to its canonical spelling.
+///
+/// Matching ignores surrounding whitespace and ASCII case; `pcm` is accepted
+/// as an alias for `pcm16`. Returns `None` for any other value.
+fn normalize_speech_format(format: &str) -> Option<String> {
+    let requested = format.trim();
+    let canonical = if requested.eq_ignore_ascii_case("pcm") {
+        "pcm16"
+    } else {
+        ["wav", "mp3", "pcm16"]
+            .into_iter()
+            .find(|known| requested.eq_ignore_ascii_case(known))?
+    };
+    Some(canonical.to_string())
+}
+
+/// Build the default output file name, `speech.<format>`, using the canonical
+/// format name as the extension and the default format when the given one is
+/// not recognized.
+fn default_speech_output_name(format: &str) -> String {
+    let extension =
+        normalize_speech_format(format).unwrap_or_else(|| DEFAULT_FORMAT.to_string());
+    format!("speech.{extension}")
+}
+
+/// Decide where the generated audio file is written.
+///
+/// Precedence: the explicit `output` path, then the `output_dir` tool input,
+/// then the configured output directory, then the workspace root. The first
+/// two go through the tool context's path resolution. A relative configured
+/// directory is anchored to the workspace, so the result does not depend on
+/// the process working directory.
+///
+/// # Errors
+///
+/// Propagates the error from `context.resolve_path` when `output` or
+/// `output_dir` cannot be resolved.
+fn resolve_speech_output_path(
+    input: &Value,
+    context: &ToolContext,
+    output_raw: Option<&str>,
+    format: &str,
+    configured_output_dir: Option<&PathBuf>,
+) -> Result<PathBuf, ToolError> {
+    if let Some(output) = output_raw {
+        return context.resolve_path(output);
+    }
+
+    let filename = default_speech_output_name(format);
+    if let Some(output_dir) = optional_str(input, "output_dir")
+        .map(str::trim)
+        .filter(|value| !value.is_empty())
+    {
+        return Ok(context.resolve_path(output_dir)?.join(filename));
+    }
+
+    Ok(match configured_output_dir {
+        // `join` returns its argument unchanged when that argument is
+        // absolute, so an absolute configured directory is kept as-is and
+        // only a relative one is placed under the workspace.
+        Some(output_dir) => context.workspace.join(output_dir).join(filename),
+        None => context.workspace.join(filename),
+    })
+}
+
+/// Read a voice clone sample and embed it as a `data:<mime>;base64,...` URI.
+///
+/// The extension is validated before any I/O so an unsupported file is
+/// rejected without being read, and the size limit is checked against the
+/// predicted encoded length so an oversized sample is never encoded.
+///
+/// # Errors
+///
+/// `invalid_input` when the extension is not `.mp3` or `.wav` or the encoded
+/// sample would exceed `VOICE_CLONE_BASE64_MAX_BYTES`; `execution_failed` when
+/// the file cannot be read.
+async fn encode_voice_clone_data_uri(path: &Path) -> Result<String, ToolError> {
+    let extension = path
+        .extension()
+        .and_then(|value| value.to_str())
+        .unwrap_or_default()
+        .to_ascii_lowercase();
+    let mime = match extension.as_str() {
+        "mp3" => "audio/mpeg",
+        "wav" => "audio/wav",
+        other => {
+            return Err(ToolError::invalid_input(format!(
+                "unsupported voice clone sample extension '{other}'. Use .mp3 or .wav."
+            )));
+        }
+    };
+
+    let bytes = tokio::fs::read(path).await.map_err(|err| {
+        ToolError::execution_failed(format!(
+            "failed to read voice clone sample {}: {err}",
+            path.display()
+        ))
+    })?;
+    // Padded base64 emits four characters for every started group of three
+    // input bytes, so the encoded length is known without encoding.
+    let encoded_len = bytes.len().div_ceil(3).saturating_mul(4);
+    if encoded_len > VOICE_CLONE_BASE64_MAX_BYTES {
+        return Err(ToolError::invalid_input(format!(
+            "voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
+            encoded_len
+        )));
+    }
+    let base64_audio = general_purpose::STANDARD.encode(bytes);
+
+    Ok(format!("data:{mime};base64,{base64_audio}"))
+}
+
+/// Produce a short label for a voice value: data URIs are replaced by a fixed
+/// placeholder, any other value is returned as-is.
+fn describe_speech_voice(voice: &str) -> String {
+    let label = match voice.strip_prefix("data:") {
+        Some(_) => "embedded voice clone sample",
+        None => voice,
+    };
+    label.to_string()
+}
+
+/// Return the base URL without trailing slashes, appending `/v1` unless it
+/// already ends in `/v1` or `/beta`.
+fn openai_compatible_base_url(base_url: &str) -> String {
+    let base = base_url.trim_end_matches('/');
+    let already_versioned = ["/v1", "/beta"]
+        .iter()
+        .any(|suffix| base.ends_with(*suffix));
+
+    if already_versioned {
+        base.to_owned()
+    } else {
+        format!("{base}/v1")
+    }
+}
+
+/// Ask the network policy, if one is configured, whether the speech API host
+/// may be contacted.
+///
+/// Succeeds when there is no policy, when no host can be extracted from the
+/// URL, or when the policy allows the host. Both a denial and a decision that
+/// would need a prompt are reported as `permission_denied`.
+fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> {
+    let verdict = context.network_policy.as_ref().and_then(|decider| {
+        let host = host_from_url(&openai_compatible_base_url(base_url))?;
+        let decision = decider.evaluate(&host, "speech");
+        Some((decision, host))
+    });
+    let Some((decision, host)) = verdict else {
+        return Ok(());
+    };
+
+    let reason = match decision {
+        Decision::Allow => return Ok(()),
+        Decision::Deny => format!("speech network call to '{host}' blocked by network policy"),
+        Decision::Prompt => format!(
+            "speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config"
+        ),
+    };
+    Err(ToolError::permission_denied(reason))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Without an explicit model the mode flags select the model; explicit
+    /// names are normalized or kept.
+    #[test]
+    fn infers_speech_model_from_requested_mode() {
+        let cases = [
+            (None, false, false, "mimo-v2.5-tts"),
+            (None, false, true, "mimo-v2.5-tts-voicedesign"),
+            (None, true, false, "mimo-v2.5-tts-voiceclone"),
+            (Some("mimo-tts"), false, false, "mimo-v2.5-tts"),
+            (Some("mimo-v2-tts"), false, false, "mimo-v2-tts"),
+        ];
+        for (model, has_clone_voice, has_voice_prompt, expected) in cases {
+            let inferred = infer_speech_model(model, has_clone_voice, has_voice_prompt);
+            assert_eq!(inferred, expected);
+        }
+    }
+
+    /// The voice prompt precedes the instruction, and lone values are trimmed.
+    #[test]
+    fn combines_voice_prompt_before_instruction() {
+        let both = combine_speech_instructions(
+            Some("Speak warmly.".to_string()),
+            Some("Young Chinese female voice".to_string()),
+        );
+        assert_eq!(
+            both.as_deref(),
+            Some("Young Chinese female voice\n\nSpeak warmly.")
+        );
+
+        let instruction_only = combine_speech_instructions(Some("  calm  ".to_string()), None);
+        assert_eq!(instruction_only.as_deref(), Some("calm"));
+    }
+
+    /// Format names are case-insensitive, `pcm` aliases `pcm16`, and unknown
+    /// formats are rejected.
+    #[test]
+    fn normalizes_documented_speech_formats() {
+        let cases = [
+            ("WAV", Some("wav")),
+            ("pcm16", Some("pcm16")),
+            ("pcm", Some("pcm16")),
+            ("flac", None),
+        ];
+        for (raw, expected) in cases {
+            assert_eq!(normalize_speech_format(raw).as_deref(), expected);
+        }
+    }
+
+    /// `/v1` is appended only when the URL does not already end with it.
+    #[test]
+    fn displays_openai_compatible_base_url() {
+        for raw in [
+            "https://api.xiaomimimo.com",
+            "https://api.xiaomimimo.com/v1",
+        ] {
+            assert_eq!(
+                openai_compatible_base_url(raw),
+                "https://api.xiaomimimo.com/v1"
+            );
+        }
+    }
+
+    /// The tool is auto-approved yet not read-only, and its schema mentions
+    /// the clone model, the `pcm16` format, and the `stream` flag.
+    #[test]
+    fn speech_tool_is_auto_approved_but_not_read_only() {
+        let tool = SpeechTool::new("speech", None, None);
+        assert_eq!(tool.name(), "speech");
+        assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto);
+        assert!(!tool.is_read_only());
+
+        let schema_text = tool.input_schema().to_string();
+        for needle in ["mimo-v2.5-tts-voiceclone", "pcm16", "stream"] {
+            assert!(schema_text.contains(needle));
+        }
+    }
+}
@@ -332,6 +332,9 @@ fn picker_model_hint(id: &str) -> &'static str {
        }
        "arcee-ai/trinity-large-thinking" => "large thinking",
        "xiaomi/mimo-v2.5-pro" | "mimo-v2.5-pro" => "long context",
+        "mimo-v2.5-tts" | "mimo-v2-tts" => "speech / TTS",
+        "mimo-v2.5-tts-voicedesign" => "voice design",
+        "mimo-v2.5-tts-voiceclone" => "voice clone",
        "minimax/minimax-m3" => "1M multimodal",
        _ => "provider model",
    }
@@ -781,6 +781,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
        prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
        memory_enabled: config.memory_enabled(),
        memory_path: config.memory_path(),
+        speech_output_dir: config.speech_output_dir(),
        vision_config: config.vision_model_config(),
        strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
        goal_objective: app.hunt.quarry.clone(),
@@ -118,7 +118,7 @@ endpoint.
 | `wanjie-ark` | `[providers.wanjie_ark]` | `WANJIE_ARK_API_KEY`, `WANJIE_API_KEY`, `WANJIE_MAAS_API_KEY` | `WANJIE_ARK_BASE_URL`, `WANJIE_BASE_URL`, `WANJIE_MAAS_BASE_URL`; default `https://maas-openapi.wanjiedata.com/api/v1` | `deepseek-reasoner` | OpenAI-compatible hosted route. `WANJIE_ARK_MODEL`, `WANJIE_MODEL`, and `WANJIE_MAAS_MODEL` are accepted. |
 | `volcengine` | `[providers.volcengine]` | `VOLCENGINE_API_KEY`, `VOLCENGINE_ARK_API_KEY`, `ARK_API_KEY` | `VOLCENGINE_BASE_URL`, `VOLCENGINE_ARK_BASE_URL`, `ARK_BASE_URL`; default `https://ark.cn-beijing.volces.com/api/coding/v3` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | Volcengine/Volcano Engine Ark OpenAI-compatible coding endpoint. `VOLCENGINE_MODEL` and `VOLCENGINE_ARK_MODEL` are accepted. |
 | `openrouter` | `[providers.openrouter]` | `OPENROUTER_API_KEY` | `OPENROUTER_BASE_URL`; default `https://openrouter.ai/api/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`; recent large IDs include `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-35b-a3b`, `google/gemma-4-31b-it`, `z-ai/glm-5.1`, `moonshotai/kimi-k2.6` | Additive open-model routing layer. It does not replace DeepSeek; it lets users route supported model IDs through OpenRouter when they choose it. |
-| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. |
+| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. `codewhale speech` / `tts` uses the TTS models. |
 | `novita` | `[providers.novita]` | `NOVITA_API_KEY` | `NOVITA_BASE_URL`; default `https://api.novita.ai/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | OpenAI-compatible hosted route for DeepSeek model IDs. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
 | `fireworks` | `[providers.fireworks]` | `FIREWORKS_API_KEY` | `FIREWORKS_BASE_URL`; default `https://api.fireworks.ai/inference/v1` | `accounts/fireworks/models/deepseek-v4-pro` | OpenAI-compatible hosted route. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
 | `siliconflow` | `[providers.siliconflow]` | `SILICONFLOW_API_KEY` | `SILICONFLOW_BASE_URL`; default `https://api.siliconflow.com/v1` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | OpenAI-compatible hosted route. Official docs use the `.com` endpoint; users who need the regional endpoint can set `https://api.siliconflow.cn/v1` explicitly. `SILICONFLOW_MODEL` is accepted. Reasoning aliases `deepseek-reasoner` and `deepseek-r1` map to Pro; `deepseek-chat` and `deepseek-v3` map to Flash. |
@@ -130,7 +130,11 @@ endpoint.
 ### Xiaomi MiMo Notes

 `xiaomi-mimo` defaults to `mimo-v2.5-pro` for long-context reasoning and coding
-work, while the static registry also exposes `mimo-v2.5`. Xiaomi's current
+work, while the static registry also exposes `mimo-v2.5`. Xiaomi MiMo TTS is
+available through `codewhale --provider xiaomi-mimo speech "text" --model tts`
+(or the `tts` subcommand alias), plus model-visible `speech` / `tts` tools in Agent/YOLO mode.
+Voice-design and voice-clone shorthands map to `mimo-v2.5-tts-voicedesign` and
+`mimo-v2.5-tts-voiceclone`. Xiaomi's current
 [image-understanding guide](https://platform.xiaomimimo.com/docs/en-US/usage-guide/multimodal-understanding/image-understanding)
 includes `mimo-v2.5` for image input. CodeWhale exposes image analysis through the
 separate `[vision_model]` / `image_analyze` path; set that model to
@@ -164,7 +168,7 @@ endpoint when the endpoint supports model listing.
 | `wanjie-ark` | `deepseek-reasoner` | yes | yes |
 | `volcengine` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | yes | yes |
 | `openrouter` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`, `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `xiaomi/mimo-v2.5`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-27b`, `moonshotai/kimi-k2.6`, `z-ai/glm-5.1`, `tencent/hy3-preview`, `google/gemma-4-31b-it`, `google/gemma-4-26b-a4b-it`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free` | yes | yes |
-| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5` | yes | yes |
+| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | yes | yes for chat models; no for TTS models |
 | `novita` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | yes | yes |
 | `fireworks` | `accounts/fireworks/models/deepseek-v4-pro` | yes | yes |
 | `siliconflow` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | yes | yes |