feat: add Xiaomi MiMo speech support

This commit is contained in:
xyuai
2026-06-02 09:00:14 +08:00
committed by Hunter B
parent 139b542d3f
commit 8532dcc49e
19 changed files with 1397 additions and 12 deletions
+1
View File
@@ -323,6 +323,7 @@ codewhale --provider openrouter --model minimax/minimax-m3
# Xiaomi MiMo
codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_KEY"
codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
codewhale --provider xiaomi-mimo speech "Hello from MiMo" --model tts -o hello.wav
# Novita
codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
+1
View File
@@ -269,6 +269,7 @@ codewhale --provider openrouter --model qwen/qwen3.7-max
# Xiaomi MiMo
codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_MIMO_API_KEY"
codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
codewhale --provider xiaomi-mimo speech "你好，MiMo" --model tts -o hello.wav
# Novita
codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
+11 -1
View File
@@ -45,6 +45,9 @@ base_url = "https://api.deepseek.com/beta"
# deepseek-ai/deepseek-v4-flash — default AtlasCloud model ID
# deepseek-reasoner — default Wanjie Ark model ID
# mimo-v2.5-pro — default Xiaomi MiMo model ID
# mimo-v2.5-tts — Xiaomi MiMo speech/TTS model ID
# mimo-v2.5-tts-voicedesign — Xiaomi MiMo voice-design TTS model ID
# mimo-v2.5-tts-voiceclone — Xiaomi MiMo voice-clone TTS model ID
# accounts/fireworks/models/deepseek-v4-pro — Fireworks AI Pro model ID
# deepseek-ai/DeepSeek-V4-Pro — SiliconFlow hosted Pro model ID
# deepseek-ai/DeepSeek-V4-Flash — SiliconFlow hosted Flash model ID
@@ -120,6 +123,11 @@ memory_path = "~/.codewhale/memory.md"
# Parsed but currently unused (reserved for future versions):
# tools_file = "./tools.json"
# Xiaomi MiMo speech/TTS defaults. When set, the environment variables
# XIAOMI_MIMO_SPEECH_OUTPUT_DIR / MIMO_SPEECH_OUTPUT_DIR take precedence over output_dir.
[speech]
# output_dir = "./speech"
# Native tool catalog controls (#2076). By default only the core tool surface
# is loaded into the model context; less common native tools are discoverable
# through ToolSearch and loaded on first use.
@@ -301,7 +309,9 @@ max_subagents = 10 # optional (1-20)
[providers.xiaomi_mimo]
# api_key = "YOUR_XIAOMI_KEY"
# base_url = "https://api.xiaomimimo.com/v1"
# model = "mimo-v2.5-pro"
# model = "mimo-v2.5-pro" # chat/reasoning
# TTS aliases are also accepted by `codewhale speech`: tts, voice-design, voice-clone
# TTS model IDs: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone, mimo-v2-tts
# Novita AI-hosted inference (https://novita.ai)
[providers.novita]
+56
View File
@@ -307,6 +307,46 @@ impl Default for ModelRegistry {
supports_tools: true,
supports_reasoning: true,
},
ModelInfo {
id: "mimo-v2.5-tts".to_string(),
provider: ProviderKind::XiaomiMimo,
aliases: vec![
"tts".to_string(),
"speech".to_string(),
"mimo-tts".to_string(),
],
supports_tools: false,
supports_reasoning: false,
},
ModelInfo {
id: "mimo-v2.5-tts-voicedesign".to_string(),
provider: ProviderKind::XiaomiMimo,
aliases: vec![
"voicedesign".to_string(),
"voice-design".to_string(),
"mimo-voice-design".to_string(),
],
supports_tools: false,
supports_reasoning: false,
},
ModelInfo {
id: "mimo-v2.5-tts-voiceclone".to_string(),
provider: ProviderKind::XiaomiMimo,
aliases: vec![
"voiceclone".to_string(),
"voice-clone".to_string(),
"mimo-voice-clone".to_string(),
],
supports_tools: false,
supports_reasoning: false,
},
ModelInfo {
id: "mimo-v2-tts".to_string(),
provider: ProviderKind::XiaomiMimo,
aliases: vec!["mimo-v2-speech".to_string()],
supports_tools: false,
supports_reasoning: false,
},
ModelInfo {
id: "deepseek/deepseek-v4-pro".to_string(),
provider: ProviderKind::Novita,
@@ -707,6 +747,22 @@ mod tests {
assert!(resolved.resolved.supports_reasoning);
}
/// TTS shorthands must resolve to the canonical Xiaomi MiMo model IDs when the
/// provider hint is `XiaomiMimo`, and the base TTS model must report no tool or
/// reasoning support.
#[test]
fn xiaomi_mimo_tts_aliases_resolve_when_provider_hinted() {
    let registry = ModelRegistry::default();

    // The plain `tts` alias also pins provider and capability flags.
    let base = registry.resolve(Some("tts"), Some(ProviderKind::XiaomiMimo));
    assert_eq!(base.resolved.provider, ProviderKind::XiaomiMimo);
    assert_eq!(base.resolved.id, "mimo-v2.5-tts");
    assert!(!base.resolved.supports_tools);
    assert!(!base.resolved.supports_reasoning);

    // Remaining aliases only need to land on the right model ID.
    for (alias, expected_id) in [
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
        ("voiceclone", "mimo-v2.5-tts-voiceclone"),
    ] {
        let outcome = registry.resolve(Some(alias), Some(ProviderKind::XiaomiMimo));
        assert_eq!(outcome.resolved.id, expected_id);
    }
}
#[test]
fn wanjie_ark_default_uses_reasoner_model_id() {
let registry = ModelRegistry::default();
+7
View File
@@ -133,6 +133,9 @@ enum Commands {
Doctor(TuiPassthroughArgs),
/// List live DeepSeek API models via the TUI binary.
Models(TuiPassthroughArgs),
/// Generate speech audio with Xiaomi MiMo TTS models via the TUI binary.
#[command(visible_alias = "tts")]
Speech(TuiPassthroughArgs),
/// List saved TUI sessions.
Sessions(TuiPassthroughArgs),
/// Resume a saved TUI session.
@@ -510,6 +513,10 @@ fn run() -> Result<()> {
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
delegate_to_tui(&cli, &resolved_runtime, tui_args("models", args))
}
Some(Commands::Speech(args)) => {
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
delegate_to_tui(&cli, &resolved_runtime, tui_args("speech", args))
}
Some(Commands::Sessions(args)) => {
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
delegate_to_tui(&cli, &resolved_runtime, tui_args("sessions", args))
+62
View File
@@ -44,6 +44,10 @@ const OPENROUTER_TENCENT_HY3_PREVIEW_MODEL: &str = "tencent/hy3-preview";
const OPENROUTER_XIAOMI_MIMO_V2_5_PRO_MODEL: &str = "xiaomi/mimo-v2.5-pro";
const OPENROUTER_XIAOMI_MIMO_V2_5_MODEL: &str = "xiaomi/mimo-v2.5";
const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
const DEFAULT_FIREWORKS_MODEL: &str = "accounts/fireworks/models/deepseek-v4-pro";
@@ -1447,6 +1451,12 @@ pub fn load_project_config(workspace: &Path) -> Option<ConfigToml> {
}
fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
if matches!(provider, ProviderKind::XiaomiMimo)
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
{
return canonical.to_string();
}
if matches!(
provider,
ProviderKind::Atlascloud
@@ -1521,6 +1531,38 @@ fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
}
}
fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
let normalized = model.trim().to_ascii_lowercase();
let normalized = normalized.replace(['_', ' '], "-");
match normalized.as_str() {
"mimo"
| DEFAULT_XIAOMI_MIMO_MODEL
| "mimo-v2-5-pro"
| "xiaomi-mimo-v2.5-pro"
| "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
"mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
Some("mimo-v2.5")
}
"mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
Some(XIAOMI_MIMO_TTS_MODEL)
}
"mimo-tts-voicedesign"
| "mimo-voice-design"
| "mimo-v25-tts-voicedesign"
| "mimo-v2.5-tts-voicedesign"
| "voicedesign"
| "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
"mimo-tts-voiceclone"
| "mimo-voice-clone"
| "mimo-v25-tts-voiceclone"
| "mimo-v2.5-tts-voiceclone"
| "voiceclone"
| "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
"mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
_ => None,
}
}
fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
let normalized = model.trim().to_ascii_lowercase();
let normalized = normalized.replace(['_', ' '], "-");
@@ -3571,6 +3613,26 @@ unix_socket_path = "/tmp/cw-hooks.sock"
assert_eq!(resolved.model, DEFAULT_XIAOMI_MIMO_MODEL);
}
/// Known TTS shorthands normalize to canonical IDs; unknown Xiaomi MiMo model
/// names pass through untouched.
#[test]
fn xiaomi_mimo_tts_aliases_resolve_to_canonical_models() {
    let cases = [
        ("tts", "mimo-v2.5-tts"),
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
        ("voiceclone", "mimo-v2.5-tts-voiceclone"),
        ("custom-mimo-model", "custom-mimo-model"),
    ];
    for (input, expected) in cases {
        assert_eq!(
            normalize_model_for_provider(ProviderKind::XiaomiMimo, input),
            expected
        );
    }
}
#[test]
fn novita_provider_defaults_to_canonical_endpoint_and_model() {
let _lock = env_lock();
+200
View File
@@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex as StdMutex, OnceLock};
use std::time::{Duration, Instant};
use anyhow::{Context, Result};
use base64::{Engine as _, engine::general_purpose};
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderName, HeaderValue};
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
@@ -119,6 +120,31 @@ pub struct AvailableModel {
pub created: Option<u64>,
}
/// Request payload for Xiaomi MiMo speech synthesis models.
///
/// MiMo-V2.5-TTS / MiMo-V2-TTS use the OpenAI-compatible
/// `/v1/chat/completions` endpoint: the optional style/voice instruction is
/// sent as a `user` message, while the text to synthesize is sent as an
/// `assistant` message.
#[derive(Debug, Clone)]
pub struct SpeechSynthesisRequest {
/// Model name or alias; trimmed and mapped to the wire model ID before sending.
pub model: String,
/// Text to be spoken; must be non-empty after trimming.
pub text: String,
/// Optional style / voice-design instruction; blank values are treated as absent.
pub instruction: Option<String>,
/// Requested audio container/encoding; a blank value falls back to `wav`.
pub audio_format: String,
/// Optional voice value forwarded as `audio.voice`; blank values are treated as absent.
pub voice: Option<String>,
}
/// Decoded speech synthesis result.
#[derive(Debug, Clone)]
pub struct SpeechSynthesisResponse {
/// Wire model ID that was actually sent to the API.
pub model: String,
/// Normalized audio format that was requested.
pub audio_format: String,
/// Raw audio bytes decoded from the base64 `audio.data` field.
pub audio_bytes: Vec<u8>,
/// Transcript from `audio.transcript`, when the response includes one.
pub transcript: Option<String>,
/// Trimmed voice value that was sent with the request, if any.
pub voice: Option<String>,
}
/// Client for DeepSeek's OpenAI-compatible APIs.
#[must_use]
pub struct DeepSeekClient {
@@ -407,6 +433,49 @@ pub(super) fn api_url(base_url: &str, path: &str) -> String {
format!("{}/{}", versioned.trim_end_matches('/'), path)
}
/// Trim and ASCII-lowercase a requested audio format, defaulting to `wav`
/// when the input is blank.
fn normalize_audio_format(format: &str) -> String {
    match format.trim() {
        "" => String::from("wav"),
        requested => requested.to_ascii_lowercase(),
    }
}
/// Extract decoded audio bytes and the optional transcript from a speech
/// synthesis response.
///
/// The audio object is looked up, in order, at `choices[0].message.audio`,
/// `choices[0].delta.audio`, and the top-level `audio` key. `audio.data` may be
/// bare base64 or a `data:...;base64,<payload>` URI; anything up to and
/// including the first comma is discarded before decoding.
///
/// # Errors
/// Fails when no audio object is present, when `audio.data` is missing or not
/// a string, or when the payload is not valid standard base64.
fn parse_speech_audio_response(payload: &Value) -> Result<(Vec<u8>, Option<String>)> {
    let first_choice = payload
        .get("choices")
        .and_then(Value::as_array)
        .and_then(|choices| choices.first());

    let choice_audio = first_choice.and_then(|choice| {
        match choice.get("message").and_then(|message| message.get("audio")) {
            Some(found) => Some(found),
            None => choice.get("delta").and_then(|delta| delta.get("audio")),
        }
    });

    let audio = match choice_audio {
        Some(found) => Some(found),
        None => payload.get("audio"),
    }
    .context("Speech synthesis response did not include choices[0].message.audio")?;

    let raw = audio
        .get("data")
        .and_then(Value::as_str)
        .context("Speech synthesis response did not include audio.data")?
        .trim();

    // Drop a `data:<mime>;base64,` prefix when present.
    let encoded = match raw.split_once(',') {
        Some((_, tail)) => tail.trim(),
        None => raw,
    };

    let audio_bytes = general_purpose::STANDARD
        .decode(encoded)
        .context("Failed to decode speech audio base64 data")?;

    let transcript = audio
        .get("transcript")
        .and_then(Value::as_str)
        .map(|spoken| spoken.to_string());

    Ok((audio_bytes, transcript))
}
// === DeepSeekClient ===
/// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value
@@ -645,6 +714,104 @@ impl DeepSeekClient {
parse_models_response(&response_text)
}
/// Generate speech with Xiaomi MiMo TTS models.
///
/// The spoken text is placed in an `assistant` message because Xiaomi
/// MiMo's TTS chat-completions surface expects that shape. The optional
/// `instruction` is a `user` message that controls style, voice design, or
/// voice-clone performance and is not spoken verbatim.
///
/// # Errors
/// Returns an error when:
/// - the client is not configured for the `xiaomi-mimo` provider;
/// - the model or text is empty after trimming;
/// - a voice-design model is used without an instruction, or a voice-clone
///   model is used without voice data;
/// - the HTTP request fails or returns a non-success status;
/// - the response body cannot be read, is not valid JSON, or carries no
///   decodable audio.
pub async fn synthesize_speech(
    &self,
    request: SpeechSynthesisRequest,
) -> Result<SpeechSynthesisResponse> {
    if self.api_provider != crate::config::ApiProvider::XiaomiMimo {
        anyhow::bail!(
            "speech synthesis requires provider 'xiaomi-mimo' (current: {})",
            self.api_provider.as_str()
        );
    }

    let model = request.model.trim().to_string();
    if model.is_empty() {
        anyhow::bail!("Speech model cannot be empty");
    }
    let text = request.text.trim().to_string();
    if text.is_empty() {
        anyhow::bail!("Speech text cannot be empty");
    }

    let audio_format = normalize_audio_format(&request.audio_format);
    // Resolve aliases such as `tts` to the ID the API expects.
    let model = wire_model_for_provider(self.api_provider, &model);
    let model_lower = model.to_ascii_lowercase();

    // Blank instruction / voice values are treated as not provided.
    let instruction = request
        .instruction
        .as_deref()
        .map(str::trim)
        .filter(|value| !value.is_empty());
    let voice = request
        .voice
        .as_deref()
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(str::to_string);

    if model_lower.contains("voicedesign") && instruction.is_none() {
        anyhow::bail!(
            "Model '{model}' requires a voice design prompt. Pass --voice-prompt or --instruction."
        );
    }
    if model_lower.contains("voiceclone") && voice.is_none() {
        anyhow::bail!(
            "Model '{model}' requires cloned voice data. Pass --clone-voice <mp3|wav> or --voice <data-uri>."
        );
    }

    let mut audio = json!({
        "format": audio_format.clone(),
    });
    if let Some(voice) = voice.as_deref() {
        audio["voice"] = json!(voice);
    }

    let body = json!({
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": instruction.unwrap_or(""),
            },
            {
                "role": "assistant",
                "content": text,
            }
        ],
        "audio": audio,
    });

    let url = api_url(&self.base_url, "chat/completions");
    let response = self
        .send_with_retry(|| self.http_client.post(&url).json(&body))
        .await?;

    let status = response.status();
    if !status.is_success() {
        let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
        anyhow::bail!("Speech synthesis failed: HTTP {status}: {error_text}");
    }

    // Surface body-read failures directly instead of masking them as a JSON
    // parse error on an empty string.
    let response_text = response
        .text()
        .await
        .context("Failed to read speech synthesis response body")?;
    let payload: Value = serde_json::from_str(&response_text)
        .context("Failed to parse speech synthesis response JSON")?;
    let (audio_bytes, transcript) = parse_speech_audio_response(&payload)?;

    Ok(SpeechSynthesisResponse {
        model,
        audio_format,
        audio_bytes,
        transcript,
        voice,
    })
}
async fn wait_for_rate_limit(&self) {
let maybe_delay = {
let mut limiter = self.rate_limiter.lock().await;
@@ -1166,6 +1333,39 @@ mod tests {
}
}
/// Audio nested under `choices[0].message.audio` is decoded and its transcript
/// is returned.
#[test]
fn parse_speech_audio_response_accepts_message_audio() {
    let base64_audio = general_purpose::STANDARD.encode(b"hi");
    let audio_field = json!({
        "data": base64_audio,
        "transcript": "hi"
    });
    let payload = json!({
        "choices": [{ "message": { "audio": audio_field } }]
    });

    let (decoded, spoken) = parse_speech_audio_response(&payload).unwrap();

    assert_eq!(decoded, b"hi");
    assert_eq!(spoken.as_deref(), Some("hi"));
}
/// A top-level `audio.data` value given as a `data:` URI is decoded, and a
/// missing transcript yields `None`.
#[test]
fn parse_speech_audio_response_accepts_data_uri() {
    let base64_audio = general_purpose::STANDARD.encode(b"wav");
    let data_uri = format!("data:audio/wav;base64,{base64_audio}");
    let payload = json!({ "audio": { "data": data_uri } });

    let (decoded, spoken) = parse_speech_audio_response(&payload).unwrap();

    assert_eq!(decoded, b"wav");
    assert_eq!(spoken, None);
}
#[test]
fn tool_name_roundtrip_dot() {
let original = "multi_tool_use.parallel";
+46 -4
View File
@@ -36,9 +36,13 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
let model = match model_arg {
None => None,
Some(raw) if matches!(target, ApiProvider::XiaomiMimo) => {
let expanded = expand_model_alias_for_provider(target, raw);
Some(normalize_model_name_for_provider(target, &expanded).unwrap_or(expanded))
}
Some(raw) if provider_passes_model_through(target) => Some(raw.trim().to_string()),
Some(raw) => {
let expanded = expand_model_alias(raw);
let expanded = expand_model_alias_for_provider(target, raw);
let normalized = if matches!(target, ApiProvider::Deepseek | ApiProvider::DeepseekCN) {
normalize_model_name_for_provider(target, &expanded)
} else {
@@ -48,7 +52,7 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
Some(normalized) => Some(normalized),
None => {
return CommandResult::error(format!(
"Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro."
"Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro, or xiaomi-mimo tts."
));
}
}
@@ -65,8 +69,24 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
})
}
fn expand_model_alias(name: &str) -> String {
match name.trim().to_ascii_lowercase().as_str() {
fn expand_model_alias_for_provider(provider: ApiProvider, name: &str) -> String {
let lower = name.trim().to_ascii_lowercase();
if matches!(provider, ApiProvider::XiaomiMimo) {
return match lower.as_str() {
"pro" | "mimo" => "mimo-v2.5-pro".to_string(),
"text" => "mimo-v2.5".to_string(),
"tts" | "speech" | "mimo-tts" => "mimo-v2.5-tts".to_string(),
"voicedesign" | "voice-design" | "mimo-voice-design" => {
"mimo-v2.5-tts-voicedesign".to_string()
}
"voiceclone" | "voice-clone" | "mimo-voice-clone" => {
"mimo-v2.5-tts-voiceclone".to_string()
}
other => other.to_string(),
};
}
match lower.as_str() {
"pro" | "v4-pro" => "deepseek-v4-pro".to_string(),
"flash" | "v4-flash" => "deepseek-v4-flash".to_string(),
other => other.to_string(),
@@ -154,6 +174,28 @@ mod tests {
}
}
/// `/provider xiaomi-mimo <shorthand>` switches provider and expands TTS
/// shorthands to canonical model IDs.
#[test]
fn switch_to_xiaomi_mimo_accepts_tts_shorthands() {
    let mut app = create_test_app();
    let cases = [
        ("xiaomi-mimo tts", "mimo-v2.5-tts"),
        ("xiaomi-mimo voiceclone", "mimo-v2.5-tts-voiceclone"),
    ];
    for (command, expected_model) in cases {
        let outcome = provider(&mut app, Some(command));
        match outcome.action {
            Some(AppAction::SwitchProvider {
                provider: target,
                model,
            }) => {
                assert_eq!(target, ApiProvider::XiaomiMimo);
                assert_eq!(model.as_deref(), Some(expected_model));
            }
            other => panic!("expected SwitchProvider, got {other:?}"),
        }
    }
}
#[test]
fn switch_to_atlascloud_emits_action() {
let mut app = create_test_app();
+128 -2
View File
@@ -78,6 +78,10 @@ pub const RECENT_OPENROUTER_LARGE_MODELS: &[&str] = &[
pub const DEFAULT_OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
pub const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
pub const DEFAULT_XIAOMI_MIMO_BASE_URL: &str = "https://api.xiaomimimo.com/v1";
pub const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
pub const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
pub const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
pub const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
pub const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
pub const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
pub const DEFAULT_NOVITA_BASE_URL: &str = "https://api.novita.ai/v1";
@@ -538,6 +542,38 @@ fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
}
}
fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
let normalized = model.trim().to_ascii_lowercase();
let normalized = normalized.replace(['_', ' '], "-");
match normalized.as_str() {
"mimo"
| DEFAULT_XIAOMI_MIMO_MODEL
| "mimo-v2-5-pro"
| "xiaomi-mimo-v2.5-pro"
| "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
"mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
Some("mimo-v2.5")
}
"mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
Some(XIAOMI_MIMO_TTS_MODEL)
}
"mimo-tts-voicedesign"
| "mimo-voice-design"
| "mimo-v25-tts-voicedesign"
| "mimo-v2.5-tts-voicedesign"
| "voicedesign"
| "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
"mimo-tts-voiceclone"
| "mimo-voice-clone"
| "mimo-v25-tts-voiceclone"
| "mimo-v2.5-tts-voiceclone"
| "voiceclone"
| "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
"mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
_ => None,
}
}
/// Normalize a model selected through the TUI for the active provider.
///
/// Official DeepSeek endpoints require bare model IDs. Provider-prefixed
@@ -556,6 +592,12 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
return Some(canonical.to_string());
}
if matches!(provider, ApiProvider::XiaomiMimo)
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
{
return Some(canonical.to_string());
}
let normalized = normalize_model_name(model)?;
if matches!(provider, ApiProvider::Deepseek | ApiProvider::DeepseekCN)
&& let Some(canonical) = canonical_official_deepseek_model_id(&normalized)
@@ -585,7 +627,14 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
#[must_use]
pub fn wire_model_for_provider(provider: ApiProvider, model: &str) -> String {
let trimmed = model.trim();
if trimmed.is_empty() || provider_passes_model_through(provider) {
if trimmed.is_empty() {
return trimmed.to_string();
}
if matches!(provider, ApiProvider::XiaomiMimo) {
return normalize_model_name_for_provider(provider, trimmed)
.unwrap_or_else(|| trimmed.to_string());
}
if provider_passes_model_through(provider) {
return trimmed.to_string();
}
normalize_model_name_for_provider(provider, trimmed).unwrap_or_else(|| trimmed.to_string())
@@ -601,7 +650,14 @@ pub fn model_completion_names_for_provider(provider: ApiProvider) -> Vec<&'stati
models.extend_from_slice(RECENT_OPENROUTER_LARGE_MODELS);
models
}
ApiProvider::XiaomiMimo => vec![DEFAULT_XIAOMI_MIMO_MODEL, "mimo-v2.5"],
ApiProvider::XiaomiMimo => vec![
DEFAULT_XIAOMI_MIMO_MODEL,
"mimo-v2.5",
XIAOMI_MIMO_TTS_MODEL,
XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL,
XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL,
XIAOMI_MIMO_V2_TTS_MODEL,
],
ApiProvider::Novita => vec![DEFAULT_NOVITA_MODEL, DEFAULT_NOVITA_FLASH_MODEL],
ApiProvider::Fireworks => vec![DEFAULT_FIREWORKS_MODEL],
ApiProvider::Siliconflow => {
@@ -822,6 +878,15 @@ pub struct MemoryConfig {
pub enabled: Option<bool>,
}
/// Xiaomi MiMo speech/TTS output configuration.
///
/// Deserialized from the `[speech]` table of the config file.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct SpeechConfig {
/// Default directory for generated speech/TTS files when no explicit
/// output path is provided.
#[serde(default)]
pub output_dir: Option<String>,
}
impl SnapshotsConfig {
#[must_use]
pub fn max_age(&self) -> std::time::Duration {
@@ -1429,6 +1494,10 @@ pub struct Config {
#[serde(default)]
pub memory: Option<MemoryConfig>,
/// Xiaomi MiMo speech/TTS defaults.
#[serde(default)]
pub speech: Option<SpeechConfig>,
/// Tunables for `--model auto` (#1207). When absent, the auto router
/// keeps its existing balanced behaviour.
#[serde(default)]
@@ -2353,6 +2422,26 @@ impl Config {
.unwrap_or_else(|| PathBuf::from("./memory.md"))
}
/// Resolve the default speech/TTS output directory, if configured.
///
/// Precedence (first non-blank value wins):
/// 1. `XIAOMI_MIMO_SPEECH_OUTPUT_DIR`
/// 2. `MIMO_SPEECH_OUTPUT_DIR`
/// 3. `XIAOMIMIMO_SPEECH_OUTPUT_DIR`
/// 4. `[speech].output_dir` from the config file
///
/// An environment variable that is set but empty (or whitespace only) is
/// skipped, so it no longer hides a lower-priority variable. The chosen
/// value is passed through `expand_path`. Returns `None` when nothing is
/// configured.
#[must_use]
pub fn speech_output_dir(&self) -> Option<PathBuf> {
    const ENV_KEYS: [&str; 3] = [
        "XIAOMI_MIMO_SPEECH_OUTPUT_DIR",
        "MIMO_SPEECH_OUTPUT_DIR",
        "XIAOMIMIMO_SPEECH_OUTPUT_DIR",
    ];

    let from_env = ENV_KEYS.iter().find_map(|key| {
        let raw = std::env::var(key).ok()?;
        let trimmed = raw.trim();
        (!trimmed.is_empty()).then(|| expand_path(trimmed))
    });

    from_env.or_else(|| {
        self.speech
            .as_ref()
            .and_then(|speech| speech.output_dir.as_deref())
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(expand_path)
    })
}
/// Resolve the configured `instructions = [...]` array (#454)
/// to absolute paths, in declared order. Empty when unset or
/// when every entry is empty after trimming. Each entry runs
@@ -3540,6 +3629,11 @@ fn normalize_model_config(config: &mut Config) {
}
fn normalize_model_for_provider(provider: ApiProvider, model: &str) -> Option<String> {
if matches!(provider, ApiProvider::XiaomiMimo)
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
{
return Some(canonical.to_string());
}
if provider_passes_model_through(provider) {
return None;
}
@@ -3788,6 +3882,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config {
snapshots: override_cfg.snapshots.or(base.snapshots),
search: override_cfg.search.or(base.search),
memory: override_cfg.memory.or(base.memory),
speech: override_cfg.speech.or(base.speech),
auto: override_cfg.auto.or(base.auto),
update: override_cfg.update.or(base.update),
lsp: override_cfg.lsp.or(base.lsp),
@@ -6510,6 +6605,37 @@ api_key = "old-openrouter-key"
}
}
/// TTS shorthands normalize to canonical IDs, both through
/// `normalize_model_name_for_provider` and `wire_model_for_provider`.
#[test]
fn normalize_xiaomi_mimo_tts_aliases_for_provider() {
    for (alias, expected) in [
        ("tts", "mimo-v2.5-tts"),
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
    ] {
        let normalized = normalize_model_name_for_provider(ApiProvider::XiaomiMimo, alias);
        assert_eq!(normalized.as_deref(), Some(expected));
    }

    let wire = wire_model_for_provider(ApiProvider::XiaomiMimo, "voiceclone");
    assert_eq!(wire, "mimo-v2.5-tts-voiceclone");
}
/// Completion candidates for Xiaomi MiMo must list both chat and TTS models.
#[test]
fn model_completion_names_for_xiaomi_mimo_include_tts_models() {
    let completions = model_completion_names_for_provider(ApiProvider::XiaomiMimo);
    let required = [
        "mimo-v2.5-pro",
        "mimo-v2.5",
        "mimo-v2.5-tts",
        "mimo-v2.5-tts-voicedesign",
        "mimo-v2.5-tts-voiceclone",
        "mimo-v2-tts",
    ];
    required.iter().for_each(|expected| {
        assert!(completions.contains(expected), "missing {expected}");
    });
}
#[test]
fn model_completion_names_for_deepseek_api_are_deduplicated_bare_ids() {
assert_eq!(
+3
View File
@@ -161,6 +161,8 @@ pub struct EngineConfig {
/// Path to the user memory file (#489). Always populated; only
/// consulted when `memory_enabled` is `true`.
pub memory_path: PathBuf,
/// Default directory for Xiaomi MiMo speech/TTS tool outputs.
pub speech_output_dir: Option<PathBuf>,
pub vision_config: Option<crate::config::VisionModelConfig>,
pub goal_objective: Option<String>,
/// Tool restriction from custom slash command frontmatter.
@@ -236,6 +238,7 @@ impl Default for EngineConfig {
subagent_model_overrides: HashMap::new(),
memory_enabled: false,
memory_path: PathBuf::from("./memory.md"),
speech_output_dir: None,
vision_config: None,
strict_tool_mode: false,
goal_objective: None,
+5 -1
View File
@@ -78,7 +78,11 @@ impl Engine {
if mode != AppMode::Plan {
builder = builder
.with_rlm_tool(self.deepseek_client.clone(), self.session.model.clone())
.with_fim_tool(self.deepseek_client.clone(), self.session.model.clone());
.with_fim_tool(self.deepseek_client.clone(), self.session.model.clone())
.with_speech_tools(
self.deepseek_client.clone(),
self.config.speech_output_dir.clone(),
);
}
if self.config.features.enabled(Feature::ApplyPatch) && mode != AppMode::Plan {
+305
View File
@@ -6,6 +6,7 @@ use std::process::{Command, Stdio};
use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use base64::{Engine as _, engine::general_purpose};
use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
use clap_complete::{Shell, generate};
use dotenvy::dotenv;
@@ -225,6 +226,9 @@ enum Commands {
Logout,
/// List available models from the configured API endpoint
Models(ModelsArgs),
/// Generate speech audio with Xiaomi MiMo TTS models
#[command(visible_alias = "tts")]
Speech(SpeechArgs),
/// Run a non-interactive prompt. Use --auto for tool-backed agent mode.
Exec(ExecArgs),
/// Generate SWE-bench prediction rows from CodeWhale runs
@@ -531,6 +535,50 @@ struct ModelsArgs {
json: bool,
}
// Arguments for the `speech` (alias `tts`) subcommand.
// NOTE: the `///` comments on the fields below are picked up by clap as
// `--help` text, so edit them as user-facing strings.
#[derive(Args, Debug, Clone)]
struct SpeechArgs {
/// Text to synthesize. This is sent as the assistant message content.
#[arg(value_name = "TEXT")]
text: String,
/// Output audio path. Defaults to speech.<format> in --output-dir,
/// [speech].output_dir, or the current directory.
#[arg(short, long, value_name = "FILE")]
output: Option<PathBuf>,
/// Directory for the default speech.<format> output file when -o/--output is omitted.
#[arg(long = "output-dir", value_name = "DIR")]
output_dir: Option<PathBuf>,
/// TTS model. Defaults to built-in voices, or is inferred from --voice-prompt/--clone-voice.
#[arg(long)]
model: Option<String>,
/// Built-in voice ID, or a data:audio/...;base64,... URI for voice clone.
#[arg(long)]
voice: Option<String>,
/// Natural language style instruction; not spoken verbatim.
#[arg(long)]
instruction: Option<String>,
/// Voice design prompt. Implies mimo-v2.5-tts-voicedesign when --model is omitted.
#[arg(long = "voice-prompt")]
voice_prompt: Option<String>,
/// MP3/WAV sample used for voice cloning. Implies mimo-v2.5-tts-voiceclone when --model is omitted.
#[arg(long = "clone-voice", value_name = "FILE")]
clone_voice: Option<PathBuf>,
/// Output audio format requested from the API
// Validated in `run_speech`: wav, mp3, pcm16 (with `pcm` accepted as pcm16).
#[arg(long, default_value = "wav")]
format: String,
/// Emit machine-readable JSON output
#[arg(long, default_value_t = false)]
json: bool,
}
#[derive(Args, Debug, Default, Clone)]
struct FeatureToggles {
/// Enable a feature (repeatable). Equivalent to `features.<name>=true`.
@@ -896,6 +944,10 @@ async fn main() -> Result<()> {
let config = load_config_from_cli(&cli)?;
run_models(&config, args).await
}
Commands::Speech(args) => {
let config = load_config_from_cli(&cli)?;
run_speech(&config, args).await
}
Commands::Exec(args) => {
let config = load_config_from_cli(&cli)?;
let workspace = cli.workspace.clone().unwrap_or_else(|| {
@@ -3514,6 +3566,258 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> {
Ok(())
}
/// Handle the `speech` subcommand: validate arguments, pick the TTS model,
/// call the Xiaomi MiMo speech API, and write the audio to disk.
///
/// Validation runs before any network call, in this order: provider, text,
/// `--clone-voice`/`--voice` conflict, model (must contain `tts`),
/// voice-design prompt, voice selection, output format. The order determines
/// which error the user sees first.
///
/// On success prints either a one-line summary or, with `--json`, a JSON
/// object describing the generated file.
async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> {
use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
use crate::config::{ApiProvider, normalize_model_name_for_provider};
let SpeechArgs {
text,
output,
output_dir,
model,
voice,
instruction,
voice_prompt,
clone_voice,
format,
json: json_output,
} = args;
if config.api_provider() != ApiProvider::XiaomiMimo {
bail!(
"`speech` requires provider = \"xiaomi-mimo\" (current: {}). Run with `--provider xiaomi-mimo` or set it in config.",
config.api_provider().as_str()
);
}
if text.trim().is_empty() {
bail!("Speech text cannot be empty");
}
// A `--voice` value that is a data URI is treated as voice-clone sample data.
let voice_is_data_uri = voice
.as_deref()
.map(str::trim)
.is_some_and(|value| value.starts_with("data:audio/"));
if clone_voice.is_some() && voice.is_some() {
bail!("Use either --clone-voice or --voice for cloned voice data, not both");
}
// Explicit --model wins (aliases are normalized); otherwise infer the model
// from which voice-related flags were supplied.
let model = match model {
Some(value) => {
normalize_model_name_for_provider(ApiProvider::XiaomiMimo, &value).unwrap_or(value)
}
None => {
if clone_voice.is_some() || voice_is_data_uri {
"mimo-v2.5-tts-voiceclone".to_string()
} else if voice_prompt.is_some() {
"mimo-v2.5-tts-voicedesign".to_string()
} else {
"mimo-v2.5-tts".to_string()
}
}
};
let model_lower = model.to_ascii_lowercase();
// Reject chat/reasoning models early: only model IDs containing "tts" are accepted.
if !model_lower.contains("tts") {
bail!(
"speech requires a TTS model (examples: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone); got {model}"
);
}
let is_voice_design = model_lower.contains("voicedesign");
let is_voice_clone = model_lower.contains("voiceclone");
// --voice-prompt and --instruction are merged into a single instruction string.
let instruction = combine_speech_instructions(instruction, voice_prompt);
if is_voice_design
&& instruction
.as_deref()
.is_none_or(|value| value.trim().is_empty())
{
bail!(
"mimo-v2.5-tts-voicedesign requires --voice-prompt or --instruction to describe the voice"
);
}
// Voice selection: a clone sample file is encoded to a data URI; voice-design
// models send no voice (any --voice value is dropped); otherwise use --voice,
// and fall back to "mimo_default" for non-clone models.
let voice = if let Some(clone_path) = clone_voice {
Some(encode_voice_clone_data_uri(&clone_path)?)
} else if is_voice_design {
None
} else if let Some(value) = voice.filter(|value| !value.trim().is_empty()) {
Some(value)
} else if is_voice_clone {
bail!("mimo-v2.5-tts-voiceclone requires --clone-voice <mp3|wav> or --voice <data-uri>");
} else {
Some("mimo_default".to_string())
};
// The error message below still sees the raw user-supplied `format`; the
// shadowing binding only takes effect after this statement.
let format = normalize_speech_format(&format).with_context(|| {
format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)")
})?;
// Output path precedence: -o/--output, then --output-dir, then the configured
// speech output directory, then the current directory.
let output = resolve_speech_output_path(
output,
output_dir.or_else(|| config.speech_output_dir()),
&format,
);
let client = DeepSeekClient::new(config)?;
let response = client
.synthesize_speech(SpeechSynthesisRequest {
model: model.clone(),
text,
instruction,
audio_format: format.clone(),
voice,
})
.await?;
// Create the destination directory on demand; a bare file name has an empty
// parent and needs no directory creation.
if let Some(parent) = output.parent().filter(|path| !path.as_os_str().is_empty()) {
std::fs::create_dir_all(parent)
.with_context(|| format!("Failed to create output directory {}", parent.display()))?;
}
std::fs::write(&output, &response.audio_bytes)
.with_context(|| format!("Failed to write audio file {}", output.display()))?;
if json_output {
println!(
"{}",
serde_json::to_string_pretty(&serde_json::json!({
"mode": "speech",
"success": true,
"model": response.model,
"format": response.audio_format,
"output": output.display().to_string(),
"bytes": response.audio_bytes.len(),
"voice": response.voice.as_deref().map(describe_speech_voice),
"transcript": response.transcript,
}))?
);
} else {
println!(
"Generated speech: {} ({} bytes, model: {}, format: {})",
output.display(),
response.audio_bytes.len(),
response.model,
response.audio_format
);
}
Ok(())
}
/// Merge the optional voice-design prompt and style instruction into one
/// instruction string.
///
/// Each part is trimmed and dropped when blank. When both survive, the voice
/// prompt comes first and the instruction follows after a blank line. Returns
/// `None` when nothing usable remains.
fn combine_speech_instructions(
    instruction: Option<String>,
    voice_prompt: Option<String>,
) -> Option<String> {
    // Normalise one part: trim it and treat an all-whitespace value as absent.
    let clean = |raw: Option<String>| {
        raw.map(|text| text.trim().to_string())
            .filter(|text| !text.is_empty())
    };
    match (clean(voice_prompt), clean(instruction)) {
        (Some(design), Some(style)) => Some(format!("{design}\n\n{style}")),
        (design, style) => design.or(style),
    }
}
/// Largest accepted voice-clone sample, measured on its base64-encoded form
/// (10 * 1024 * 1024 bytes). Enforced by `encode_voice_clone_data_uri`.
const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
/// Canonicalise a user-supplied audio format name.
///
/// Matching ignores surrounding whitespace and ASCII case. `wav`, `mp3` and
/// `pcm16` are returned in lowercase, `pcm` is accepted as an alias for
/// `pcm16`, and anything else yields `None`.
fn normalize_speech_format(format: &str) -> Option<String> {
    let lowered = format.trim().to_ascii_lowercase();
    if lowered == "pcm" {
        return Some(String::from("pcm16"));
    }
    let supported = matches!(lowered.as_str(), "wav" | "mp3" | "pcm16");
    supported.then_some(lowered)
}
/// Build the default output file name, `speech.<ext>`.
///
/// The extension is the canonical form of `format`; an unrecognised format
/// falls back to `wav`.
fn default_speech_output_name(format: &str) -> String {
    let extension = normalize_speech_format(format).unwrap_or_else(|| "wav".to_string());
    let mut name = String::from("speech.");
    name.push_str(&extension);
    name
}
/// Decide where the generated audio is written.
///
/// An explicit `output` path always wins. Otherwise the default file name for
/// `format` is placed inside `output_dir`, or used as a bare relative path
/// when no directory is given.
fn resolve_speech_output_path(
    output: Option<PathBuf>,
    output_dir: Option<PathBuf>,
    format: &str,
) -> PathBuf {
    if let Some(explicit) = output {
        return explicit;
    }
    let file_name = default_speech_output_name(format);
    match output_dir {
        Some(dir) => dir.join(file_name),
        // Joining onto an empty path yields just the file name.
        None => PathBuf::new().join(file_name),
    }
}
fn encode_voice_clone_data_uri(path: &Path) -> Result<String> {
let bytes = std::fs::read(path)
.with_context(|| format!("Failed to read voice clone sample {}", path.display()))?;
let base64_audio = general_purpose::STANDARD.encode(bytes);
if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES {
bail!(
"Voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
base64_audio.len()
);
}
let extension = path
.extension()
.and_then(|value| value.to_str())
.unwrap_or_default()
.to_ascii_lowercase();
let mime = match extension.as_str() {
"mp3" => "audio/mpeg",
"wav" => "audio/wav",
other => bail!(
"Unsupported voice clone sample extension '{}'. Use .mp3 or .wav.",
other
),
};
Ok(format!("data:{mime};base64,{base64_audio}"))
}
/// Produce a printable label for the voice used in a request.
///
/// Data-URI voices (anything starting with `data:`) are replaced by a fixed
/// placeholder so the embedded payload is never echoed; other values are
/// returned unchanged.
fn describe_speech_voice(voice: &str) -> String {
    match voice.strip_prefix("data:") {
        Some(_) => String::from("embedded voice clone sample"),
        None => voice.to_owned(),
    }
}
/// Unit tests for the CLI speech helpers (format normalisation and default
/// output path selection).
#[cfg(test)]
mod speech_cli_tests {
    use super::*;

    /// Format names are case-insensitive, `pcm` aliases `pcm16`, and unknown
    /// formats are rejected.
    #[test]
    fn normalizes_documented_speech_formats() {
        assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav"));
        assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16"));
        assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16"));
        assert_eq!(normalize_speech_format("flac"), None);
    }

    /// The default file name follows the requested format, is placed in the
    /// output directory when one is given, and is ignored entirely when an
    /// explicit output path is supplied.
    #[test]
    fn default_speech_output_tracks_requested_format() {
        assert_eq!(
            resolve_speech_output_path(None, None, "mp3"),
            PathBuf::from("speech.mp3")
        );
        assert_eq!(
            resolve_speech_output_path(None, Some(PathBuf::from("audio")), "pcm"),
            PathBuf::from("audio").join("speech.pcm16")
        );
        assert_eq!(
            resolve_speech_output_path(Some(PathBuf::from("custom.wav")), None, "mp3"),
            PathBuf::from("custom.wav")
        );
    }
}
/// Test API connectivity by making a minimal request
async fn test_api_connectivity(config: &Config) -> Result<()> {
use crate::client::DeepSeekClient;
@@ -5462,6 +5766,7 @@ async fn run_exec_agent(
prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
memory_enabled: config.memory_enabled(),
memory_path: config.memory_path(),
speech_output_dir: config.speech_output_dir(),
vision_config: config.vision_model_config(),
strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
goal_objective: None,
+1
View File
@@ -2017,6 +2017,7 @@ impl RuntimeThreadManager {
prefer_bwrap: self.config.prefer_bwrap.unwrap_or(false),
memory_enabled: self.config.memory_enabled(),
memory_path: self.config.memory_path(),
speech_output_dir: self.config.speech_output_dir(),
vision_config: self.config.vision_model_config(),
strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false),
goal_objective: None,
+1
View File
@@ -48,6 +48,7 @@ pub mod shell;
mod shell_output;
pub mod skill;
pub mod spec;
pub mod speech;
pub mod subagent;
pub mod tasks;
pub mod test_runner;
+31 -1
View File
@@ -9,7 +9,7 @@
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::path::Path;
use std::path::{Path, PathBuf};
use serde_json::Value;
@@ -776,6 +776,22 @@ impl ToolRegistryBuilder {
self.with_tool(Arc::new(RevertTurnTool))
}
/// Register the Xiaomi MiMo speech tool under both of its names, `speech`
/// and `tts`.
///
/// Both registrations receive the same optional API client and the same
/// optional default output directory.
#[must_use]
pub fn with_speech_tools(
    self,
    client: Option<DeepSeekClient>,
    output_dir: Option<PathBuf>,
) -> Self {
    use super::speech::SpeechTool;

    // The first instance gets clones so the second can take ownership.
    let speech_tool = SpeechTool::new("speech", client.clone(), output_dir.clone());
    let tts_tool = SpeechTool::new("tts", client, output_dir);
    self.with_tool(Arc::new(speech_tool))
        .with_tool(Arc::new(tts_tool))
}
/// Include persistent RLM session tools.
#[must_use]
pub fn with_rlm_tool(self, client: Option<DeepSeekClient>, _root_model: String) -> Self {
@@ -958,11 +974,13 @@ impl ToolRegistryBuilder {
todo_list: super::todo::SharedTodoList,
plan_state: super::plan::SharedPlanState,
) -> Self {
let speech_client = client.clone();
self.with_agent_tools(allow_shell)
.with_todo_tool(todo_list)
.with_plan_tool(plan_state)
.with_review_tool(client.clone(), model.clone())
.with_rlm_tool(client, model)
.with_speech_tools(speech_client, None)
.with_recall_archive_tool()
.with_subagent_tools(manager, runtime)
}
@@ -1218,6 +1236,18 @@ mod tests {
assert!(registry.contains("list_dir"));
}
/// `with_speech_tools` registers the tool under both `speech` and `tts`,
/// even when no client or output directory is supplied.
#[test]
fn builder_registers_speech_alias_tools() {
    let tmp = tempdir().expect("tempdir");
    let ctx = ToolContext::new(tmp.path().to_path_buf());
    let registry = ToolRegistryBuilder::new()
        .with_speech_tools(None, None)
        .build(ctx);
    assert!(registry.contains("speech"));
    assert!(registry.contains("tts"));
}
#[test]
fn test_registry_names() {
let tmp = tempdir().expect("tempdir");
+528
View File
@@ -0,0 +1,528 @@
//! Model-visible Xiaomi MiMo speech/TTS generation tool.
//!
//! This mirrors the CLI `speech` / `tts` command as a first-class API tool so
//! the TUI model can generate narrated audio without shelling out to a nested
//! CodeWhale process.
use std::path::{Path, PathBuf};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use serde_json::{Value, json};
use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
use crate::config::{ApiProvider, normalize_model_name_for_provider};
use crate::network_policy::{Decision, host_from_url};
use super::spec::{
ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
optional_bool, optional_str, required_str,
};
/// Audio format used when the call omits `format` (or supplies a blank one).
const DEFAULT_FORMAT: &str = "wav";
/// Voice sent when the call names no voice and the model is neither a
/// voice-design nor a voice-clone model.
const DEFAULT_VOICE: &str = "mimo_default";
/// Largest accepted voice-clone sample, measured on its base64-encoded form
/// (10 * 1024 * 1024 bytes).
const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
/// Canonical format names; advertised as the `format` enum in the input
/// schema, listed in the unsupported-format error, and echoed in the result.
const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"];
/// Model IDs reported back to the caller as `supported_xiaomi_mimo_models`
/// in the tool result.
pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[
    "mimo-v2.5-pro",
    "mimo-v2.5",
    "mimo-v2.5-tts-voiceclone",
    "mimo-v2.5-tts-voicedesign",
    "mimo-v2.5-tts",
    "mimo-v2-pro",
    "mimo-v2-omni",
    "mimo-v2-tts",
];
/// TTS model IDs advertised as the `model` enum in the input schema and
/// quoted in the "requires a TTS model" error.
const SPEECH_MODEL_EXAMPLES: &[&str] = &[
    "mimo-v2.5-tts",
    "mimo-v2.5-tts-voicedesign",
    "mimo-v2.5-tts-voiceclone",
    "mimo-v2-tts",
];
/// Model-visible tool that synthesizes speech and writes the audio to a file.
pub struct SpeechTool {
    /// Name the tool is registered under; returned by `ToolSpec::name`.
    name: &'static str,
    /// API client used for synthesis. When `None`, `execute` reports the tool
    /// as not available.
    client: Option<DeepSeekClient>,
    /// Configured directory for the default output file, consulted only when
    /// the call supplies neither `output` nor `output_dir`.
    output_dir: Option<PathBuf>,
}
impl SpeechTool {
    /// Create a speech tool registered under `name`.
    ///
    /// `client` is the API client used for synthesis (the tool is unusable
    /// without one) and `output_dir` is the configured fallback directory for
    /// the default output file.
    #[must_use]
    pub fn new(
        name: &'static str,
        client: Option<DeepSeekClient>,
        output_dir: Option<PathBuf>,
    ) -> Self {
        Self {
            name,
            client,
            output_dir,
        }
    }
}
#[async_trait]
impl ToolSpec for SpeechTool {
    /// Name this instance was registered under.
    fn name(&self) -> &str {
        self.name
    }

    /// Model-facing description of what the tool does and when to use it.
    fn description(&self) -> &str {
        "Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning."
    }

    /// JSON Schema for the tool arguments. Only `text` is required.
    fn input_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here."
                },
                "output": {
                    "type": "string",
                    "description": "Audio file path to write, relative to the workspace unless absolute. Default: speech.<format> in output_dir, configured [speech].output_dir, or the workspace."
                },
                "output_dir": {
                    "type": "string",
                    "description": "Directory for the default speech.<format> output file when output is omitted. Relative paths stay inside the workspace."
                },
                "model": {
                    "type": "string",
                    "description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.",
                    "enum": SPEECH_MODEL_EXAMPLES
                },
                "voice": {
                    "type": "string",
                    "description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone."
                },
                "instruction": {
                    "type": "string",
                    "description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim."
                },
                "voice_prompt": {
                    "type": "string",
                    "description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign."
                },
                "clone_voice": {
                    "type": "string",
                    "description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone."
                },
                "format": {
                    "type": "string",
                    "description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.",
                    "enum": SUPPORTED_SPEECH_FORMATS
                },
                "stream": {
                    "type": "boolean",
                    "description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false."
                }
            },
            "required": ["text"]
        })
    }

    /// The tool writes an audio file and performs a network request.
    fn capabilities(&self) -> Vec<ToolCapability> {
        vec![
            ToolCapability::WritesFiles,
            ToolCapability::Network,
            ToolCapability::Sandboxable,
        ]
    }

    /// Runs without an approval prompt.
    fn approval_requirement(&self) -> ApprovalRequirement {
        // Speech generation is an explicit user-facing generation action.
        // Path resolution still enforces workspace/trusted-root boundaries.
        ApprovalRequirement::Auto
    }

    /// Validate the arguments, synthesize the audio, write it to disk, and
    /// return a JSON summary of what was produced.
    ///
    /// All argument validation and the network-policy check happen before the
    /// synthesis request is sent; the output file is written only after the
    /// request succeeds.
    async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
        let text = required_str(&input, "text")?.trim().to_string();
        if text.is_empty() {
            return Err(ToolError::invalid_input("speech text cannot be empty"));
        }
        // Without a client there is nothing to call; report the tool as unavailable.
        let client = self.client.clone().ok_or_else(|| {
            ToolError::not_available(
                "speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first",
            )
        })?;
        // A missing or blank `format` falls back to the default before normalisation.
        let requested_format_raw = optional_str(&input, "format")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .unwrap_or(DEFAULT_FORMAT);
        let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| {
            ToolError::invalid_input(format!(
                "unsupported speech format '{requested_format_raw}' (allowed: {})",
                SUPPORTED_SPEECH_FORMATS.join(", ")
            ))
        })?;
        // Streaming is declared in the schema but rejected here.
        if optional_bool(&input, "stream", false) {
            return Err(ToolError::invalid_input(
                "stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file",
            ));
        }
        let output_raw = optional_str(&input, "output")
            .map(str::trim)
            .filter(|value| !value.is_empty());
        let output_path = resolve_speech_output_path(
            &input,
            context,
            output_raw,
            &requested_format,
            self.output_dir.as_ref(),
        )?;
        // Report the path as the caller wrote it when one was given; otherwise
        // show the resolved default.
        let output_label = output_raw
            .map(str::to_string)
            .unwrap_or_else(|| output_path.display().to_string());
        // Optional string arguments: trimmed, with blank values treated as absent.
        let raw_voice = optional_str(&input, "voice")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(str::to_string);
        let raw_instruction = optional_str(&input, "instruction")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(str::to_string);
        let voice_prompt = optional_str(&input, "voice_prompt")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(str::to_string);
        let clone_voice = optional_str(&input, "clone_voice")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .map(str::to_string);
        // A `voice` that is an audio data URI counts as a clone sample for
        // model inference below.
        let voice_is_data_uri = raw_voice
            .as_deref()
            .is_some_and(|value| value.starts_with("data:audio/"));
        if clone_voice.is_some() && raw_voice.is_some() {
            return Err(ToolError::invalid_input(
                "use either clone_voice or voice for cloned voice data, not both",
            ));
        }
        let model = infer_speech_model(
            optional_str(&input, "model"),
            clone_voice.is_some() || voice_is_data_uri,
            voice_prompt.is_some(),
        );
        // Model capabilities are detected from substrings of the model ID.
        let model_lower = model.to_ascii_lowercase();
        if !model_lower.contains("tts") {
            return Err(ToolError::invalid_input(format!(
                "speech tool requires a TTS model (examples: {}), got '{model}'",
                SPEECH_MODEL_EXAMPLES.join(", ")
            )));
        }
        let is_voice_design = model_lower.contains("voicedesign");
        let is_voice_clone = model_lower.contains("voiceclone");
        let instruction = combine_speech_instructions(raw_instruction, voice_prompt);
        if is_voice_design
            && instruction
                .as_deref()
                .is_none_or(|value| value.trim().is_empty())
        {
            return Err(ToolError::invalid_input(
                "mimo-v2.5-tts-voicedesign requires voice_prompt or instruction",
            ));
        }
        // Voice selection, in priority order: an encoded clone sample, no voice
        // at all for voice-design models, the caller's explicit voice, an error
        // for voice-clone models lacking a sample, and finally the default voice.
        let voice = if let Some(clone_path) = clone_voice {
            let clone_path = context.resolve_path(&clone_path)?;
            Some(encode_voice_clone_data_uri(&clone_path).await?)
        } else if is_voice_design {
            None
        } else if let Some(value) = raw_voice {
            Some(value)
        } else if is_voice_clone {
            return Err(ToolError::invalid_input(
                "mimo-v2.5-tts-voiceclone requires clone_voice <mp3|wav> or voice <data-uri>",
            ));
        } else {
            Some(DEFAULT_VOICE.to_string())
        };
        // Consult the network policy before any request leaves the process.
        check_network_policy(context, client.base_url())?;
        let response = client
            .synthesize_speech(SpeechSynthesisRequest {
                model: model.clone(),
                text,
                instruction,
                audio_format: requested_format,
                voice,
            })
            .await
            .map_err(|err| {
                ToolError::execution_failed(format!("speech synthesis failed: {err}"))
            })?;
        // Create the parent directory when the output path has a non-empty one.
        if let Some(parent) = output_path
            .parent()
            .filter(|path| !path.as_os_str().is_empty())
        {
            tokio::fs::create_dir_all(parent).await.map_err(|err| {
                ToolError::execution_failed(format!(
                    "failed to create output directory {}: {err}",
                    parent.display()
                ))
            })?;
        }
        tokio::fs::write(&output_path, &response.audio_bytes)
            .await
            .map_err(|err| {
                ToolError::execution_failed(format!(
                    "failed to write audio file {}: {err}",
                    output_path.display()
                ))
            })?;
        // Summary returned to the model; data-URI voices are replaced by a
        // placeholder via `describe_speech_voice`.
        let result = json!({
            "mode": "speech",
            "success": true,
            "api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis",
            "base_url": openai_compatible_base_url(client.base_url()),
            "model": response.model,
            "format": response.audio_format,
            "stream": false,
            "output": output_label,
            "absolute_output": output_path.display().to_string(),
            "bytes": response.audio_bytes.len(),
            "voice": response.voice.as_deref().map(describe_speech_voice),
            "transcript": response.transcript,
            "supported_formats": SUPPORTED_SPEECH_FORMATS,
            "supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS,
        });
        ToolResult::json(&result).map_err(|err| {
            ToolError::execution_failed(format!("failed to serialize result: {err}"))
        })
    }
}
/// Choose the TTS model for a request.
///
/// A non-blank explicit `model` is passed through the Xiaomi MiMo model-name
/// normaliser, falling back to the trimmed input when the normaliser returns
/// nothing. With no explicit model, a clone sample selects the voice-clone
/// model, otherwise a voice prompt selects the voice-design model, otherwise
/// the plain TTS model is used.
fn infer_speech_model(
    model: Option<&str>,
    has_clone_voice: bool,
    has_voice_prompt: bool,
) -> String {
    let explicit = model.map(str::trim).filter(|name| !name.is_empty());
    if let Some(name) = explicit {
        return normalize_model_name_for_provider(ApiProvider::XiaomiMimo, name)
            .unwrap_or_else(|| name.into());
    }

    // Cloning takes precedence over voice design when both are requested.
    let inferred = if has_clone_voice {
        "mimo-v2.5-tts-voiceclone"
    } else if has_voice_prompt {
        "mimo-v2.5-tts-voicedesign"
    } else {
        "mimo-v2.5-tts"
    };
    inferred.to_string()
}
/// Merge the optional voice-design prompt and style instruction into one
/// instruction string.
///
/// Blank or whitespace-only parts are discarded. The voice prompt precedes
/// the instruction, separated by a blank line; `None` is returned when no
/// part remains.
fn combine_speech_instructions(
    instruction: Option<String>,
    voice_prompt: Option<String>,
) -> Option<String> {
    // Voice prompt first, then instruction; absent or blank parts drop out.
    let parts: Vec<String> = voice_prompt
        .into_iter()
        .chain(instruction)
        .map(|part| part.trim().to_string())
        .filter(|part| !part.is_empty())
        .collect();
    if parts.is_empty() {
        None
    } else {
        Some(parts.join("\n\n"))
    }
}
/// Canonicalise an audio format name.
///
/// Surrounding whitespace and ASCII case are ignored. Returns `wav`, `mp3`
/// or `pcm16` (with `pcm` accepted as an alias for `pcm16`), or `None` for
/// any other input.
fn normalize_speech_format(format: &str) -> Option<String> {
    let candidate = format.trim();
    let canonical = if candidate.eq_ignore_ascii_case("wav") {
        "wav"
    } else if candidate.eq_ignore_ascii_case("mp3") {
        "mp3"
    } else if candidate.eq_ignore_ascii_case("pcm16") || candidate.eq_ignore_ascii_case("pcm") {
        "pcm16"
    } else {
        return None;
    };
    Some(canonical.to_string())
}
/// Build the default output file name, `speech.<ext>`.
///
/// The extension is the canonical form of `format`; an unrecognised format
/// falls back to `DEFAULT_FORMAT`.
fn default_speech_output_name(format: &str) -> String {
    let extension = match normalize_speech_format(format) {
        Some(canonical) => canonical,
        None => DEFAULT_FORMAT.to_owned(),
    };
    format!("speech.{extension}")
}
/// Decide where the tool writes the generated audio.
///
/// Resolution order:
/// 1. `output_raw` (the caller's explicit `output`), resolved through the
///    tool context.
/// 2. The call's `output_dir` argument, resolved through the tool context,
///    plus the default file name for `format`.
/// 3. `configured_output_dir` plus the default file name. A relative
///    configured directory is anchored to the workspace so the result does
///    not depend on the process working directory; an absolute one is used
///    as-is.
/// 4. The workspace root plus the default file name.
///
/// # Errors
///
/// Propagates the error from `ToolContext::resolve_path` when an explicit
/// `output` or `output_dir` is rejected.
fn resolve_speech_output_path(
    input: &Value,
    context: &ToolContext,
    output_raw: Option<&str>,
    format: &str,
    configured_output_dir: Option<&PathBuf>,
) -> Result<PathBuf, ToolError> {
    if let Some(output) = output_raw {
        return context.resolve_path(output);
    }
    let filename = default_speech_output_name(format);
    if let Some(output_dir) = optional_str(input, "output_dir")
        .map(str::trim)
        .filter(|value| !value.is_empty())
    {
        return Ok(context.resolve_path(output_dir)?.join(filename));
    }
    if let Some(output_dir) = configured_output_dir {
        // `Path::join` keeps an absolute argument unchanged and appends a
        // relative one, so this anchors only relative configured directories
        // to the workspace.
        return Ok(context.workspace.join(output_dir).join(filename));
    }
    Ok(context.workspace.join(filename))
}
/// Read a voice-clone sample and wrap it in a
/// `data:<mime>;base64,<payload>` URI.
///
/// Only `.mp3` and `.wav` samples are accepted (extension compared
/// case-insensitively).
///
/// # Errors
///
/// Returns an invalid-input error when the extension is unsupported or the
/// base64-encoded payload would exceed `VOICE_CLONE_BASE64_MAX_BYTES`, and an
/// execution error when the file cannot be read.
async fn encode_voice_clone_data_uri(path: &Path) -> Result<String, ToolError> {
    /// Reject payloads whose base64 form is larger than the accepted maximum.
    fn ensure_encoded_size_allowed(encoded_len: u64) -> Result<(), ToolError> {
        // Widening usize -> u64 is lossless on every supported target.
        if encoded_len > VOICE_CLONE_BASE64_MAX_BYTES as u64 {
            return Err(ToolError::invalid_input(format!(
                "voice clone sample is too large after base64 encoding ({encoded_len} bytes > 10 MB)"
            )));
        }
        Ok(())
    }

    // Validate the extension before touching the file contents so an
    // unsupported sample is rejected without reading or encoding it.
    let extension = path
        .extension()
        .and_then(|value| value.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();
    let mime = match extension.as_str() {
        "mp3" => "audio/mpeg",
        "wav" => "audio/wav",
        other => {
            return Err(ToolError::invalid_input(format!(
                "unsupported voice clone sample extension '{other}'. Use .mp3 or .wav."
            )));
        }
    };

    let read_failed = |err: std::io::Error| {
        ToolError::execution_failed(format!(
            "failed to read voice clone sample {}: {err}",
            path.display()
        ))
    };

    // Cheap pre-check from file metadata: padded base64 produces 4 output
    // bytes per started group of 3 input bytes, so an oversized sample (the
    // path is supplied by the model) is refused without loading it.
    let metadata = tokio::fs::metadata(path).await.map_err(&read_failed)?;
    ensure_encoded_size_allowed(metadata.len().div_ceil(3).saturating_mul(4))?;

    let bytes = tokio::fs::read(path).await.map_err(&read_failed)?;
    let base64_audio = general_purpose::STANDARD.encode(bytes);
    // Authoritative check on the real payload; the file may have changed
    // between the metadata lookup and the read.
    ensure_encoded_size_allowed(base64_audio.len() as u64)?;

    Ok(format!("data:{mime};base64,{base64_audio}"))
}
/// Produce a printable label for the voice used in a request.
///
/// Voices given as data URIs (anything starting with `data:`) are replaced by
/// a fixed placeholder so the embedded payload is not echoed in the result;
/// any other voice is returned unchanged.
fn describe_speech_voice(voice: &str) -> String {
    let is_embedded_sample = voice.as_bytes().starts_with(b"data:");
    let label = if is_embedded_sample {
        "embedded voice clone sample"
    } else {
        voice
    };
    label.to_owned()
}
/// Render the base URL with its API version segment.
///
/// Trailing slashes are removed. A URL that already ends in `/v1` or `/beta`
/// is returned as-is; otherwise `/v1` is appended.
fn openai_compatible_base_url(base_url: &str) -> String {
    let root = base_url.trim_end_matches('/');
    let already_versioned = ["/v1", "/beta"]
        .iter()
        .any(|suffix| root.ends_with(*suffix));
    let mut url = root.to_owned();
    if !already_versioned {
        url.push_str("/v1");
    }
    url
}
/// Ask the context's network policy whether the speech API host may be
/// contacted.
///
/// The call is allowed when the context has no network policy or when no
/// host can be extracted from the versioned base URL. Otherwise the policy
/// decides: `Allow` passes, while `Deny` and `Prompt` both produce a
/// permission-denied error (the `Prompt` message explains how to grant
/// access).
fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> {
    if let Some(decider) = context.network_policy.as_ref() {
        let display_url = openai_compatible_base_url(base_url);
        if let Some(host) = host_from_url(&display_url) {
            return match decider.evaluate(&host, "speech") {
                Decision::Allow => Ok(()),
                Decision::Deny => Err(ToolError::permission_denied(format!(
                    "speech network call to '{host}' blocked by network policy"
                ))),
                Decision::Prompt => Err(ToolError::permission_denied(format!(
                    "speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config"
                ))),
            };
        }
    }
    Ok(())
}
/// Unit tests for the speech tool's pure helpers and static metadata.
#[cfg(test)]
mod tests {
    use super::*;

    /// With no explicit model the mode flags pick the model; explicit names
    /// go through the provider's model-name normaliser.
    #[test]
    fn infers_speech_model_from_requested_mode() {
        assert_eq!(infer_speech_model(None, false, false), "mimo-v2.5-tts");
        assert_eq!(
            infer_speech_model(None, false, true),
            "mimo-v2.5-tts-voicedesign"
        );
        assert_eq!(
            infer_speech_model(None, true, false),
            "mimo-v2.5-tts-voiceclone"
        );
        assert_eq!(
            infer_speech_model(Some("mimo-tts"), false, false),
            "mimo-v2.5-tts"
        );
        assert_eq!(
            infer_speech_model(Some("mimo-v2-tts"), false, false),
            "mimo-v2-tts"
        );
    }

    /// The voice prompt is placed before the instruction, and a lone part is
    /// trimmed.
    #[test]
    fn combines_voice_prompt_before_instruction() {
        assert_eq!(
            combine_speech_instructions(
                Some("Speak warmly.".to_string()),
                Some("Young Chinese female voice".to_string())
            )
            .as_deref(),
            Some("Young Chinese female voice\n\nSpeak warmly.")
        );
        assert_eq!(
            combine_speech_instructions(Some(" calm ".to_string()), None).as_deref(),
            Some("calm")
        );
    }

    /// Format names are case-insensitive, `pcm` aliases `pcm16`, and unknown
    /// formats are rejected.
    #[test]
    fn normalizes_documented_speech_formats() {
        assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav"));
        assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16"));
        assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16"));
        assert_eq!(normalize_speech_format("flac"), None);
    }

    /// `/v1` is appended only when the base URL does not already carry it.
    #[test]
    fn displays_openai_compatible_base_url() {
        assert_eq!(
            openai_compatible_base_url("https://api.xiaomimimo.com"),
            "https://api.xiaomimimo.com/v1"
        );
        assert_eq!(
            openai_compatible_base_url("https://api.xiaomimimo.com/v1"),
            "https://api.xiaomimimo.com/v1"
        );
    }

    /// The tool is auto-approved, is not read-only, and its schema mentions
    /// the voice-clone model, the `pcm16` format, and the `stream` flag.
    #[test]
    fn speech_tool_is_auto_approved_but_not_read_only() {
        let tool = SpeechTool::new("speech", None, None);
        assert_eq!(tool.name(), "speech");
        assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto);
        assert!(!tool.is_read_only());
        let schema = tool.input_schema();
        assert!(schema.to_string().contains("mimo-v2.5-tts-voiceclone"));
        assert!(schema.to_string().contains("pcm16"));
        assert!(schema.to_string().contains("stream"));
    }
}
+3
View File
@@ -332,6 +332,9 @@ fn picker_model_hint(id: &str) -> &'static str {
}
"arcee-ai/trinity-large-thinking" => "large thinking",
"xiaomi/mimo-v2.5-pro" | "mimo-v2.5-pro" => "long context",
"mimo-v2.5-tts" | "mimo-v2-tts" => "speech / TTS",
"mimo-v2.5-tts-voicedesign" => "voice design",
"mimo-v2.5-tts-voiceclone" => "voice clone",
"minimax/minimax-m3" => "1M multimodal",
_ => "provider model",
}
+1
View File
@@ -781,6 +781,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
memory_enabled: config.memory_enabled(),
memory_path: config.memory_path(),
speech_output_dir: config.speech_output_dir(),
vision_config: config.vision_model_config(),
strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
goal_objective: app.hunt.quarry.clone(),
+7 -3
View File
@@ -118,7 +118,7 @@ endpoint.
| `wanjie-ark` | `[providers.wanjie_ark]` | `WANJIE_ARK_API_KEY`, `WANJIE_API_KEY`, `WANJIE_MAAS_API_KEY` | `WANJIE_ARK_BASE_URL`, `WANJIE_BASE_URL`, `WANJIE_MAAS_BASE_URL`; default `https://maas-openapi.wanjiedata.com/api/v1` | `deepseek-reasoner` | OpenAI-compatible hosted route. `WANJIE_ARK_MODEL`, `WANJIE_MODEL`, and `WANJIE_MAAS_MODEL` are accepted. |
| `volcengine` | `[providers.volcengine]` | `VOLCENGINE_API_KEY`, `VOLCENGINE_ARK_API_KEY`, `ARK_API_KEY` | `VOLCENGINE_BASE_URL`, `VOLCENGINE_ARK_BASE_URL`, `ARK_BASE_URL`; default `https://ark.cn-beijing.volces.com/api/coding/v3` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | Volcengine/Volcano Engine Ark OpenAI-compatible coding endpoint. `VOLCENGINE_MODEL` and `VOLCENGINE_ARK_MODEL` are accepted. |
| `openrouter` | `[providers.openrouter]` | `OPENROUTER_API_KEY` | `OPENROUTER_BASE_URL`; default `https://openrouter.ai/api/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`; recent large IDs include `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-35b-a3b`, `google/gemma-4-31b-it`, `z-ai/glm-5.1`, `moonshotai/kimi-k2.6` | Additive open-model routing layer. It does not replace DeepSeek; it lets users route supported model IDs through OpenRouter when they choose it. |
| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. |
| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. `codewhale speech` / `tts` uses the TTS models. |
| `novita` | `[providers.novita]` | `NOVITA_API_KEY` | `NOVITA_BASE_URL`; default `https://api.novita.ai/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | OpenAI-compatible hosted route for DeepSeek model IDs. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
| `fireworks` | `[providers.fireworks]` | `FIREWORKS_API_KEY` | `FIREWORKS_BASE_URL`; default `https://api.fireworks.ai/inference/v1` | `accounts/fireworks/models/deepseek-v4-pro` | OpenAI-compatible hosted route. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
| `siliconflow` | `[providers.siliconflow]` | `SILICONFLOW_API_KEY` | `SILICONFLOW_BASE_URL`; default `https://api.siliconflow.com/v1` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | OpenAI-compatible hosted route. Official docs use the `.com` endpoint; users who need the regional endpoint can set `https://api.siliconflow.cn/v1` explicitly. `SILICONFLOW_MODEL` is accepted. Reasoning aliases `deepseek-reasoner` and `deepseek-r1` map to Pro; `deepseek-chat` and `deepseek-v3` map to Flash. |
@@ -130,7 +130,11 @@ endpoint.
### Xiaomi MiMo Notes
`xiaomi-mimo` defaults to `mimo-v2.5-pro` for long-context reasoning and coding
work, while the static registry also exposes `mimo-v2.5`. Xiaomi's current
work, while the static registry also exposes `mimo-v2.5`. Xiaomi MiMo TTS is
available through `codewhale --provider xiaomi-mimo speech "text" --model tts`
(or the `tts` alias) plus model-visible `speech` / `tts` tools in Agent/YOLO mode.
Voice-design and voice-clone shorthands map to `mimo-v2.5-tts-voicedesign` and
`mimo-v2.5-tts-voiceclone`. Xiaomi's current
[image-understanding guide](https://platform.xiaomimimo.com/docs/en-US/usage-guide/multimodal-understanding/image-understanding)
includes `mimo-v2.5` for image input. CodeWhale exposes image analysis through the
separate `[vision_model]` / `image_analyze` path; set that model to
@@ -164,7 +168,7 @@ endpoint when the endpoint supports model listing.
| `wanjie-ark` | `deepseek-reasoner` | yes | yes |
| `volcengine` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | yes | yes |
| `openrouter` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`, `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `xiaomi/mimo-v2.5`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-27b`, `moonshotai/kimi-k2.6`, `z-ai/glm-5.1`, `tencent/hy3-preview`, `google/gemma-4-31b-it`, `google/gemma-4-26b-a4b-it`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free` | yes | yes |
| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5` | yes | yes |
| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | yes | yes for chat models; no for TTS models |
| `novita` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | yes | yes |
| `fireworks` | `accounts/fireworks/models/deepseek-v4-pro` | yes | yes |
| `siliconflow` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | yes | yes |