feat: add Xiaomi MiMo speech support
This commit is contained in:
@@ -323,6 +323,7 @@ codewhale --provider openrouter --model minimax/minimax-m3
|
||||
# Xiaomi MiMo
|
||||
codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_KEY"
|
||||
codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
|
||||
codewhale --provider xiaomi-mimo speech "Hello from MiMo" --model tts -o hello.wav
|
||||
|
||||
# Novita
|
||||
codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
|
||||
|
||||
@@ -269,6 +269,7 @@ codewhale --provider openrouter --model qwen/qwen3.7-max
|
||||
# Xiaomi MiMo
|
||||
codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_MIMO_API_KEY"
|
||||
codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
|
||||
codewhale --provider xiaomi-mimo speech "你好，MiMo" --model tts -o hello.wav
|
||||
|
||||
# Novita
|
||||
codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
|
||||
|
||||
+11
-1
@@ -45,6 +45,9 @@ base_url = "https://api.deepseek.com/beta"
|
||||
# deepseek-ai/deepseek-v4-flash — default AtlasCloud model ID
|
||||
# deepseek-reasoner — default Wanjie Ark model ID
|
||||
# mimo-v2.5-pro — default Xiaomi MiMo model ID
|
||||
# mimo-v2.5-tts — Xiaomi MiMo speech/TTS model ID
|
||||
# mimo-v2.5-tts-voicedesign — Xiaomi MiMo voice-design TTS model ID
|
||||
# mimo-v2.5-tts-voiceclone — Xiaomi MiMo voice-clone TTS model ID
|
||||
# accounts/fireworks/models/deepseek-v4-pro — Fireworks AI Pro model ID
|
||||
# deepseek-ai/DeepSeek-V4-Pro — SiliconFlow hosted Pro model ID
|
||||
# deepseek-ai/DeepSeek-V4-Flash — SiliconFlow hosted Flash model ID
|
||||
@@ -120,6 +123,11 @@ memory_path = "~/.codewhale/memory.md"
|
||||
# Parsed but currently unused (reserved for future versions):
|
||||
# tools_file = "./tools.json"
|
||||
|
||||
# Xiaomi MiMo speech/TTS defaults. Also configurable with
|
||||
# XIAOMI_MIMO_SPEECH_OUTPUT_DIR / MIMO_SPEECH_OUTPUT_DIR / XIAOMIMIMO_SPEECH_OUTPUT_DIR.
|
||||
[speech]
|
||||
# output_dir = "./speech"
|
||||
|
||||
# Native tool catalog controls (#2076). By default only the core tool surface
|
||||
# is loaded into the model context; less common native tools are discoverable
|
||||
# through ToolSearch and loaded on first use.
|
||||
@@ -301,7 +309,9 @@ max_subagents = 10 # optional (1-20)
|
||||
[providers.xiaomi_mimo]
|
||||
# api_key = "YOUR_XIAOMI_KEY"
|
||||
# base_url = "https://api.xiaomimimo.com/v1"
|
||||
# model = "mimo-v2.5-pro"
|
||||
# model = "mimo-v2.5-pro" # chat/reasoning
|
||||
# TTS aliases are also accepted by `codewhale speech`: tts, voice-design, voice-clone
|
||||
# TTS model IDs: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone, mimo-v2-tts
|
||||
|
||||
# Novita AI-hosted inference (https://novita.ai)
|
||||
[providers.novita]
|
||||
|
||||
@@ -307,6 +307,46 @@ impl Default for ModelRegistry {
|
||||
supports_tools: true,
|
||||
supports_reasoning: true,
|
||||
},
|
||||
ModelInfo {
|
||||
id: "mimo-v2.5-tts".to_string(),
|
||||
provider: ProviderKind::XiaomiMimo,
|
||||
aliases: vec![
|
||||
"tts".to_string(),
|
||||
"speech".to_string(),
|
||||
"mimo-tts".to_string(),
|
||||
],
|
||||
supports_tools: false,
|
||||
supports_reasoning: false,
|
||||
},
|
||||
ModelInfo {
|
||||
id: "mimo-v2.5-tts-voicedesign".to_string(),
|
||||
provider: ProviderKind::XiaomiMimo,
|
||||
aliases: vec![
|
||||
"voicedesign".to_string(),
|
||||
"voice-design".to_string(),
|
||||
"mimo-voice-design".to_string(),
|
||||
],
|
||||
supports_tools: false,
|
||||
supports_reasoning: false,
|
||||
},
|
||||
ModelInfo {
|
||||
id: "mimo-v2.5-tts-voiceclone".to_string(),
|
||||
provider: ProviderKind::XiaomiMimo,
|
||||
aliases: vec![
|
||||
"voiceclone".to_string(),
|
||||
"voice-clone".to_string(),
|
||||
"mimo-voice-clone".to_string(),
|
||||
],
|
||||
supports_tools: false,
|
||||
supports_reasoning: false,
|
||||
},
|
||||
ModelInfo {
|
||||
id: "mimo-v2-tts".to_string(),
|
||||
provider: ProviderKind::XiaomiMimo,
|
||||
aliases: vec!["mimo-v2-speech".to_string()],
|
||||
supports_tools: false,
|
||||
supports_reasoning: false,
|
||||
},
|
||||
ModelInfo {
|
||||
id: "deepseek/deepseek-v4-pro".to_string(),
|
||||
provider: ProviderKind::Novita,
|
||||
@@ -707,6 +747,22 @@ mod tests {
|
||||
assert!(resolved.resolved.supports_reasoning);
|
||||
}
|
||||
|
||||
/// With a Xiaomi MiMo provider hint, the TTS shorthands must resolve to the
/// canonical speech model IDs, and the resolved entry must advertise neither
/// tool nor reasoning support.
#[test]
fn xiaomi_mimo_tts_aliases_resolve_when_provider_hinted() {
    let registry = ModelRegistry::default();

    // The plain `tts` shorthand gets the full set of checks.
    let tts = registry.resolve(Some("tts"), Some(ProviderKind::XiaomiMimo));
    let info = &tts.resolved;
    assert_eq!(info.provider, ProviderKind::XiaomiMimo);
    assert_eq!(info.id, "mimo-v2.5-tts");
    assert!(!info.supports_tools);
    assert!(!info.supports_reasoning);

    // The remaining shorthands only need to land on the right model ID.
    for (alias, expected_id) in [
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
        ("voiceclone", "mimo-v2.5-tts-voiceclone"),
    ] {
        let outcome = registry.resolve(Some(alias), Some(ProviderKind::XiaomiMimo));
        assert_eq!(outcome.resolved.id, expected_id);
    }
}
|
||||
|
||||
#[test]
|
||||
fn wanjie_ark_default_uses_reasoner_model_id() {
|
||||
let registry = ModelRegistry::default();
|
||||
|
||||
@@ -133,6 +133,9 @@ enum Commands {
|
||||
Doctor(TuiPassthroughArgs),
|
||||
/// List live DeepSeek API models via the TUI binary.
|
||||
Models(TuiPassthroughArgs),
|
||||
/// Generate speech audio with Xiaomi MiMo TTS models via the TUI binary.
|
||||
#[command(visible_alias = "tts")]
|
||||
Speech(TuiPassthroughArgs),
|
||||
/// List saved TUI sessions.
|
||||
Sessions(TuiPassthroughArgs),
|
||||
/// Resume a saved TUI session.
|
||||
@@ -510,6 +513,10 @@ fn run() -> Result<()> {
|
||||
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
|
||||
delegate_to_tui(&cli, &resolved_runtime, tui_args("models", args))
|
||||
}
|
||||
Some(Commands::Speech(args)) => {
|
||||
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
|
||||
delegate_to_tui(&cli, &resolved_runtime, tui_args("speech", args))
|
||||
}
|
||||
Some(Commands::Sessions(args)) => {
|
||||
let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
|
||||
delegate_to_tui(&cli, &resolved_runtime, tui_args("sessions", args))
|
||||
|
||||
@@ -44,6 +44,10 @@ const OPENROUTER_TENCENT_HY3_PREVIEW_MODEL: &str = "tencent/hy3-preview";
|
||||
const OPENROUTER_XIAOMI_MIMO_V2_5_PRO_MODEL: &str = "xiaomi/mimo-v2.5-pro";
|
||||
const OPENROUTER_XIAOMI_MIMO_V2_5_MODEL: &str = "xiaomi/mimo-v2.5";
|
||||
const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
|
||||
const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
|
||||
const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
|
||||
const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
|
||||
const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
|
||||
const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
|
||||
const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
|
||||
const DEFAULT_FIREWORKS_MODEL: &str = "accounts/fireworks/models/deepseek-v4-pro";
|
||||
@@ -1447,6 +1451,12 @@ pub fn load_project_config(workspace: &Path) -> Option<ConfigToml> {
|
||||
}
|
||||
|
||||
fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
|
||||
if matches!(provider, ProviderKind::XiaomiMimo)
|
||||
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
|
||||
{
|
||||
return canonical.to_string();
|
||||
}
|
||||
|
||||
if matches!(
|
||||
provider,
|
||||
ProviderKind::Atlascloud
|
||||
@@ -1521,6 +1531,38 @@ fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
|
||||
let normalized = model.trim().to_ascii_lowercase();
|
||||
let normalized = normalized.replace(['_', ' '], "-");
|
||||
match normalized.as_str() {
|
||||
"mimo"
|
||||
| DEFAULT_XIAOMI_MIMO_MODEL
|
||||
| "mimo-v2-5-pro"
|
||||
| "xiaomi-mimo-v2.5-pro"
|
||||
| "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
|
||||
"mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
|
||||
Some("mimo-v2.5")
|
||||
}
|
||||
"mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
|
||||
Some(XIAOMI_MIMO_TTS_MODEL)
|
||||
}
|
||||
"mimo-tts-voicedesign"
|
||||
| "mimo-voice-design"
|
||||
| "mimo-v25-tts-voicedesign"
|
||||
| "mimo-v2.5-tts-voicedesign"
|
||||
| "voicedesign"
|
||||
| "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
|
||||
"mimo-tts-voiceclone"
|
||||
| "mimo-voice-clone"
|
||||
| "mimo-v25-tts-voiceclone"
|
||||
| "mimo-v2.5-tts-voiceclone"
|
||||
| "voiceclone"
|
||||
| "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
|
||||
"mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
|
||||
let normalized = model.trim().to_ascii_lowercase();
|
||||
let normalized = normalized.replace(['_', ' '], "-");
|
||||
@@ -3571,6 +3613,26 @@ unix_socket_path = "/tmp/cw-hooks.sock"
|
||||
assert_eq!(resolved.model, DEFAULT_XIAOMI_MIMO_MODEL);
|
||||
}
|
||||
|
||||
/// TTS shorthands normalize to canonical Xiaomi MiMo model IDs, while an
/// unrecognized name is returned unchanged.
#[test]
fn xiaomi_mimo_tts_aliases_resolve_to_canonical_models() {
    let cases = [
        ("tts", "mimo-v2.5-tts"),
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
        ("voiceclone", "mimo-v2.5-tts-voiceclone"),
        // Unknown names pass through untouched.
        ("custom-mimo-model", "custom-mimo-model"),
    ];

    for (input, expected) in cases {
        assert_eq!(
            normalize_model_for_provider(ProviderKind::XiaomiMimo, input),
            expected
        );
    }
}
|
||||
|
||||
#[test]
|
||||
fn novita_provider_defaults_to_canonical_endpoint_and_model() {
|
||||
let _lock = env_lock();
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex as StdMutex, OnceLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::{Engine as _, engine::general_purpose};
|
||||
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderName, HeaderValue};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
@@ -119,6 +120,31 @@ pub struct AvailableModel {
|
||||
pub created: Option<u64>,
|
||||
}
|
||||
|
||||
/// Input for a Xiaomi MiMo text-to-speech call.
///
/// The MiMo TTS models (MiMo-V2.5-TTS / MiMo-V2-TTS) are reached through the
/// OpenAI-compatible `/v1/chat/completions` endpoint. The text to be spoken
/// travels as the `assistant` message and the optional style/voice
/// instruction travels as the `user` message.
#[derive(Debug, Clone)]
pub struct SpeechSynthesisRequest {
    /// Model name or shorthand; must be non-empty after trimming.
    pub model: String,
    /// Text to synthesize; must be non-empty after trimming.
    pub text: String,
    /// Optional style, voice-design, or voice-clone instruction (not spoken).
    pub instruction: Option<String>,
    /// Requested audio container/encoding; a blank value means `wav`.
    pub audio_format: String,
    /// Optional voice selector forwarded as `audio.voice` when non-blank.
    pub voice: Option<String>,
}
|
||||
|
||||
/// Outcome of a speech synthesis call, with the audio already base64-decoded.
#[derive(Debug, Clone)]
pub struct SpeechSynthesisResponse {
    /// Model ID that was sent on the wire.
    pub model: String,
    /// Normalized audio format that was requested.
    pub audio_format: String,
    /// Raw decoded audio payload.
    pub audio_bytes: Vec<u8>,
    /// Transcript returned alongside the audio, when the API supplied one.
    pub transcript: Option<String>,
    /// Voice selector that was sent with the request, if any.
    pub voice: Option<String>,
}
|
||||
|
||||
/// Client for DeepSeek's OpenAI-compatible APIs.
|
||||
#[must_use]
|
||||
pub struct DeepSeekClient {
|
||||
@@ -407,6 +433,49 @@ pub(super) fn api_url(base_url: &str, path: &str) -> String {
|
||||
format!("{}/{}", versioned.trim_end_matches('/'), path)
|
||||
}
|
||||
|
||||
/// Canonicalize a requested audio format: trim it, lowercase it, and fall
/// back to `wav` when nothing but whitespace was supplied.
fn normalize_audio_format(format: &str) -> String {
    match format.trim() {
        "" => String::from("wav"),
        requested => requested.to_ascii_lowercase(),
    }
}
|
||||
|
||||
fn parse_speech_audio_response(payload: &Value) -> Result<(Vec<u8>, Option<String>)> {
|
||||
let audio = payload
|
||||
.get("choices")
|
||||
.and_then(Value::as_array)
|
||||
.and_then(|choices| choices.first())
|
||||
.and_then(|choice| {
|
||||
choice
|
||||
.get("message")
|
||||
.and_then(|message| message.get("audio"))
|
||||
.or_else(|| choice.get("delta").and_then(|delta| delta.get("audio")))
|
||||
})
|
||||
.or_else(|| payload.get("audio"))
|
||||
.context("Speech synthesis response did not include choices[0].message.audio")?;
|
||||
|
||||
let data = audio
|
||||
.get("data")
|
||||
.and_then(Value::as_str)
|
||||
.context("Speech synthesis response did not include audio.data")?
|
||||
.trim();
|
||||
let data = data
|
||||
.split_once(',')
|
||||
.map(|(_, base64)| base64.trim())
|
||||
.unwrap_or(data);
|
||||
let audio_bytes = general_purpose::STANDARD
|
||||
.decode(data)
|
||||
.context("Failed to decode speech audio base64 data")?;
|
||||
let transcript = audio
|
||||
.get("transcript")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
|
||||
Ok((audio_bytes, transcript))
|
||||
}
|
||||
|
||||
// === DeepSeekClient ===
|
||||
|
||||
/// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value
|
||||
@@ -645,6 +714,104 @@ impl DeepSeekClient {
|
||||
parse_models_response(&response_text)
|
||||
}
|
||||
|
||||
/// Generate speech with Xiaomi MiMo TTS models.
|
||||
///
|
||||
/// The spoken text is placed in an `assistant` message because Xiaomi
|
||||
/// MiMo's TTS chat-completions surface expects that shape. The optional
|
||||
/// `instruction` is a `user` message that controls style, voice design, or
|
||||
/// voice-clone performance and is not spoken verbatim.
|
||||
pub async fn synthesize_speech(
|
||||
&self,
|
||||
request: SpeechSynthesisRequest,
|
||||
) -> Result<SpeechSynthesisResponse> {
|
||||
if self.api_provider != crate::config::ApiProvider::XiaomiMimo {
|
||||
anyhow::bail!(
|
||||
"speech synthesis requires provider 'xiaomi-mimo' (current: {})",
|
||||
self.api_provider.as_str()
|
||||
);
|
||||
}
|
||||
|
||||
let model = request.model.trim().to_string();
|
||||
if model.is_empty() {
|
||||
anyhow::bail!("Speech model cannot be empty");
|
||||
}
|
||||
let text = request.text.trim().to_string();
|
||||
if text.is_empty() {
|
||||
anyhow::bail!("Speech text cannot be empty");
|
||||
}
|
||||
|
||||
let audio_format = normalize_audio_format(&request.audio_format);
|
||||
let model = wire_model_for_provider(self.api_provider, &model);
|
||||
let model_lower = model.to_ascii_lowercase();
|
||||
let instruction = request
|
||||
.instruction
|
||||
.as_deref()
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty());
|
||||
let voice = request
|
||||
.voice
|
||||
.as_deref()
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
|
||||
if model_lower.contains("voicedesign") && instruction.is_none() {
|
||||
anyhow::bail!(
|
||||
"Model '{model}' requires a voice design prompt. Pass --voice-prompt or --instruction."
|
||||
);
|
||||
}
|
||||
if model_lower.contains("voiceclone") && voice.is_none() {
|
||||
anyhow::bail!(
|
||||
"Model '{model}' requires cloned voice data. Pass --clone-voice <mp3|wav> or --voice <data-uri>."
|
||||
);
|
||||
}
|
||||
|
||||
let mut audio = json!({
|
||||
"format": audio_format.clone(),
|
||||
});
|
||||
if let Some(voice) = voice.as_deref() {
|
||||
audio["voice"] = json!(voice);
|
||||
}
|
||||
|
||||
let body = json!({
|
||||
"model": model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": instruction.unwrap_or(""),
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": text,
|
||||
}
|
||||
],
|
||||
"audio": audio,
|
||||
});
|
||||
|
||||
let url = api_url(&self.base_url, "chat/completions");
|
||||
let response = self
|
||||
.send_with_retry(|| self.http_client.post(&url).json(&body))
|
||||
.await?;
|
||||
let status = response.status();
|
||||
if !status.is_success() {
|
||||
let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
|
||||
anyhow::bail!("Speech synthesis failed: HTTP {status}: {error_text}");
|
||||
}
|
||||
|
||||
let response_text = response.text().await.unwrap_or_default();
|
||||
let payload: Value = serde_json::from_str(&response_text)
|
||||
.context("Failed to parse speech synthesis response JSON")?;
|
||||
let (audio_bytes, transcript) = parse_speech_audio_response(&payload)?;
|
||||
|
||||
Ok(SpeechSynthesisResponse {
|
||||
model,
|
||||
audio_format,
|
||||
audio_bytes,
|
||||
transcript,
|
||||
voice,
|
||||
})
|
||||
}
|
||||
|
||||
async fn wait_for_rate_limit(&self) {
|
||||
let maybe_delay = {
|
||||
let mut limiter = self.rate_limiter.lock().await;
|
||||
@@ -1166,6 +1333,39 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Audio nested under `choices[0].message.audio` is decoded and its
/// transcript is returned.
#[test]
fn parse_speech_audio_response_accepts_message_audio() {
    let audio_b64 = general_purpose::STANDARD.encode(b"hi");
    let audio_obj = json!({ "data": audio_b64, "transcript": "hi" });
    let payload = json!({ "choices": [{ "message": { "audio": audio_obj } }] });

    let (bytes, transcript) = parse_speech_audio_response(&payload).unwrap();

    assert_eq!(bytes, b"hi");
    assert_eq!(transcript.as_deref(), Some("hi"));
}
|
||||
|
||||
/// A top-level `audio.data` value given as a `data:` URI is decoded from the
/// part after the comma; no transcript is reported when none is present.
#[test]
fn parse_speech_audio_response_accepts_data_uri() {
    let data_uri = format!(
        "data:audio/wav;base64,{}",
        general_purpose::STANDARD.encode(b"wav")
    );
    let payload = json!({ "audio": { "data": data_uri } });

    let (bytes, transcript) = parse_speech_audio_response(&payload).unwrap();

    assert_eq!(bytes, b"wav");
    assert!(transcript.is_none());
}
|
||||
|
||||
#[test]
|
||||
fn tool_name_roundtrip_dot() {
|
||||
let original = "multi_tool_use.parallel";
|
||||
|
||||
@@ -36,9 +36,13 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
|
||||
|
||||
let model = match model_arg {
|
||||
None => None,
|
||||
Some(raw) if matches!(target, ApiProvider::XiaomiMimo) => {
|
||||
let expanded = expand_model_alias_for_provider(target, raw);
|
||||
Some(normalize_model_name_for_provider(target, &expanded).unwrap_or(expanded))
|
||||
}
|
||||
Some(raw) if provider_passes_model_through(target) => Some(raw.trim().to_string()),
|
||||
Some(raw) => {
|
||||
let expanded = expand_model_alias(raw);
|
||||
let expanded = expand_model_alias_for_provider(target, raw);
|
||||
let normalized = if matches!(target, ApiProvider::Deepseek | ApiProvider::DeepseekCN) {
|
||||
normalize_model_name_for_provider(target, &expanded)
|
||||
} else {
|
||||
@@ -48,7 +52,7 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
|
||||
Some(normalized) => Some(normalized),
|
||||
None => {
|
||||
return CommandResult::error(format!(
|
||||
"Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro."
|
||||
"Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro, or xiaomi-mimo tts."
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -65,8 +69,24 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
|
||||
})
|
||||
}
|
||||
|
||||
fn expand_model_alias(name: &str) -> String {
|
||||
match name.trim().to_ascii_lowercase().as_str() {
|
||||
fn expand_model_alias_for_provider(provider: ApiProvider, name: &str) -> String {
|
||||
let lower = name.trim().to_ascii_lowercase();
|
||||
if matches!(provider, ApiProvider::XiaomiMimo) {
|
||||
return match lower.as_str() {
|
||||
"pro" | "mimo" => "mimo-v2.5-pro".to_string(),
|
||||
"text" => "mimo-v2.5".to_string(),
|
||||
"tts" | "speech" | "mimo-tts" => "mimo-v2.5-tts".to_string(),
|
||||
"voicedesign" | "voice-design" | "mimo-voice-design" => {
|
||||
"mimo-v2.5-tts-voicedesign".to_string()
|
||||
}
|
||||
"voiceclone" | "voice-clone" | "mimo-voice-clone" => {
|
||||
"mimo-v2.5-tts-voiceclone".to_string()
|
||||
}
|
||||
other => other.to_string(),
|
||||
};
|
||||
}
|
||||
|
||||
match lower.as_str() {
|
||||
"pro" | "v4-pro" => "deepseek-v4-pro".to_string(),
|
||||
"flash" | "v4-flash" => "deepseek-v4-flash".to_string(),
|
||||
other => other.to_string(),
|
||||
@@ -154,6 +174,28 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Switching provider with a TTS shorthand emits `SwitchProvider` carrying the
/// canonical Xiaomi MiMo speech model ID.
#[test]
fn switch_to_xiaomi_mimo_accepts_tts_shorthands() {
    let mut app = create_test_app();
    let cases = [
        ("xiaomi-mimo tts", "mimo-v2.5-tts"),
        ("xiaomi-mimo voiceclone", "mimo-v2.5-tts-voiceclone"),
    ];

    for (command, expected_model) in cases {
        let outcome = provider(&mut app, Some(command));
        match outcome.action {
            Some(AppAction::SwitchProvider {
                provider: switched_to,
                model,
            }) => {
                assert_eq!(switched_to, ApiProvider::XiaomiMimo);
                assert_eq!(model.as_deref(), Some(expected_model));
            }
            other => panic!("expected SwitchProvider, got {other:?}"),
        }
    }
}
|
||||
|
||||
#[test]
|
||||
fn switch_to_atlascloud_emits_action() {
|
||||
let mut app = create_test_app();
|
||||
|
||||
+128
-2
@@ -78,6 +78,10 @@ pub const RECENT_OPENROUTER_LARGE_MODELS: &[&str] = &[
|
||||
pub const DEFAULT_OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
|
||||
pub const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
|
||||
pub const DEFAULT_XIAOMI_MIMO_BASE_URL: &str = "https://api.xiaomimimo.com/v1";
|
||||
pub const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
|
||||
pub const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
|
||||
pub const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
|
||||
pub const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
|
||||
pub const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
|
||||
pub const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
|
||||
pub const DEFAULT_NOVITA_BASE_URL: &str = "https://api.novita.ai/v1";
|
||||
@@ -538,6 +542,38 @@ fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
|
||||
}
|
||||
}
|
||||
|
||||
fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
|
||||
let normalized = model.trim().to_ascii_lowercase();
|
||||
let normalized = normalized.replace(['_', ' '], "-");
|
||||
match normalized.as_str() {
|
||||
"mimo"
|
||||
| DEFAULT_XIAOMI_MIMO_MODEL
|
||||
| "mimo-v2-5-pro"
|
||||
| "xiaomi-mimo-v2.5-pro"
|
||||
| "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
|
||||
"mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
|
||||
Some("mimo-v2.5")
|
||||
}
|
||||
"mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
|
||||
Some(XIAOMI_MIMO_TTS_MODEL)
|
||||
}
|
||||
"mimo-tts-voicedesign"
|
||||
| "mimo-voice-design"
|
||||
| "mimo-v25-tts-voicedesign"
|
||||
| "mimo-v2.5-tts-voicedesign"
|
||||
| "voicedesign"
|
||||
| "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
|
||||
"mimo-tts-voiceclone"
|
||||
| "mimo-voice-clone"
|
||||
| "mimo-v25-tts-voiceclone"
|
||||
| "mimo-v2.5-tts-voiceclone"
|
||||
| "voiceclone"
|
||||
| "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
|
||||
"mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalize a model selected through the TUI for the active provider.
|
||||
///
|
||||
/// Official DeepSeek endpoints require bare model IDs. Provider-prefixed
|
||||
@@ -556,6 +592,12 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
|
||||
return Some(canonical.to_string());
|
||||
}
|
||||
|
||||
if matches!(provider, ApiProvider::XiaomiMimo)
|
||||
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
|
||||
{
|
||||
return Some(canonical.to_string());
|
||||
}
|
||||
|
||||
let normalized = normalize_model_name(model)?;
|
||||
if matches!(provider, ApiProvider::Deepseek | ApiProvider::DeepseekCN)
|
||||
&& let Some(canonical) = canonical_official_deepseek_model_id(&normalized)
|
||||
@@ -585,7 +627,14 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
|
||||
#[must_use]
|
||||
pub fn wire_model_for_provider(provider: ApiProvider, model: &str) -> String {
|
||||
let trimmed = model.trim();
|
||||
if trimmed.is_empty() || provider_passes_model_through(provider) {
|
||||
if trimmed.is_empty() {
|
||||
return trimmed.to_string();
|
||||
}
|
||||
if matches!(provider, ApiProvider::XiaomiMimo) {
|
||||
return normalize_model_name_for_provider(provider, trimmed)
|
||||
.unwrap_or_else(|| trimmed.to_string());
|
||||
}
|
||||
if provider_passes_model_through(provider) {
|
||||
return trimmed.to_string();
|
||||
}
|
||||
normalize_model_name_for_provider(provider, trimmed).unwrap_or_else(|| trimmed.to_string())
|
||||
@@ -601,7 +650,14 @@ pub fn model_completion_names_for_provider(provider: ApiProvider) -> Vec<&'stati
|
||||
models.extend_from_slice(RECENT_OPENROUTER_LARGE_MODELS);
|
||||
models
|
||||
}
|
||||
ApiProvider::XiaomiMimo => vec![DEFAULT_XIAOMI_MIMO_MODEL, "mimo-v2.5"],
|
||||
ApiProvider::XiaomiMimo => vec![
|
||||
DEFAULT_XIAOMI_MIMO_MODEL,
|
||||
"mimo-v2.5",
|
||||
XIAOMI_MIMO_TTS_MODEL,
|
||||
XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL,
|
||||
XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL,
|
||||
XIAOMI_MIMO_V2_TTS_MODEL,
|
||||
],
|
||||
ApiProvider::Novita => vec![DEFAULT_NOVITA_MODEL, DEFAULT_NOVITA_FLASH_MODEL],
|
||||
ApiProvider::Fireworks => vec![DEFAULT_FIREWORKS_MODEL],
|
||||
ApiProvider::Siliconflow => {
|
||||
@@ -822,6 +878,15 @@ pub struct MemoryConfig {
|
||||
pub enabled: Option<bool>,
|
||||
}
|
||||
|
||||
/// Xiaomi MiMo speech/TTS output configuration.
|
||||
#[derive(Debug, Clone, Default, Deserialize)]
|
||||
pub struct SpeechConfig {
|
||||
/// Default directory for generated speech/TTS files when no explicit
|
||||
/// output path is provided.
|
||||
#[serde(default)]
|
||||
pub output_dir: Option<String>,
|
||||
}
|
||||
|
||||
impl SnapshotsConfig {
|
||||
#[must_use]
|
||||
pub fn max_age(&self) -> std::time::Duration {
|
||||
@@ -1429,6 +1494,10 @@ pub struct Config {
|
||||
#[serde(default)]
|
||||
pub memory: Option<MemoryConfig>,
|
||||
|
||||
/// Xiaomi MiMo speech/TTS defaults.
|
||||
#[serde(default)]
|
||||
pub speech: Option<SpeechConfig>,
|
||||
|
||||
/// Tunables for `--model auto` (#1207). When absent, the auto router
|
||||
/// keeps its existing balanced behaviour.
|
||||
#[serde(default)]
|
||||
@@ -2353,6 +2422,26 @@ impl Config {
|
||||
.unwrap_or_else(|| PathBuf::from("./memory.md"))
|
||||
}
|
||||
|
||||
/// Resolve the default speech/TTS output directory, if configured.
|
||||
#[must_use]
|
||||
pub fn speech_output_dir(&self) -> Option<PathBuf> {
|
||||
std::env::var("XIAOMI_MIMO_SPEECH_OUTPUT_DIR")
|
||||
.or_else(|_| std::env::var("MIMO_SPEECH_OUTPUT_DIR"))
|
||||
.or_else(|_| std::env::var("XIAOMIMIMO_SPEECH_OUTPUT_DIR"))
|
||||
.ok()
|
||||
.map(|value| value.trim().to_string())
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(|value| expand_path(&value))
|
||||
.or_else(|| {
|
||||
self.speech
|
||||
.as_ref()
|
||||
.and_then(|speech| speech.output_dir.as_deref())
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(expand_path)
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve the configured `instructions = [...]` array (#454)
|
||||
/// to absolute paths, in declared order. Empty when unset or
|
||||
/// when every entry is empty after trimming. Each entry runs
|
||||
@@ -3540,6 +3629,11 @@ fn normalize_model_config(config: &mut Config) {
|
||||
}
|
||||
|
||||
fn normalize_model_for_provider(provider: ApiProvider, model: &str) -> Option<String> {
|
||||
if matches!(provider, ApiProvider::XiaomiMimo)
|
||||
&& let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
|
||||
{
|
||||
return Some(canonical.to_string());
|
||||
}
|
||||
if provider_passes_model_through(provider) {
|
||||
return None;
|
||||
}
|
||||
@@ -3788,6 +3882,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config {
|
||||
snapshots: override_cfg.snapshots.or(base.snapshots),
|
||||
search: override_cfg.search.or(base.search),
|
||||
memory: override_cfg.memory.or(base.memory),
|
||||
speech: override_cfg.speech.or(base.speech),
|
||||
auto: override_cfg.auto.or(base.auto),
|
||||
update: override_cfg.update.or(base.update),
|
||||
lsp: override_cfg.lsp.or(base.lsp),
|
||||
@@ -6510,6 +6605,37 @@ api_key = "old-openrouter-key"
|
||||
}
|
||||
}
|
||||
|
||||
/// TTS shorthands normalize to canonical IDs both through the TUI
/// normalizer and through the wire-model mapping.
#[test]
fn normalize_xiaomi_mimo_tts_aliases_for_provider() {
    for (alias, canonical) in [
        ("tts", "mimo-v2.5-tts"),
        ("voice-design", "mimo-v2.5-tts-voicedesign"),
    ] {
        assert_eq!(
            normalize_model_name_for_provider(ApiProvider::XiaomiMimo, alias).as_deref(),
            Some(canonical)
        );
    }

    assert_eq!(
        wire_model_for_provider(ApiProvider::XiaomiMimo, "voiceclone"),
        "mimo-v2.5-tts-voiceclone"
    );
}
|
||||
|
||||
/// The Xiaomi MiMo completion list must offer both chat models and all four
/// TTS model IDs.
#[test]
fn model_completion_names_for_xiaomi_mimo_include_tts_models() {
    const EXPECTED: [&str; 6] = [
        "mimo-v2.5-pro",
        "mimo-v2.5",
        "mimo-v2.5-tts",
        "mimo-v2.5-tts-voicedesign",
        "mimo-v2.5-tts-voiceclone",
        "mimo-v2-tts",
    ];

    let completions = model_completion_names_for_provider(ApiProvider::XiaomiMimo);
    EXPECTED.iter().for_each(|expected| {
        assert!(completions.contains(expected), "missing {expected}");
    });
}
|
||||
|
||||
#[test]
|
||||
fn model_completion_names_for_deepseek_api_are_deduplicated_bare_ids() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -161,6 +161,8 @@ pub struct EngineConfig {
|
||||
/// Path to the user memory file (#489). Always populated; only
|
||||
/// consulted when `memory_enabled` is `true`.
|
||||
pub memory_path: PathBuf,
|
||||
/// Default directory for Xiaomi MiMo speech/TTS tool outputs.
|
||||
pub speech_output_dir: Option<PathBuf>,
|
||||
pub vision_config: Option<crate::config::VisionModelConfig>,
|
||||
pub goal_objective: Option<String>,
|
||||
/// Tool restriction from custom slash command frontmatter.
|
||||
@@ -236,6 +238,7 @@ impl Default for EngineConfig {
|
||||
subagent_model_overrides: HashMap::new(),
|
||||
memory_enabled: false,
|
||||
memory_path: PathBuf::from("./memory.md"),
|
||||
speech_output_dir: None,
|
||||
vision_config: None,
|
||||
strict_tool_mode: false,
|
||||
goal_objective: None,
|
||||
|
||||
@@ -78,7 +78,11 @@ impl Engine {
|
||||
if mode != AppMode::Plan {
|
||||
builder = builder
|
||||
.with_rlm_tool(self.deepseek_client.clone(), self.session.model.clone())
|
||||
.with_fim_tool(self.deepseek_client.clone(), self.session.model.clone());
|
||||
.with_fim_tool(self.deepseek_client.clone(), self.session.model.clone())
|
||||
.with_speech_tools(
|
||||
self.deepseek_client.clone(),
|
||||
self.config.speech_output_dir.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
if self.config.features.enabled(Feature::ApplyPatch) && mode != AppMode::Plan {
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::process::{Command, Stdio};
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use base64::{Engine as _, engine::general_purpose};
|
||||
use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
|
||||
use clap_complete::{Shell, generate};
|
||||
use dotenvy::dotenv;
|
||||
@@ -225,6 +226,9 @@ enum Commands {
|
||||
Logout,
|
||||
/// List available models from the configured API endpoint
|
||||
Models(ModelsArgs),
|
||||
/// Generate speech audio with Xiaomi MiMo TTS models
|
||||
#[command(visible_alias = "tts")]
|
||||
Speech(SpeechArgs),
|
||||
/// Run a non-interactive prompt. Use --auto for tool-backed agent mode.
|
||||
Exec(ExecArgs),
|
||||
/// Generate SWE-bench prediction rows from CodeWhale runs
|
||||
@@ -531,6 +535,50 @@ struct ModelsArgs {
|
||||
json: bool,
|
||||
}
|
||||
|
||||
// Arguments for the `speech` subcommand (visible alias `tts`); destructured
// and validated by `run_speech`.
// NOTE(review): with clap's derive API the `///` comments on these fields are
// presumably surfaced as `--help` text, so treat them as user-facing strings
// rather than free-form documentation — confirm before rewording.
#[derive(Args, Debug, Clone)]
|
||||
struct SpeechArgs {
|
||||
/// Text to synthesize. This is sent as the assistant message content.
|
||||
#[arg(value_name = "TEXT")]
|
||||
text: String,
|
||||
|
||||
/// Output audio path. Defaults to speech.<format> in --output-dir,
|
||||
/// [speech].output_dir, or the current directory.
|
||||
#[arg(short, long, value_name = "FILE")]
|
||||
output: Option<PathBuf>,
|
||||
|
||||
/// Directory for the default speech.<format> output file when -o/--output is omitted.
|
||||
#[arg(long = "output-dir", value_name = "DIR")]
|
||||
output_dir: Option<PathBuf>,
|
||||
|
||||
/// TTS model. Defaults to built-in voices, or is inferred from --voice-prompt/--clone-voice.
|
||||
#[arg(long)]
|
||||
model: Option<String>,
|
||||
|
||||
/// Built-in voice ID, or a data:audio/...;base64,... URI for voice clone.
|
||||
#[arg(long)]
|
||||
voice: Option<String>,
|
||||
|
||||
/// Natural language style instruction; not spoken verbatim.
|
||||
#[arg(long)]
|
||||
instruction: Option<String>,
|
||||
|
||||
/// Voice design prompt. Implies mimo-v2.5-tts-voicedesign when --model is omitted.
|
||||
#[arg(long = "voice-prompt")]
|
||||
voice_prompt: Option<String>,
|
||||
|
||||
/// MP3/WAV sample used for voice cloning. Implies mimo-v2.5-tts-voiceclone when --model is omitted.
|
||||
#[arg(long = "clone-voice", value_name = "FILE")]
|
||||
clone_voice: Option<PathBuf>,
|
||||
|
||||
/// Output audio format requested from the API
|
||||
#[arg(long, default_value = "wav")]
|
||||
format: String,
|
||||
|
||||
/// Emit machine-readable JSON output
|
||||
#[arg(long, default_value_t = false)]
|
||||
json: bool,
|
||||
|
||||
}
|
||||
|
||||
#[derive(Args, Debug, Default, Clone)]
|
||||
struct FeatureToggles {
|
||||
/// Enable a feature (repeatable). Equivalent to `features.<name>=true`.
|
||||
@@ -896,6 +944,10 @@ async fn main() -> Result<()> {
|
||||
let config = load_config_from_cli(&cli)?;
|
||||
run_models(&config, args).await
|
||||
}
|
||||
Commands::Speech(args) => {
|
||||
let config = load_config_from_cli(&cli)?;
|
||||
run_speech(&config, args).await
|
||||
}
|
||||
Commands::Exec(args) => {
|
||||
let config = load_config_from_cli(&cli)?;
|
||||
let workspace = cli.workspace.clone().unwrap_or_else(|| {
|
||||
@@ -3514,6 +3566,258 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> {
|
||||
use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
|
||||
use crate::config::{ApiProvider, normalize_model_name_for_provider};
|
||||
|
||||
let SpeechArgs {
|
||||
text,
|
||||
output,
|
||||
output_dir,
|
||||
model,
|
||||
voice,
|
||||
instruction,
|
||||
voice_prompt,
|
||||
clone_voice,
|
||||
format,
|
||||
json: json_output,
|
||||
} = args;
|
||||
|
||||
if config.api_provider() != ApiProvider::XiaomiMimo {
|
||||
bail!(
|
||||
"`speech` requires provider = \"xiaomi-mimo\" (current: {}). Run with `--provider xiaomi-mimo` or set it in config.",
|
||||
config.api_provider().as_str()
|
||||
);
|
||||
}
|
||||
|
||||
if text.trim().is_empty() {
|
||||
bail!("Speech text cannot be empty");
|
||||
}
|
||||
let voice_is_data_uri = voice
|
||||
.as_deref()
|
||||
.map(str::trim)
|
||||
.is_some_and(|value| value.starts_with("data:audio/"));
|
||||
if clone_voice.is_some() && voice.is_some() {
|
||||
bail!("Use either --clone-voice or --voice for cloned voice data, not both");
|
||||
}
|
||||
let model = match model {
|
||||
Some(value) => {
|
||||
normalize_model_name_for_provider(ApiProvider::XiaomiMimo, &value).unwrap_or(value)
|
||||
}
|
||||
None => {
|
||||
if clone_voice.is_some() || voice_is_data_uri {
|
||||
"mimo-v2.5-tts-voiceclone".to_string()
|
||||
} else if voice_prompt.is_some() {
|
||||
"mimo-v2.5-tts-voicedesign".to_string()
|
||||
} else {
|
||||
"mimo-v2.5-tts".to_string()
|
||||
}
|
||||
}
|
||||
};
|
||||
let model_lower = model.to_ascii_lowercase();
|
||||
if !model_lower.contains("tts") {
|
||||
bail!(
|
||||
"speech requires a TTS model (examples: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone); got {model}"
|
||||
);
|
||||
}
|
||||
let is_voice_design = model_lower.contains("voicedesign");
|
||||
let is_voice_clone = model_lower.contains("voiceclone");
|
||||
|
||||
let instruction = combine_speech_instructions(instruction, voice_prompt);
|
||||
if is_voice_design
|
||||
&& instruction
|
||||
.as_deref()
|
||||
.is_none_or(|value| value.trim().is_empty())
|
||||
{
|
||||
bail!(
|
||||
"mimo-v2.5-tts-voicedesign requires --voice-prompt or --instruction to describe the voice"
|
||||
);
|
||||
}
|
||||
|
||||
let voice = if let Some(clone_path) = clone_voice {
|
||||
Some(encode_voice_clone_data_uri(&clone_path)?)
|
||||
} else if is_voice_design {
|
||||
None
|
||||
} else if let Some(value) = voice.filter(|value| !value.trim().is_empty()) {
|
||||
Some(value)
|
||||
} else if is_voice_clone {
|
||||
bail!("mimo-v2.5-tts-voiceclone requires --clone-voice <mp3|wav> or --voice <data-uri>");
|
||||
} else {
|
||||
Some("mimo_default".to_string())
|
||||
};
|
||||
let format = normalize_speech_format(&format).with_context(|| {
|
||||
format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)")
|
||||
})?;
|
||||
let output = resolve_speech_output_path(
|
||||
output,
|
||||
output_dir.or_else(|| config.speech_output_dir()),
|
||||
&format,
|
||||
);
|
||||
|
||||
let client = DeepSeekClient::new(config)?;
|
||||
let response = client
|
||||
.synthesize_speech(SpeechSynthesisRequest {
|
||||
model: model.clone(),
|
||||
text,
|
||||
instruction,
|
||||
audio_format: format.clone(),
|
||||
voice,
|
||||
})
|
||||
.await?;
|
||||
|
||||
if let Some(parent) = output.parent().filter(|path| !path.as_os_str().is_empty()) {
|
||||
std::fs::create_dir_all(parent)
|
||||
.with_context(|| format!("Failed to create output directory {}", parent.display()))?;
|
||||
}
|
||||
std::fs::write(&output, &response.audio_bytes)
|
||||
.with_context(|| format!("Failed to write audio file {}", output.display()))?;
|
||||
|
||||
if json_output {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&serde_json::json!({
|
||||
"mode": "speech",
|
||||
"success": true,
|
||||
"model": response.model,
|
||||
"format": response.audio_format,
|
||||
"output": output.display().to_string(),
|
||||
"bytes": response.audio_bytes.len(),
|
||||
"voice": response.voice.as_deref().map(describe_speech_voice),
|
||||
"transcript": response.transcript,
|
||||
}))?
|
||||
);
|
||||
} else {
|
||||
println!(
|
||||
"Generated speech: {} ({} bytes, model: {}, format: {})",
|
||||
output.display(),
|
||||
response.audio_bytes.len(),
|
||||
response.model,
|
||||
response.audio_format
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merge the style instruction and the voice-design prompt into one
/// instruction string.
///
/// Both inputs are trimmed and blank values are discarded. When both remain,
/// the voice prompt comes first, followed by a blank line and the instruction.
/// Returns `None` when nothing non-blank is left.
fn combine_speech_instructions(
    instruction: Option<String>,
    voice_prompt: Option<String>,
) -> Option<String> {
    let style = instruction
        .as_deref()
        .map(str::trim)
        .filter(|text| !text.is_empty());
    let design = voice_prompt
        .as_deref()
        .map(str::trim)
        .filter(|text| !text.is_empty());

    match (design, style) {
        (Some(design), Some(style)) => Some(format!("{design}\n\n{style}")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
|
||||
|
||||
// Largest accepted voice-clone sample, measured on the base64-encoded text
// (10 MiB); `encode_voice_clone_data_uri` rejects anything longer.
const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
|
||||
|
||||
/// Canonicalize a user-supplied audio format name.
///
/// Matching ignores surrounding whitespace and ASCII case. `pcm` is accepted
/// as an alias for `pcm16`. Returns `None` for anything other than
/// `wav`, `mp3`, `pcm16`, or `pcm`.
fn normalize_speech_format(format: &str) -> Option<String> {
    let lowered = format.trim().to_ascii_lowercase();
    let canonical = match lowered.as_str() {
        "wav" => "wav",
        "mp3" => "mp3",
        "pcm16" | "pcm" => "pcm16",
        _ => return None,
    };
    Some(canonical.to_string())
}
|
||||
|
||||
fn default_speech_output_name(format: &str) -> String {
|
||||
format!(
|
||||
"speech.{}",
|
||||
normalize_speech_format(format).as_deref().unwrap_or("wav")
|
||||
)
|
||||
}
|
||||
|
||||
fn resolve_speech_output_path(
|
||||
output: Option<PathBuf>,
|
||||
output_dir: Option<PathBuf>,
|
||||
format: &str,
|
||||
) -> PathBuf {
|
||||
output.unwrap_or_else(|| {
|
||||
output_dir
|
||||
.unwrap_or_default()
|
||||
.join(default_speech_output_name(format))
|
||||
})
|
||||
}
|
||||
|
||||
fn encode_voice_clone_data_uri(path: &Path) -> Result<String> {
|
||||
let bytes = std::fs::read(path)
|
||||
.with_context(|| format!("Failed to read voice clone sample {}", path.display()))?;
|
||||
let base64_audio = general_purpose::STANDARD.encode(bytes);
|
||||
if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES {
|
||||
bail!(
|
||||
"Voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
|
||||
base64_audio.len()
|
||||
);
|
||||
}
|
||||
|
||||
let extension = path
|
||||
.extension()
|
||||
.and_then(|value| value.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_ascii_lowercase();
|
||||
let mime = match extension.as_str() {
|
||||
"mp3" => "audio/mpeg",
|
||||
"wav" => "audio/wav",
|
||||
other => bail!(
|
||||
"Unsupported voice clone sample extension '{}'. Use .mp3 or .wav.",
|
||||
other
|
||||
),
|
||||
};
|
||||
|
||||
Ok(format!("data:{mime};base64,{base64_audio}"))
|
||||
}
|
||||
|
||||
/// Human-readable label for a voice value: data URIs are replaced by a short
/// placeholder so embedded audio is never echoed; any other value is returned
/// unchanged.
fn describe_speech_voice(voice: &str) -> String {
    let label = if voice.starts_with("data:") {
        "embedded voice clone sample"
    } else {
        voice
    };
    label.to_string()
}
|
||||
|
||||
#[cfg(test)]
mod speech_cli_tests {
    use super::*;

    /// Format names are case-insensitive, `pcm` maps to `pcm16`, and unknown
    /// formats are rejected.
    #[test]
    fn normalizes_documented_speech_formats() {
        let cases = [
            ("WAV", Some("wav")),
            ("pcm16", Some("pcm16")),
            ("pcm", Some("pcm16")),
            ("flac", None),
        ];
        for (raw, expected) in cases {
            assert_eq!(normalize_speech_format(raw).as_deref(), expected);
        }
    }

    /// The default file name follows the requested format, honours the output
    /// directory, and never overrides an explicit output path.
    #[test]
    fn default_speech_output_tracks_requested_format() {
        let bare_default = resolve_speech_output_path(None, None, "mp3");
        assert_eq!(bare_default, PathBuf::from("speech.mp3"));

        let in_directory = resolve_speech_output_path(None, Some(PathBuf::from("audio")), "pcm");
        assert_eq!(in_directory, PathBuf::from("audio").join("speech.pcm16"));

        let explicit = resolve_speech_output_path(Some(PathBuf::from("custom.wav")), None, "mp3");
        assert_eq!(explicit, PathBuf::from("custom.wav"));
    }
}
|
||||
|
||||
/// Test API connectivity by making a minimal request
|
||||
async fn test_api_connectivity(config: &Config) -> Result<()> {
|
||||
use crate::client::DeepSeekClient;
|
||||
@@ -5462,6 +5766,7 @@ async fn run_exec_agent(
|
||||
prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
|
||||
memory_enabled: config.memory_enabled(),
|
||||
memory_path: config.memory_path(),
|
||||
speech_output_dir: config.speech_output_dir(),
|
||||
vision_config: config.vision_model_config(),
|
||||
strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
|
||||
goal_objective: None,
|
||||
|
||||
@@ -2017,6 +2017,7 @@ impl RuntimeThreadManager {
|
||||
prefer_bwrap: self.config.prefer_bwrap.unwrap_or(false),
|
||||
memory_enabled: self.config.memory_enabled(),
|
||||
memory_path: self.config.memory_path(),
|
||||
speech_output_dir: self.config.speech_output_dir(),
|
||||
vision_config: self.config.vision_model_config(),
|
||||
strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false),
|
||||
goal_objective: None,
|
||||
|
||||
@@ -48,6 +48,7 @@ pub mod shell;
|
||||
mod shell_output;
|
||||
pub mod skill;
|
||||
pub mod spec;
|
||||
pub mod speech;
|
||||
pub mod subagent;
|
||||
pub mod tasks;
|
||||
pub mod test_runner;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
@@ -776,6 +776,22 @@ impl ToolRegistryBuilder {
|
||||
self.with_tool(Arc::new(RevertTurnTool))
|
||||
}
|
||||
|
||||
/// Include Xiaomi MiMo speech/TTS tools (`speech`, `tts`).
|
||||
#[must_use]
|
||||
pub fn with_speech_tools(
|
||||
self,
|
||||
client: Option<DeepSeekClient>,
|
||||
output_dir: Option<PathBuf>,
|
||||
) -> Self {
|
||||
use super::speech::SpeechTool;
|
||||
self.with_tool(Arc::new(SpeechTool::new(
|
||||
"speech",
|
||||
client.clone(),
|
||||
output_dir.clone(),
|
||||
)))
|
||||
.with_tool(Arc::new(SpeechTool::new("tts", client, output_dir)))
|
||||
}
|
||||
|
||||
/// Include persistent RLM session tools.
|
||||
#[must_use]
|
||||
pub fn with_rlm_tool(self, client: Option<DeepSeekClient>, _root_model: String) -> Self {
|
||||
@@ -958,11 +974,13 @@ impl ToolRegistryBuilder {
|
||||
todo_list: super::todo::SharedTodoList,
|
||||
plan_state: super::plan::SharedPlanState,
|
||||
) -> Self {
|
||||
let speech_client = client.clone();
|
||||
self.with_agent_tools(allow_shell)
|
||||
.with_todo_tool(todo_list)
|
||||
.with_plan_tool(plan_state)
|
||||
.with_review_tool(client.clone(), model.clone())
|
||||
.with_rlm_tool(client, model)
|
||||
.with_speech_tools(speech_client, None)
|
||||
.with_recall_archive_tool()
|
||||
.with_subagent_tools(manager, runtime)
|
||||
}
|
||||
@@ -1218,6 +1236,18 @@ mod tests {
|
||||
assert!(registry.contains("list_dir"));
|
||||
}
|
||||
|
||||
/// Both the primary tool name and its alias must be registered.
#[test]
fn builder_registers_speech_alias_tools() {
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let registry = ToolRegistryBuilder::new()
        .with_speech_tools(None, None)
        .build(context);

    for tool_name in ["speech", "tts"] {
        assert!(registry.contains(tool_name));
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_registry_names() {
|
||||
let tmp = tempdir().expect("tempdir");
|
||||
|
||||
@@ -0,0 +1,528 @@
|
||||
//! Model-visible Xiaomi MiMo speech/TTS generation tool.
|
||||
//!
|
||||
//! This mirrors the CLI `speech` / `tts` command as a first-class API tool so
|
||||
//! the TUI model can generate narrated audio without shelling out to a nested
|
||||
//! CodeWhale process.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use base64::{Engine as _, engine::general_purpose};
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
|
||||
use crate::config::{ApiProvider, normalize_model_name_for_provider};
|
||||
use crate::network_policy::{Decision, host_from_url};
|
||||
|
||||
use super::spec::{
|
||||
ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
|
||||
optional_bool, optional_str, required_str,
|
||||
};
|
||||
|
||||
// Audio format used when the tool input has no (or a blank) `format`.
const DEFAULT_FORMAT: &str = "wav";
|
||||
// Voice sent when no voice is supplied and the model is neither a
// voice-design nor a voice-clone model.
const DEFAULT_VOICE: &str = "mimo_default";
|
||||
// Largest accepted voice-clone sample, measured on the base64-encoded text.
const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
|
||||
// Canonical format names; used for the schema `enum`, the unsupported-format
// error message, and the `supported_formats` field of the tool result.
const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"];
|
||||
|
||||
// Model IDs echoed back as `supported_xiaomi_mimo_models` in the tool result.
// NOTE(review): several entries do not contain "tts", and `execute` rejects
// any model name without "tts" — presumably this is the provider-wide model
// list rather than a list of speech-capable models; confirm the intent.
pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[
|
||||
"mimo-v2.5-pro",
|
||||
"mimo-v2.5",
|
||||
"mimo-v2.5-tts-voiceclone",
|
||||
"mimo-v2.5-tts-voicedesign",
|
||||
"mimo-v2.5-tts",
|
||||
"mimo-v2-pro",
|
||||
"mimo-v2-omni",
|
||||
"mimo-v2-tts",
|
||||
];
|
||||
|
||||
// TTS model IDs advertised as the schema `enum` for `model` and listed in the
// "requires a TTS model" error message.
const SPEECH_MODEL_EXAMPLES: &[&str] = &[
|
||||
"mimo-v2.5-tts",
|
||||
"mimo-v2.5-tts-voicedesign",
|
||||
"mimo-v2.5-tts-voiceclone",
|
||||
"mimo-v2-tts",
|
||||
];
|
||||
|
||||
/// Model-visible speech/TTS tool; one instance exists per registered name.
pub struct SpeechTool {
|
||||
// Name reported by `ToolSpec::name`.
name: &'static str,
|
||||
// API client used for synthesis; `execute` reports the tool as not
// available when this is `None`.
client: Option<DeepSeekClient>,
|
||||
// Configured default directory for generated audio; consulted only when the
// tool input supplies neither `output` nor `output_dir`.
output_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
impl SpeechTool {
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
client: Option<DeepSeekClient>,
|
||||
output_dir: Option<PathBuf>,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
client,
|
||||
output_dir,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ToolSpec for SpeechTool {
|
||||
fn name(&self) -> &str {
|
||||
self.name
|
||||
}
|
||||
|
||||
fn description(&self) -> &str {
|
||||
"Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning."
|
||||
}
|
||||
|
||||
fn input_schema(&self) -> Value {
|
||||
json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here."
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"description": "Audio file path to write, relative to the workspace unless absolute. Default: speech.<format> in output_dir, configured [speech].output_dir, or the workspace."
|
||||
},
|
||||
"output_dir": {
|
||||
"type": "string",
|
||||
"description": "Directory for the default speech.<format> output file when output is omitted. Relative paths stay inside the workspace."
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.",
|
||||
"enum": SPEECH_MODEL_EXAMPLES
|
||||
},
|
||||
"voice": {
|
||||
"type": "string",
|
||||
"description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone."
|
||||
},
|
||||
"instruction": {
|
||||
"type": "string",
|
||||
"description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim."
|
||||
},
|
||||
"voice_prompt": {
|
||||
"type": "string",
|
||||
"description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign."
|
||||
},
|
||||
"clone_voice": {
|
||||
"type": "string",
|
||||
"description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone."
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.",
|
||||
"enum": SUPPORTED_SPEECH_FORMATS
|
||||
},
|
||||
"stream": {
|
||||
"type": "boolean",
|
||||
"description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false."
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
})
|
||||
}
|
||||
|
||||
fn capabilities(&self) -> Vec<ToolCapability> {
|
||||
vec![
|
||||
ToolCapability::WritesFiles,
|
||||
ToolCapability::Network,
|
||||
ToolCapability::Sandboxable,
|
||||
]
|
||||
}
|
||||
|
||||
fn approval_requirement(&self) -> ApprovalRequirement {
|
||||
// Speech generation is an explicit user-facing generation action.
|
||||
// Path resolution still enforces workspace/trusted-root boundaries.
|
||||
ApprovalRequirement::Auto
|
||||
}
|
||||
|
||||
async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
|
||||
let text = required_str(&input, "text")?.trim().to_string();
|
||||
if text.is_empty() {
|
||||
return Err(ToolError::invalid_input("speech text cannot be empty"));
|
||||
}
|
||||
|
||||
let client = self.client.clone().ok_or_else(|| {
|
||||
ToolError::not_available(
|
||||
"speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first",
|
||||
)
|
||||
})?;
|
||||
|
||||
let requested_format_raw = optional_str(&input, "format")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.unwrap_or(DEFAULT_FORMAT);
|
||||
let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| {
|
||||
ToolError::invalid_input(format!(
|
||||
"unsupported speech format '{requested_format_raw}' (allowed: {})",
|
||||
SUPPORTED_SPEECH_FORMATS.join(", ")
|
||||
))
|
||||
})?;
|
||||
if optional_bool(&input, "stream", false) {
|
||||
return Err(ToolError::invalid_input(
|
||||
"stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file",
|
||||
));
|
||||
}
|
||||
let output_raw = optional_str(&input, "output")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty());
|
||||
let output_path = resolve_speech_output_path(
|
||||
&input,
|
||||
context,
|
||||
output_raw,
|
||||
&requested_format,
|
||||
self.output_dir.as_ref(),
|
||||
)?;
|
||||
let output_label = output_raw
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| output_path.display().to_string());
|
||||
|
||||
let raw_voice = optional_str(&input, "voice")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
let raw_instruction = optional_str(&input, "instruction")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
let voice_prompt = optional_str(&input, "voice_prompt")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
let clone_voice = optional_str(&input, "clone_voice")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
|
||||
let voice_is_data_uri = raw_voice
|
||||
.as_deref()
|
||||
.is_some_and(|value| value.starts_with("data:audio/"));
|
||||
if clone_voice.is_some() && raw_voice.is_some() {
|
||||
return Err(ToolError::invalid_input(
|
||||
"use either clone_voice or voice for cloned voice data, not both",
|
||||
));
|
||||
}
|
||||
let model = infer_speech_model(
|
||||
optional_str(&input, "model"),
|
||||
clone_voice.is_some() || voice_is_data_uri,
|
||||
voice_prompt.is_some(),
|
||||
);
|
||||
let model_lower = model.to_ascii_lowercase();
|
||||
if !model_lower.contains("tts") {
|
||||
return Err(ToolError::invalid_input(format!(
|
||||
"speech tool requires a TTS model (examples: {}), got '{model}'",
|
||||
SPEECH_MODEL_EXAMPLES.join(", ")
|
||||
)));
|
||||
}
|
||||
|
||||
let is_voice_design = model_lower.contains("voicedesign");
|
||||
let is_voice_clone = model_lower.contains("voiceclone");
|
||||
let instruction = combine_speech_instructions(raw_instruction, voice_prompt);
|
||||
if is_voice_design
|
||||
&& instruction
|
||||
.as_deref()
|
||||
.is_none_or(|value| value.trim().is_empty())
|
||||
{
|
||||
return Err(ToolError::invalid_input(
|
||||
"mimo-v2.5-tts-voicedesign requires voice_prompt or instruction",
|
||||
));
|
||||
}
|
||||
|
||||
let voice = if let Some(clone_path) = clone_voice {
|
||||
let clone_path = context.resolve_path(&clone_path)?;
|
||||
Some(encode_voice_clone_data_uri(&clone_path).await?)
|
||||
} else if is_voice_design {
|
||||
None
|
||||
} else if let Some(value) = raw_voice {
|
||||
Some(value)
|
||||
} else if is_voice_clone {
|
||||
return Err(ToolError::invalid_input(
|
||||
"mimo-v2.5-tts-voiceclone requires clone_voice <mp3|wav> or voice <data-uri>",
|
||||
));
|
||||
} else {
|
||||
Some(DEFAULT_VOICE.to_string())
|
||||
};
|
||||
|
||||
check_network_policy(context, client.base_url())?;
|
||||
|
||||
let response = client
|
||||
.synthesize_speech(SpeechSynthesisRequest {
|
||||
model: model.clone(),
|
||||
text,
|
||||
instruction,
|
||||
audio_format: requested_format,
|
||||
voice,
|
||||
})
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ToolError::execution_failed(format!("speech synthesis failed: {err}"))
|
||||
})?;
|
||||
|
||||
if let Some(parent) = output_path
|
||||
.parent()
|
||||
.filter(|path| !path.as_os_str().is_empty())
|
||||
{
|
||||
tokio::fs::create_dir_all(parent).await.map_err(|err| {
|
||||
ToolError::execution_failed(format!(
|
||||
"failed to create output directory {}: {err}",
|
||||
parent.display()
|
||||
))
|
||||
})?;
|
||||
}
|
||||
tokio::fs::write(&output_path, &response.audio_bytes)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ToolError::execution_failed(format!(
|
||||
"failed to write audio file {}: {err}",
|
||||
output_path.display()
|
||||
))
|
||||
})?;
|
||||
|
||||
let result = json!({
|
||||
"mode": "speech",
|
||||
"success": true,
|
||||
"api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis",
|
||||
"base_url": openai_compatible_base_url(client.base_url()),
|
||||
"model": response.model,
|
||||
"format": response.audio_format,
|
||||
"stream": false,
|
||||
"output": output_label,
|
||||
"absolute_output": output_path.display().to_string(),
|
||||
"bytes": response.audio_bytes.len(),
|
||||
"voice": response.voice.as_deref().map(describe_speech_voice),
|
||||
"transcript": response.transcript,
|
||||
"supported_formats": SUPPORTED_SPEECH_FORMATS,
|
||||
"supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS,
|
||||
});
|
||||
ToolResult::json(&result).map_err(|err| {
|
||||
ToolError::execution_failed(format!("failed to serialize result: {err}"))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn infer_speech_model(
|
||||
model: Option<&str>,
|
||||
has_clone_voice: bool,
|
||||
has_voice_prompt: bool,
|
||||
) -> String {
|
||||
match model.map(str::trim).filter(|value| !value.is_empty()) {
|
||||
Some(value) => normalize_model_name_for_provider(ApiProvider::XiaomiMimo, value)
|
||||
.unwrap_or_else(|| value.into()),
|
||||
None if has_clone_voice => "mimo-v2.5-tts-voiceclone".to_string(),
|
||||
None if has_voice_prompt => "mimo-v2.5-tts-voicedesign".to_string(),
|
||||
None => "mimo-v2.5-tts".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge the voice-design prompt and the style instruction into a single
/// instruction string.
///
/// Each part is trimmed and dropped when blank. The voice prompt leads and
/// the instruction follows, separated by a blank line. Returns `None` when no
/// non-blank part remains.
fn combine_speech_instructions(
    instruction: Option<String>,
    voice_prompt: Option<String>,
) -> Option<String> {
    // Listed in output order: voice prompt first, then the instruction.
    let parts: Vec<&str> = [voice_prompt.as_deref(), instruction.as_deref()]
        .iter()
        .copied()
        .flatten()
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();

    if parts.is_empty() {
        None
    } else {
        Some(parts.join("\n\n"))
    }
}
|
||||
|
||||
/// Map a requested format name onto its canonical spelling.
///
/// Surrounding whitespace and ASCII case are ignored; `pcm` is an alias for
/// `pcm16`. Unknown formats yield `None`.
fn normalize_speech_format(format: &str) -> Option<String> {
    let requested = format.trim().to_ascii_lowercase();
    if requested == "pcm" {
        return Some("pcm16".to_string());
    }
    if matches!(requested.as_str(), "wav" | "mp3" | "pcm16") {
        Some(requested)
    } else {
        None
    }
}
|
||||
|
||||
fn default_speech_output_name(format: &str) -> String {
|
||||
format!(
|
||||
"speech.{}",
|
||||
normalize_speech_format(format)
|
||||
.as_deref()
|
||||
.unwrap_or(DEFAULT_FORMAT)
|
||||
)
|
||||
}
|
||||
|
||||
fn resolve_speech_output_path(
|
||||
input: &Value,
|
||||
context: &ToolContext,
|
||||
output_raw: Option<&str>,
|
||||
format: &str,
|
||||
configured_output_dir: Option<&PathBuf>,
|
||||
) -> Result<PathBuf, ToolError> {
|
||||
if let Some(output) = output_raw {
|
||||
return context.resolve_path(output);
|
||||
}
|
||||
|
||||
let filename = default_speech_output_name(format);
|
||||
if let Some(output_dir) = optional_str(input, "output_dir")
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
{
|
||||
return Ok(context.resolve_path(output_dir)?.join(filename));
|
||||
}
|
||||
|
||||
if let Some(output_dir) = configured_output_dir {
|
||||
return Ok(output_dir.join(filename));
|
||||
}
|
||||
|
||||
Ok(context.workspace.join(filename))
|
||||
}
|
||||
|
||||
async fn encode_voice_clone_data_uri(path: &Path) -> Result<String, ToolError> {
|
||||
let bytes = tokio::fs::read(path).await.map_err(|err| {
|
||||
ToolError::execution_failed(format!(
|
||||
"failed to read voice clone sample {}: {err}",
|
||||
path.display()
|
||||
))
|
||||
})?;
|
||||
let base64_audio = general_purpose::STANDARD.encode(bytes);
|
||||
if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES {
|
||||
return Err(ToolError::invalid_input(format!(
|
||||
"voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
|
||||
base64_audio.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let extension = path
|
||||
.extension()
|
||||
.and_then(|value| value.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_ascii_lowercase();
|
||||
let mime = match extension.as_str() {
|
||||
"mp3" => "audio/mpeg",
|
||||
"wav" => "audio/wav",
|
||||
other => {
|
||||
return Err(ToolError::invalid_input(format!(
|
||||
"unsupported voice clone sample extension '{other}'. Use .mp3 or .wav."
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(format!("data:{mime};base64,{base64_audio}"))
|
||||
}
|
||||
|
||||
/// Label a voice for the tool result: anything that is not a data URI is
/// returned as-is, while data URIs are replaced by a fixed placeholder so the
/// embedded audio is not echoed back.
fn describe_speech_voice(voice: &str) -> String {
    if !voice.starts_with("data:") {
        return voice.to_string();
    }
    "embedded voice clone sample".to_string()
}
|
||||
|
||||
/// Base URL in its versioned form: trailing slashes are removed and `/v1` is
/// appended unless the URL already ends in `/v1` or `/beta`.
fn openai_compatible_base_url(base_url: &str) -> String {
    let root = base_url.trim_end_matches('/');
    let already_versioned = ["/v1", "/beta"]
        .iter()
        .any(|suffix| root.ends_with(*suffix));

    if already_versioned {
        root.to_string()
    } else {
        format!("{root}/v1")
    }
}
|
||||
|
||||
fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> {
|
||||
let Some(decider) = context.network_policy.as_ref() else {
|
||||
return Ok(());
|
||||
};
|
||||
let display_url = openai_compatible_base_url(base_url);
|
||||
let Some(host) = host_from_url(&display_url) else {
|
||||
return Ok(());
|
||||
};
|
||||
match decider.evaluate(&host, "speech") {
|
||||
Decision::Allow => Ok(()),
|
||||
Decision::Deny => Err(ToolError::permission_denied(format!(
|
||||
"speech network call to '{host}' blocked by network policy"
|
||||
))),
|
||||
Decision::Prompt => Err(ToolError::permission_denied(format!(
|
||||
"speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config"
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn infers_speech_model_from_requested_mode() {
|
||||
assert_eq!(infer_speech_model(None, false, false), "mimo-v2.5-tts");
|
||||
assert_eq!(
|
||||
infer_speech_model(None, false, true),
|
||||
"mimo-v2.5-tts-voicedesign"
|
||||
);
|
||||
assert_eq!(
|
||||
infer_speech_model(None, true, false),
|
||||
"mimo-v2.5-tts-voiceclone"
|
||||
);
|
||||
assert_eq!(
|
||||
infer_speech_model(Some("mimo-tts"), false, false),
|
||||
"mimo-v2.5-tts"
|
||||
);
|
||||
assert_eq!(
|
||||
infer_speech_model(Some("mimo-v2-tts"), false, false),
|
||||
"mimo-v2-tts"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn combines_voice_prompt_before_instruction() {
|
||||
assert_eq!(
|
||||
combine_speech_instructions(
|
||||
Some("Speak warmly.".to_string()),
|
||||
Some("Young Chinese female voice".to_string())
|
||||
)
|
||||
.as_deref(),
|
||||
Some("Young Chinese female voice\n\nSpeak warmly.")
|
||||
);
|
||||
assert_eq!(
|
||||
combine_speech_instructions(Some(" calm ".to_string()), None).as_deref(),
|
||||
Some("calm")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalizes_documented_speech_formats() {
|
||||
assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav"));
|
||||
assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16"));
|
||||
assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16"));
|
||||
assert_eq!(normalize_speech_format("flac"), None);
|
||||
}
|
||||
|
||||
// Pins that `openai_compatible_base_url` yields a URL ending in `/v1` both
// when the input lacks that suffix and when it already carries it.
#[test]
|
||||
fn displays_openai_compatible_base_url() {
|
||||
// A bare host gains the `/v1` suffix.
assert_eq!(
|
||||
openai_compatible_base_url("https://api.xiaomimimo.com"),
|
||||
"https://api.xiaomimimo.com/v1"
|
||||
);
|
||||
// An input that already ends in `/v1` is not given a second suffix.
assert_eq!(
|
||||
openai_compatible_base_url("https://api.xiaomimimo.com/v1"),
|
||||
"https://api.xiaomimimo.com/v1"
|
||||
);
|
||||
}
|
||||
|
||||
// Pins the metadata exposed by a `SpeechTool` constructed under the name
// "speech": its name, approval requirement, read-only flag, and a few strings
// that must appear in its input schema.
// NOTE(review): the meaning of the two `None` constructor arguments is not
// visible here — confirm against `SpeechTool::new`.
#[test]
|
||||
fn speech_tool_is_auto_approved_but_not_read_only() {
|
||||
let tool = SpeechTool::new("speech", None, None);
|
||||
assert_eq!(tool.name(), "speech");
|
||||
assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto);
|
||||
// The tool reports approval as `Auto` while still not being read-only.
assert!(!tool.is_read_only());
|
||||
let schema = tool.input_schema();
|
||||
// The serialized schema must mention the voice-clone model ID, the "pcm16"
// format, and "stream".
assert!(schema.to_string().contains("mimo-v2.5-tts-voiceclone"));
|
||||
assert!(schema.to_string().contains("pcm16"));
|
||||
assert!(schema.to_string().contains("stream"));
|
||||
}
|
||||
}
|
||||
@@ -332,6 +332,9 @@ fn picker_model_hint(id: &str) -> &'static str {
|
||||
}
|
||||
"arcee-ai/trinity-large-thinking" => "large thinking",
|
||||
"xiaomi/mimo-v2.5-pro" | "mimo-v2.5-pro" => "long context",
|
||||
"mimo-v2.5-tts" | "mimo-v2-tts" => "speech / TTS",
|
||||
"mimo-v2.5-tts-voicedesign" => "voice design",
|
||||
"mimo-v2.5-tts-voiceclone" => "voice clone",
|
||||
"minimax/minimax-m3" => "1M multimodal",
|
||||
_ => "provider model",
|
||||
}
|
||||
|
||||
@@ -781,6 +781,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
|
||||
prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
|
||||
memory_enabled: config.memory_enabled(),
|
||||
memory_path: config.memory_path(),
|
||||
speech_output_dir: config.speech_output_dir(),
|
||||
vision_config: config.vision_model_config(),
|
||||
strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
|
||||
goal_objective: app.hunt.quarry.clone(),
|
||||
|
||||
+7
-3
@@ -118,7 +118,7 @@ endpoint.
|
||||
| `wanjie-ark` | `[providers.wanjie_ark]` | `WANJIE_ARK_API_KEY`, `WANJIE_API_KEY`, `WANJIE_MAAS_API_KEY` | `WANJIE_ARK_BASE_URL`, `WANJIE_BASE_URL`, `WANJIE_MAAS_BASE_URL`; default `https://maas-openapi.wanjiedata.com/api/v1` | `deepseek-reasoner` | OpenAI-compatible hosted route. `WANJIE_ARK_MODEL`, `WANJIE_MODEL`, and `WANJIE_MAAS_MODEL` are accepted. |
|
||||
| `volcengine` | `[providers.volcengine]` | `VOLCENGINE_API_KEY`, `VOLCENGINE_ARK_API_KEY`, `ARK_API_KEY` | `VOLCENGINE_BASE_URL`, `VOLCENGINE_ARK_BASE_URL`, `ARK_BASE_URL`; default `https://ark.cn-beijing.volces.com/api/coding/v3` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | Volcengine/Volcano Engine Ark OpenAI-compatible coding endpoint. `VOLCENGINE_MODEL` and `VOLCENGINE_ARK_MODEL` are accepted. |
|
||||
| `openrouter` | `[providers.openrouter]` | `OPENROUTER_API_KEY` | `OPENROUTER_BASE_URL`; default `https://openrouter.ai/api/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`; recent large IDs include `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-35b-a3b`, `google/gemma-4-31b-it`, `z-ai/glm-5.1`, `moonshotai/kimi-k2.6` | Additive open-model routing layer. It does not replace DeepSeek; it lets users route supported model IDs through OpenRouter when they choose it. |
|
||||
| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. |
|
||||
| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. `codewhale speech` / `tts` uses the TTS models. |
|
||||
| `novita` | `[providers.novita]` | `NOVITA_API_KEY` | `NOVITA_BASE_URL`; default `https://api.novita.ai/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | OpenAI-compatible hosted route for DeepSeek model IDs. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
|
||||
| `fireworks` | `[providers.fireworks]` | `FIREWORKS_API_KEY` | `FIREWORKS_BASE_URL`; default `https://api.fireworks.ai/inference/v1` | `accounts/fireworks/models/deepseek-v4-pro` | OpenAI-compatible hosted route. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
|
||||
| `siliconflow` | `[providers.siliconflow]` | `SILICONFLOW_API_KEY` | `SILICONFLOW_BASE_URL`; default `https://api.siliconflow.com/v1` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | OpenAI-compatible hosted route. Official docs use the `.com` endpoint; users who need the regional endpoint can set `https://api.siliconflow.cn/v1` explicitly. `SILICONFLOW_MODEL` is accepted. Reasoning aliases `deepseek-reasoner` and `deepseek-r1` map to Pro; `deepseek-chat` and `deepseek-v3` map to Flash. |
|
||||
@@ -130,7 +130,11 @@ endpoint.
|
||||
### Xiaomi MiMo Notes
|
||||
|
||||
`xiaomi-mimo` defaults to `mimo-v2.5-pro` for long-context reasoning and coding
|
||||
work, while the static registry also exposes `mimo-v2.5`. Xiaomi's current
|
||||
work, while the static registry also exposes `mimo-v2.5`. Xiaomi MiMo TTS is
|
||||
available through `codewhale --provider xiaomi-mimo speech "text" --model tts`
|
||||
(or the `tts` alias) plus model-visible `speech` / `tts` tools in Agent/YOLO mode.
|
||||
Voice-design and voice-clone shorthands map to `mimo-v2.5-tts-voicedesign` and
|
||||
`mimo-v2.5-tts-voiceclone`. Xiaomi's current
|
||||
[image-understanding guide](https://platform.xiaomimimo.com/docs/en-US/usage-guide/multimodal-understanding/image-understanding)
|
||||
includes `mimo-v2.5` for image input. CodeWhale exposes image analysis through the
|
||||
separate `[vision_model]` / `image_analyze` path; set that model to
|
||||
@@ -164,7 +168,7 @@ endpoint when the endpoint supports model listing.
|
||||
| `wanjie-ark` | `deepseek-reasoner` | yes | yes |
|
||||
| `volcengine` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | yes | yes |
|
||||
| `openrouter` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`, `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `xiaomi/mimo-v2.5`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-27b`, `moonshotai/kimi-k2.6`, `z-ai/glm-5.1`, `tencent/hy3-preview`, `google/gemma-4-31b-it`, `google/gemma-4-26b-a4b-it`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free` | yes | yes |
|
||||
| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5` | yes | yes |
|
||||
| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | yes | yes for chat models; no for TTS models |
|
||||
| `novita` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | yes | yes |
|
||||
| `fireworks` | `accounts/fireworks/models/deepseek-v4-pro` | yes | yes |
|
||||
| `siliconflow` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | yes | yes |
|
||||
|
||||
Reference in New Issue
Block a user