diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs index 386b8a12..7215f6d8 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -476,6 +476,31 @@ fn parse_speech_audio_response(payload: &Value) -> Result<(Vec, Option, + audio: Value, +) -> Value { + let mut messages = Vec::new(); + if let Some(instruction) = instruction.map(str::trim).filter(|value| !value.is_empty()) { + messages.push(json!({ + "role": "user", + "content": instruction, + })); + } + messages.push(json!({ + "role": "assistant", + "content": text, + })); + + json!({ + "model": model, + "messages": messages, + "audio": audio, + }) +} + // === DeepSeekClient === /// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value @@ -773,20 +798,7 @@ impl DeepSeekClient { audio["voice"] = json!(voice); } - let body = json!({ - "model": model, - "messages": [ - { - "role": "user", - "content": instruction.unwrap_or(""), - }, - { - "role": "assistant", - "content": text, - } - ], - "audio": audio, - }); + let body = build_speech_synthesis_body(&model, &text, instruction, audio); let url = api_url(&self.base_url, "chat/completions"); let response = self @@ -1366,6 +1378,53 @@ mod tests { assert_eq!(transcript, None); } + #[test] + fn speech_synthesis_body_omits_user_message_without_instruction() { + let body = + build_speech_synthesis_body("mimo-v2.5-tts", "hello", None, json!({"format": "wav"})); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 1); + assert_eq!(messages[0]["role"], "assistant"); + assert_eq!(messages[0]["content"], "hello"); + assert!( + messages + .iter() + .all(|message| message["content"].as_str() != Some("")) + ); + } + + #[test] + fn speech_synthesis_body_ignores_blank_instruction() { + let body = build_speech_synthesis_body( + "mimo-v2.5-tts", + "hello", + Some(" \t\n "), + json!({"format": "wav"}), + ); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 1); + assert_eq!(messages[0]["role"], "assistant"); + } + + #[test] + fn speech_synthesis_body_includes_non_empty_instruction_first() { + let body = build_speech_synthesis_body( + "mimo-v2.5-tts-voicedesign", + "hello", + Some("warm and calm"), + json!({"format": "wav"}), + ); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 2); + assert_eq!(messages[0]["role"], "user"); + assert_eq!(messages[0]["content"], "warm and calm"); + assert_eq!(messages[1]["role"], "assistant"); + assert_eq!(messages[1]["content"], "hello"); + } + #[test] fn tool_name_roundtrip_dot() { let original = "multi_tool_use.parallel"; diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 34c0c6ce..54ba0c24 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -992,6 +992,7 @@ impl Engine { ) .with_max_spawn_depth(self.config.max_spawn_depth) .with_step_api_timeout(self.config.subagent_api_timeout) + .with_speech_output_dir(self.config.speech_output_dir.clone()) .with_mcp_pool(mcp_pool) .background_runtime(); let route = resolve_subagent_assignment_route( @@ -1496,6 +1497,7 @@ impl Engine { ) .with_max_spawn_depth(self.config.max_spawn_depth) .with_step_api_timeout(self.config.subagent_api_timeout) + .with_speech_output_dir(self.config.speech_output_dir.clone()) .with_mcp_pool(mcp_pool.clone()) .with_parent_completion_tx(self.tx_subagent_completion.clone()); if let Some(context) = fork_context_for_runtime.clone() { diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index 3428c6d1..178b06f1 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -6,7 +6,6 @@ use std::process::{Command, Stdio}; use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; -use base64::{Engine as _, engine::general_purpose}; use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum}; use clap_complete::{Shell, generate}; use dotenvy::dotenv; @@ -3568,7 +3567,12 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> { async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { use crate::client::{DeepSeekClient, SpeechSynthesisRequest}; - use crate::config::{ApiProvider, normalize_model_name_for_provider}; + use crate::config::ApiProvider; + use crate::tools::speech::{ + DEFAULT_VOICE, SPEECH_MODEL_EXAMPLES, combine_speech_instructions, + default_speech_output_name, describe_speech_voice, encode_voice_clone_sample_data_uri, + infer_speech_model, normalize_speech_format, + }; let SpeechArgs { text, @@ -3600,24 +3604,16 @@ async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { if clone_voice.is_some() && voice.is_some() { bail!("Use either --clone-voice or --voice for cloned voice data, not both"); } - let model = match model { - Some(value) => { - normalize_model_name_for_provider(ApiProvider::XiaomiMimo, &value).unwrap_or(value) - } - None => { - if clone_voice.is_some() || voice_is_data_uri { - "mimo-v2.5-tts-voiceclone".to_string() - } else if voice_prompt.is_some() { - "mimo-v2.5-tts-voicedesign".to_string() - } else { - "mimo-v2.5-tts".to_string() - } - } - }; + let model = infer_speech_model( + model.as_deref(), + clone_voice.is_some() || voice_is_data_uri, + voice_prompt.is_some(), + ); let model_lower = model.to_ascii_lowercase(); if !model_lower.contains("tts") { bail!( - "speech requires a TTS model (examples: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone); got {model}" + "speech requires a TTS model (examples: {}); got {model}", + SPEECH_MODEL_EXAMPLES.join(", ") ); } let is_voice_design = model_lower.contains("voicedesign"); @@ -3635,7 +3631,7 @@ async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { } let voice = if let Some(clone_path) = clone_voice { - Some(encode_voice_clone_data_uri(&clone_path)?) + Some(encode_voice_clone_sample_data_uri(&clone_path)?) } else if is_voice_design { None } else if let Some(value) = voice.filter(|value| !value.trim().is_empty()) { @@ -3643,16 +3639,17 @@ async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { } else if is_voice_clone { bail!("mimo-v2.5-tts-voiceclone requires --clone-voice or --voice "); } else { - Some("mimo_default".to_string()) + Some(DEFAULT_VOICE.to_string()) }; let format = normalize_speech_format(&format).with_context(|| { format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)") })?; - let output = resolve_speech_output_path( - output, - output_dir.or_else(|| config.speech_output_dir()), - &format, - ); + let output = output.unwrap_or_else(|| { + output_dir + .or_else(|| config.speech_output_dir()) + .unwrap_or_default() + .join(default_speech_output_name(&format)) + }); let client = DeepSeekClient::new(config)?; let response = client @@ -3699,99 +3696,12 @@ async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { Ok(()) } -fn combine_speech_instructions( - instruction: Option, - voice_prompt: Option, -) -> Option { - match (instruction, voice_prompt) { - (Some(instruction), Some(voice_prompt)) => { - let instruction = instruction.trim(); - let voice_prompt = voice_prompt.trim(); - if instruction.is_empty() { - Some(voice_prompt.to_string()).filter(|value| !value.is_empty()) - } else if voice_prompt.is_empty() { - Some(instruction.to_string()).filter(|value| !value.is_empty()) - } else { - Some(format!("{voice_prompt}\n\n{instruction}")) - } - } - (Some(value), None) | (None, Some(value)) => { - let value = value.trim().to_string(); - if value.is_empty() { None } else { Some(value) } - } - (None, None) => None, - } -} - -const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024; - -fn normalize_speech_format(format: &str) -> Option { - let normalized = format.trim().to_ascii_lowercase(); - match normalized.as_str() { - "wav" | "mp3" | "pcm16" => Some(normalized), - "pcm" => Some("pcm16".to_string()), - _ => None, - } -} - -fn default_speech_output_name(format: &str) -> String { - format!( - "speech.{}", - normalize_speech_format(format).as_deref().unwrap_or("wav") - ) -} - -fn resolve_speech_output_path( - output: Option, - output_dir: Option, - format: &str, -) -> PathBuf { - output.unwrap_or_else(|| { - output_dir - .unwrap_or_default() - .join(default_speech_output_name(format)) - }) -} - -fn encode_voice_clone_data_uri(path: &Path) -> Result { - let bytes = std::fs::read(path) - .with_context(|| format!("Failed to read voice clone sample {}", path.display()))?; - let base64_audio = general_purpose::STANDARD.encode(bytes); - if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES { - bail!( - "Voice clone sample is too large after base64 encoding ({} bytes > 10 MB)", - base64_audio.len() - ); - } - - let extension = path - .extension() - .and_then(|value| value.to_str()) - .unwrap_or_default() - .to_ascii_lowercase(); - let mime = match extension.as_str() { - "mp3" => "audio/mpeg", - "wav" => "audio/wav", - other => bail!( - "Unsupported voice clone sample extension '{}'. Use .mp3 or .wav.", - other - ), - }; - - Ok(format!("data:{mime};base64,{base64_audio}")) -} - -fn describe_speech_voice(voice: &str) -> String { - if voice.starts_with("data:") { - "embedded voice clone sample".to_string() - } else { - voice.to_string() - } -} - #[cfg(test)] mod speech_cli_tests { use super::*; + use crate::tools::speech::{ + default_speech_output_name, infer_speech_model, normalize_speech_format, + }; #[test] fn normalizes_documented_speech_formats() { @@ -3804,18 +3714,52 @@ mod speech_cli_tests { #[test] fn default_speech_output_tracks_requested_format() { assert_eq!( - resolve_speech_output_path(None, None, "mp3"), + PathBuf::from(default_speech_output_name("mp3")), PathBuf::from("speech.mp3") ); assert_eq!( - resolve_speech_output_path(None, Some(PathBuf::from("audio")), "pcm"), + PathBuf::from("audio").join(default_speech_output_name("pcm")), PathBuf::from("audio").join("speech.pcm16") ); assert_eq!( - resolve_speech_output_path(Some(PathBuf::from("custom.wav")), None, "mp3"), + Some(PathBuf::from("custom.wav")) + .unwrap_or_else(|| PathBuf::from(default_speech_output_name("mp3"))), PathBuf::from("custom.wav") ); } + + #[test] + fn speech_command_parses_cli_passthrough_smoke() { + let cli = Cli::try_parse_from([ + "codewhale-tui", + "speech", + "hello", + "--model", + "tts", + "--format", + "pcm", + "--output-dir", + "audio", + "--voice", + "Mia", + ]) + .expect("speech command parses"); + + let Some(Commands::Speech(args)) = cli.command else { + panic!("expected speech command"); + }; + assert_eq!(args.text, "hello"); + assert_eq!( + infer_speech_model(args.model.as_deref(), false, false), + "mimo-v2.5-tts" + ); + assert_eq!( + normalize_speech_format(&args.format).as_deref(), + Some("pcm16") + ); + assert_eq!(args.output_dir, Some(PathBuf::from("audio"))); + assert_eq!(args.voice.as_deref(), Some("Mia")); + } } /// Test API connectivity by making a minimal request diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index b8efe51e..8dcf0e54 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -975,12 +975,13 @@ impl ToolRegistryBuilder { plan_state: super::plan::SharedPlanState, ) -> Self { let speech_client = client.clone(); + let speech_output_dir = runtime.speech_output_dir.clone(); self.with_agent_tools(allow_shell) .with_todo_tool(todo_list) .with_plan_tool(plan_state) .with_review_tool(client.clone(), model.clone()) .with_rlm_tool(client, model) - .with_speech_tools(speech_client, None) + .with_speech_tools(speech_client, speech_output_dir) .with_recall_archive_tool() .with_subagent_tools(manager, runtime) } diff --git a/crates/tui/src/tools/speech.rs b/crates/tui/src/tools/speech.rs index 92550e69..9c690512 100644 --- a/crates/tui/src/tools/speech.rs +++ b/crates/tui/src/tools/speech.rs @@ -6,6 +6,7 @@ use std::path::{Path, PathBuf}; +use anyhow::Context as _; use async_trait::async_trait; use base64::{Engine as _, engine::general_purpose}; use serde_json::{Value, json}; @@ -19,23 +20,19 @@ use super::spec::{ optional_bool, optional_str, required_str, }; -const DEFAULT_FORMAT: &str = "wav"; -const DEFAULT_VOICE: &str = "mimo_default"; +pub(crate) const DEFAULT_FORMAT: &str = "wav"; +pub(crate) const DEFAULT_VOICE: &str = "mimo_default"; const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024; -const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"]; +pub(crate) const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"]; pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[ - "mimo-v2.5-pro", - "mimo-v2.5", "mimo-v2.5-tts-voiceclone", "mimo-v2.5-tts-voicedesign", "mimo-v2.5-tts", - "mimo-v2-pro", - "mimo-v2-omni", "mimo-v2-tts", ]; -const SPEECH_MODEL_EXAMPLES: &[&str] = &[ +pub(crate) const SPEECH_MODEL_EXAMPLES: &[&str] = &[ "mimo-v2.5-tts", "mimo-v2.5-tts-voicedesign", "mimo-v2.5-tts-voiceclone", @@ -302,7 +299,7 @@ impl ToolSpec for SpeechTool { } } -fn infer_speech_model( +pub(crate) fn infer_speech_model( model: Option<&str>, has_clone_voice: bool, has_voice_prompt: bool, @@ -316,7 +313,7 @@ fn infer_speech_model( } } -fn combine_speech_instructions( +pub(crate) fn combine_speech_instructions( instruction: Option, voice_prompt: Option, ) -> Option { @@ -340,7 +337,7 @@ fn combine_speech_instructions( } } -fn normalize_speech_format(format: &str) -> Option { +pub(crate) fn normalize_speech_format(format: &str) -> Option { let normalized = format.trim().to_ascii_lowercase(); match normalized.as_str() { "wav" | "mp3" | "pcm16" => Some(normalized), @@ -349,7 +346,7 @@ fn normalize_speech_format(format: &str) -> Option { } } -fn default_speech_output_name(format: &str) -> String { +pub(crate) fn default_speech_output_name(format: &str) -> String { format!( "speech.{}", normalize_speech_format(format) @@ -391,12 +388,25 @@ async fn encode_voice_clone_data_uri(path: &Path) -> Result { path.display() )) })?; + + voice_clone_data_uri_from_bytes(path, &bytes) + .map_err(|err| ToolError::invalid_input(err.to_string())) +} + +pub(crate) fn encode_voice_clone_sample_data_uri(path: &Path) -> anyhow::Result { + let bytes = std::fs::read(path) + .with_context(|| format!("Failed to read voice clone sample {}", path.display()))?; + + voice_clone_data_uri_from_bytes(path, &bytes) +} + +fn voice_clone_data_uri_from_bytes(path: &Path, bytes: &[u8]) -> anyhow::Result { let base64_audio = general_purpose::STANDARD.encode(bytes); if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES { - return Err(ToolError::invalid_input(format!( + anyhow::bail!( "voice clone sample is too large after base64 encoding ({} bytes > 10 MB)", base64_audio.len() - ))); + ); } let extension = path @@ -408,16 +418,14 @@ async fn encode_voice_clone_data_uri(path: &Path) -> Result { "mp3" => "audio/mpeg", "wav" => "audio/wav", other => { - return Err(ToolError::invalid_input(format!( - "unsupported voice clone sample extension '{other}'. Use .mp3 or .wav." - ))); + anyhow::bail!("unsupported voice clone sample extension '{other}'. Use .mp3 or .wav."); } }; Ok(format!("data:{mime};base64,{base64_audio}")) } -fn describe_speech_voice(voice: &str) -> String { +pub(crate) fn describe_speech_voice(voice: &str) -> String { if voice.starts_with("data:") { "embedded voice clone sample".to_string() } else { @@ -502,6 +510,37 @@ mod tests { assert_eq!(normalize_speech_format("flac"), None); } + #[test] + fn supported_xiaomi_mimo_speech_models_are_tts_only() { + assert!( + SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS + .iter() + .all(|model| model.to_ascii_lowercase().contains("tts")), + "model-visible speech list must not include chat-only MiMo models" + ); + assert!(SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-tts")); + assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-pro")); + assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5")); + } + + #[test] + fn configured_output_dir_is_used_for_default_tool_output() { + let tmp = tempfile::tempdir().expect("tempdir"); + let context = ToolContext::new(tmp.path().to_path_buf()); + let configured = tmp.path().join("speech-artifacts"); + + let output = resolve_speech_output_path( + &json!({"text": "hello"}), + &context, + None, + "pcm", + Some(&configured), + ) + .expect("output path"); + + assert_eq!(output, configured.join("speech.pcm16")); + } + #[test] fn displays_openai_compatible_base_url() { assert_eq!( diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs index 3ab494b5..55b74985 100644 --- a/crates/tui/src/tools/subagent/mod.rs +++ b/crates/tui/src/tools/subagent/mod.rs @@ -800,6 +800,10 @@ pub struct SubAgentRuntime { /// false-timeout the child mid-thinking. `child_runtime()` and /// `background_runtime()` preserve the parent's value (#1806, #1808). pub step_api_timeout: Duration, + /// Default directory for Xiaomi MiMo speech/TTS tool outputs inherited by + /// child registries. Keeps parent and sub-agent `speech` / `tts` tools on + /// the same `[speech].output_dir` / env override. + pub speech_output_dir: Option, } impl SubAgentRuntime { @@ -835,6 +839,7 @@ impl SubAgentRuntime { fork_context: None, mcp_pool: None, step_api_timeout: DEFAULT_STEP_API_TIMEOUT, + speech_output_dir: None, } } @@ -858,6 +863,13 @@ impl SubAgentRuntime { self } + /// Preserve the configured speech output directory for sub-agent tools. + #[must_use] + pub fn with_speech_output_dir(mut self, output_dir: Option) -> Self { + self.speech_output_dir = output_dir; + self + } + /// Attach the wakeup channel so the engine's parent turn loop can resume /// when this runtime's direct children finish (issue #756). The channel /// is propagated to descendants via clone, but only `spawn_depth == 1` @@ -980,6 +992,7 @@ impl SubAgentRuntime { fork_context: self.fork_context.clone(), mcp_pool: self.mcp_pool.clone(), step_api_timeout: self.step_api_timeout, + speech_output_dir: self.speech_output_dir.clone(), } } diff --git a/crates/tui/src/tools/subagent/tests.rs b/crates/tui/src/tools/subagent/tests.rs index a2039a46..2fd3a51a 100644 --- a/crates/tui/src/tools/subagent/tests.rs +++ b/crates/tui/src/tools/subagent/tests.rs @@ -1805,6 +1805,7 @@ fn stub_runtime() -> SubAgentRuntime { fork_context: None, mcp_pool: None, step_api_timeout: DEFAULT_STEP_API_TIMEOUT, + speech_output_dir: None, } } @@ -2036,6 +2037,16 @@ fn emit_parent_completion_fires_for_direct_child() { assert!(rx.try_recv().is_err(), "should be exactly one message"); } +#[test] +fn child_runtime_inherits_speech_output_dir() { + let output_dir = PathBuf::from("configured-speech-output"); + let runtime = stub_runtime().with_speech_output_dir(Some(output_dir.clone())); + + let child = runtime.child_runtime(); + + assert_eq!(child.speech_output_dir, Some(output_dir)); +} + #[test] fn emit_parent_completion_skips_grandchildren() { let (tx, mut rx) = mpsc::unbounded_channel::();