From 9b31621b195d62454b2bdc3e288b169308a9f7b7 Mon Sep 17 00:00:00 2001 From: CodeWhale Agent Date: Fri, 12 Jun 2026 14:43:27 -0700 Subject: [PATCH] Harvest PR #3051: voice input commands and hotbar integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port /voice, /voice-send, and /voice-control into the command strategy registry as groups/core/voice.rs. The handlers only flip App state (voice_enabled, voice_send_enabled, voice_control_enabled) and emit the new AppAction::VoiceCapture; the UI event loop performs the actual record + transcribe cycle so credentials come from the live Config (deepseek_api_key/deepseek_base_url) instead of auth fields cached on App, and no audio is ever recorded by the registry smoke tests. - voice.toggle hotbar action dispatches the real /voice command and reports voice_enabled as its active state, replacing the placeholder. - Recording uses sox/rec/arecord with RMS-based silence detection; transcription posts input_audio blocks to the provider chat completions API (async reqwest — the blocking client would panic inside the tokio event loop). - Transcripts insert at the composer cursor via App::insert_str. With /voice-send enabled, a transcript ending in "send it" / 发送 strips the suffix and submits; a bare "send it" submits the current composer content. With /voice-control enabled, transcription runs through the AI dictation pipeline that sees the composer text. - Failures (no recorder, no API key, short recording, network) surface as localized status messages and disarm voice input. - Localized command help and status strings for all seven shipped locales; /voice now appears in the command palette. 
Harvested from PR #3051 by @huqiantao Co-authored-by: huqiantao Co-authored-by: Hunter B Co-Authored-By: Claude Fable 5 --- crates/tui/src/commands/groups/core/mod.rs | 36 ++ crates/tui/src/commands/groups/core/voice.rs | 541 +++++++++++++++++++ crates/tui/src/commands/mod.rs | 46 ++ crates/tui/src/localization.rs | 211 ++++++++ crates/tui/src/tui/app.rs | 16 + crates/tui/src/tui/command_palette.rs | 2 +- crates/tui/src/tui/hotbar/actions.rs | 40 +- crates/tui/src/tui/ui.rs | 22 + 8 files changed, 903 insertions(+), 11 deletions(-) create mode 100644 crates/tui/src/commands/groups/core/voice.rs diff --git a/crates/tui/src/commands/groups/core/mod.rs b/crates/tui/src/commands/groups/core/mod.rs index ea3d1adc..8ae05be9 100644 --- a/crates/tui/src/commands/groups/core/mod.rs +++ b/crates/tui/src/commands/groups/core/mod.rs @@ -10,6 +10,7 @@ mod hooks; mod provider; mod queue; mod stash; +pub mod voice; pub(in crate::commands) use self::core::reset_conversation_state; @@ -43,6 +44,9 @@ impl CommandGroup for CoreCommands { Box::new(FunctionCommand::new(&PROFILE_INFO, run_profile)), Box::new(FunctionCommand::new(&RLM_INFO, run_rlm)), Box::new(FunctionCommand::new(&TRANSLATE_INFO, run_translate)), + Box::new(FunctionCommand::new(&VOICE_INFO, run_voice)), + Box::new(FunctionCommand::new(&VOICE_SEND_INFO, run_voice_send)), + Box::new(FunctionCommand::new(&VOICE_CONTROL_INFO, run_voice_control)), ] } } @@ -167,6 +171,24 @@ static TRANSLATE_INFO: CommandInfo = CommandInfo { usage: "/translate", description_id: MessageId::CmdTranslateDescription, }; +static VOICE_INFO: CommandInfo = CommandInfo { + name: "voice", + aliases: &["yuyin", "语音"], + usage: "/voice", + description_id: MessageId::CmdVoiceDescription, +}; +static VOICE_SEND_INFO: CommandInfo = CommandInfo { + name: "voicesend", + aliases: &["voice-send", "yuyinsend", "语音发送"], + usage: "/voicesend", + description_id: MessageId::CmdVoiceSendDescription, +}; +static VOICE_CONTROL_INFO: CommandInfo = CommandInfo { + name: 
"voicecontrol", + aliases: &["voice-control", "yuyincontrol", "语音控制"], + usage: "/voicecontrol", + description_id: MessageId::CmdVoiceControlDescription, +}; fn run_registered(app: &mut App, name: &str, arg: Option<&str>) -> CommandResult { dispatch(app, name, arg).expect("registered core command should dispatch") @@ -232,6 +254,15 @@ fn run_rlm(app: &mut App, arg: Option<&str>) -> CommandResult { fn run_translate(app: &mut App, arg: Option<&str>) -> CommandResult { run_registered(app, "translate", arg) } +fn run_voice(app: &mut App, arg: Option<&str>) -> CommandResult { + run_registered(app, "voice", arg) +} +fn run_voice_send(app: &mut App, arg: Option<&str>) -> CommandResult { + run_registered(app, "voicesend", arg) +} +fn run_voice_control(app: &mut App, arg: Option<&str>) -> CommandResult { + run_registered(app, "voicecontrol", arg) +} pub(in crate::commands) fn dispatch( app: &mut App, @@ -259,6 +290,11 @@ pub(in crate::commands) fn dispatch( "profile" | "dangan" => core::profile_switch(app, arg), "rlm" | "recursive" | "digui" => rlm(app, arg), "translate" | "translation" | "transale" => core::translate(app), + "voice" | "yuyin" | "语音" => voice::voice(app), + "voicesend" | "voice-send" | "yuyinsend" | "语音发送" => voice::voice_send(app), + "voicecontrol" | "voice-control" | "yuyincontrol" | "语音控制" => { + voice::voice_control(app) + } _ => return None, }; Some(result) diff --git a/crates/tui/src/commands/groups/core/voice.rs b/crates/tui/src/commands/groups/core/voice.rs new file mode 100644 index 00000000..5d6e9472 --- /dev/null +++ b/crates/tui/src/commands/groups/core/voice.rs @@ -0,0 +1,541 @@ +//! Voice input commands — `/voice`, `/voice-send`, `/voice-control`. +//! +//! Records audio from the default microphone, sends it to the configured +//! provider's API for transcription, and inserts the transcribed text into +//! the composer. The interaction model mirrors MiMo Code's voice UX: +//! +//! 
`/voice` — toggle voice input on/off (records when toggled on) +//! `/voice-send` — toggle auto-send when the transcript ends with +//! "send it" / "发送" +//! `/voice-control` — toggle AI-assisted dictation that sees the current +//! composer text +//! +//! The slash commands only flip state and emit [`AppAction::VoiceCapture`]; +//! the actual capture runs in the UI event loop where the live [`Config`] +//! supplies provider credentials. That keeps the handlers side-effect free +//! (the registry smoke tests execute every command) and avoids caching +//! auth material on [`App`]. +//! +//! ## Recording +//! +//! Uses platform-specific command-line tools (sox, rec, arecord) to capture +//! 16kHz mono 16-bit PCM audio. Records until a silence gap is detected or +//! the maximum duration is reached (default 10 s). + +use std::process::{Command, Stdio}; +use std::sync::LazyLock; +use std::time::Duration; + +use regex::Regex; + +use crate::commands::CommandResult; +use crate::config::Config; +use crate::localization::{MessageId, tr}; +use crate::tui::app::{App, AppAction}; + +/// Transcription model requested from the provider's chat-completions API. +const ASR_MODEL: &str = "mimo-v2.5-asr"; +/// Model used for the AI-assisted voice-control pipeline. +const VOICE_CONTROL_MODEL: &str = "mimo-v2.5"; + +// --- Recorder detection ---------------------------------------------------- + +/// Platform-specific recorder definitions. +#[derive(Debug, Clone)] +struct Recorder { + cmd: &'static str, + /// CLI arguments for piping raw 16kHz mono S16_LE PCM to stdout. 
+ pipe_args: &'static [&'static str], +} + +fn detect_recorder() -> Option<Recorder> { + let candidates: &[Recorder] = if cfg!(target_os = "macos") { + &[ + Recorder { + cmd: "sox", + pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"], + }, + Recorder { + cmd: "rec", + pipe_args: &["-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"], + }, + ] + } else if cfg!(target_os = "linux") { + &[ + Recorder { + cmd: "arecord", + pipe_args: &["-f", "S16_LE", "-r", "16000", "-c", "1", "-t", "raw"], + }, + Recorder { + cmd: "sox", + pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"], + }, + ] + } else if cfg!(target_os = "windows") { + &[Recorder { + cmd: "sox", + pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"], + }] + } else { + &[] + }; + + candidates + .iter() + .find(|r| { + Command::new(r.cmd) + .arg("--version") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .is_ok() + }) + .cloned() +} + +/// Check whether voice recording is available on this system. +pub fn is_available() -> bool { + detect_recorder().is_some() +} + +// --- WAV encoding ---------------------------------------------------------- + +/// Encode raw 16kHz mono S16_LE PCM samples as a WAV buffer. 
+fn encode_wav(samples: &[i16]) -> Vec<u8> { + let data_size = (samples.len() * 2) as u32; + let sample_rate: u32 = 16000; + let mut buf = Vec::with_capacity(44 + data_size as usize); + + // RIFF header + buf.extend_from_slice(b"RIFF"); + buf.extend_from_slice(&(36 + data_size).to_le_bytes()); + buf.extend_from_slice(b"WAVE"); + + // fmt chunk + buf.extend_from_slice(b"fmt "); + buf.extend_from_slice(&16u32.to_le_bytes()); // chunk size + buf.extend_from_slice(&1u16.to_le_bytes()); // PCM + buf.extend_from_slice(&1u16.to_le_bytes()); // mono + buf.extend_from_slice(&sample_rate.to_le_bytes()); + buf.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // byte rate + buf.extend_from_slice(&2u16.to_le_bytes()); // block align + buf.extend_from_slice(&16u16.to_le_bytes()); // bits per sample + + // data chunk + buf.extend_from_slice(b"data"); + buf.extend_from_slice(&data_size.to_le_bytes()); + for &sample in samples { + buf.extend_from_slice(&sample.to_le_bytes()); + } + + buf +} + +// --- Recording ------------------------------------------------------------- + +/// Maximum recording duration in seconds before auto-stopping. +const MAX_RECORD_SECS: u64 = 10; +/// Minimum segment duration in seconds to consider as valid speech. +const MIN_SEGMENT_SECS: f64 = 0.3; + +/// Record audio from the default microphone. +/// +/// Returns raw 16kHz mono S16_LE PCM samples. Returns `None` if no recorder +/// is available, the recording failed, or no speech was detected. 
+fn record_audio() -> Option<(Vec<i16>, Duration)> { + let recorder = detect_recorder()?; + let start = std::time::Instant::now(); + + let mut child = Command::new(recorder.cmd) + .args(recorder.pipe_args) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + .ok()?; + + let stdout = child.stdout.take()?; + let mut reader = std::io::BufReader::new(stdout); + let mut all_samples: Vec<i16> = Vec::with_capacity(16000 * MAX_RECORD_SECS as usize); + + // Read until timeout or silence + let mut buf = [0u8; 320]; // 10ms of 16kHz S16_LE + let max_duration = Duration::from_secs(MAX_RECORD_SECS); + let mut silence_samples = 0u32; + let mut had_speech = false; + let speech_threshold: i16 = 500; // RMS-based speech detection threshold + let silence_duration_samples = 16000u32; // 1 second of silence to stop + + loop { + use std::io::Read; + match reader.read_exact(&mut buf) { + Ok(()) => { + let chunk: Vec<i16> = buf + .chunks_exact(2) + .map(|b| i16::from_le_bytes([b[0], b[1]])) + .collect(); + + // Simple RMS-based VAD + let rms = (chunk.iter().map(|&s| (s as f64) * (s as f64)).sum::<f64>() + / chunk.len() as f64) + .sqrt(); + let is_speech = rms > speech_threshold as f64; + + if is_speech { + had_speech = true; + silence_samples = 0; + } else if had_speech { + silence_samples += chunk.len() as u32; + } + + if had_speech { + all_samples.extend_from_slice(&chunk); + } + + if start.elapsed() > max_duration { + let _ = child.kill(); + break; + } + if had_speech && silence_samples >= silence_duration_samples { + let _ = child.kill(); + break; + } + } + Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, + Err(_) => { + let _ = child.kill(); + break; + } + } + } + + let _ = child.wait(); + let elapsed = start.elapsed(); + + let min_samples = (MIN_SEGMENT_SECS * 16000.0) as usize; + if all_samples.len() < min_samples { + return None; + } + + Some((all_samples, elapsed)) +} + +// --- Auto-send suffix 
------------------------------------------------------ + +/// Matches an explicit send instruction at the end of transcribed text: +/// "send it" (any spacing/case) or 发送/發送, with trailing punctuation. +static SEND_SUFFIX_RE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"(?i)(?:^|[\s,,.。!!??]+)(?:send\s*it|发送|發送)[\s.。!!??]*$").unwrap() +}); + +/// Split a transcript into the message remainder and whether it ended with an +/// explicit send instruction. `"ship the fix, send it"` → `("ship the fix", true)`. +fn split_send_suffix(text: &str) -> (&str, bool) { + match SEND_SUFFIX_RE.find(text) { + Some(found) => (text[..found.start()].trim(), true), + None => (text.trim(), false), + } +} + +// --- Transcription --------------------------------------------------------- + +fn base64_encode(data: &[u8]) -> String { + use base64::Engine; + base64::engine::general_purpose::STANDARD.encode(data) +} + +fn chat_completions_url(base_url: &str) -> String { + format!("{}/chat/completions", base_url.trim_end_matches('/')) +} + +async fn post_chat_completions( + api_key: &str, + base_url: &str, + body: serde_json::Value, +) -> Result<serde_json::Value, String> { + let client = crate::tls::reqwest_client(); + let resp = client + .post(chat_completions_url(base_url)) + .header("Content-Type", "application/json") + .header("Authorization", format!("Bearer {api_key}")) + .timeout(Duration::from_secs(30)) + .json(&body) + .send() + .await + .map_err(|e| format!("request failed: {e}"))?; + + if !resp.status().is_success() { + return Err(format!("API returned status {}", resp.status())); + } + + resp.json() + .await + .map_err(|e| format!("failed to parse response: {e}")) +} + +/// Send audio to the provider's API for plain transcription. +/// +/// Uses the chat completions endpoint with `input_audio` content blocks. 
+async fn transcribe( + api_key: &str, + base_url: &str, + audio_samples: &[i16], +) -> Result<String, String> { + let wav = encode_wav(audio_samples); + let data_url = format!("data:audio/wav;base64,{}", base64_encode(&wav)); + + let body = serde_json::json!({ + "model": ASR_MODEL, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": data_url + } + } + ] + } + ], + "asr_options": { + "language": "auto" + } + }); + + let data = post_chat_completions(api_key, base_url, body).await?; + data["choices"][0]["message"]["content"] + .as_str() + .map(|s| s.trim().to_string()) + .ok_or_else(|| "no transcription in response".to_string()) +} + +/// Process audio through the voice-control pipeline: AI-assisted dictation +/// that sees the current composer text, mirroring MiMo Code's +/// `processVoiceControl`. Used when `/voice-control` is enabled. +async fn process_voice_control( + api_key: &str, + base_url: &str, + audio_samples: &[i16], + current_text: &str, +) -> Result<String, String> { + let wav = encode_wav(audio_samples); + let data_url = format!("data:audio/wav;base64,{}", base64_encode(&wav)); + + let user_context = serde_json::json!({ + "current_text": current_text, + "cursor": "end", + }); + + let body = serde_json::json!({ + "model": VOICE_CONTROL_MODEL, + "messages": [ + { + "role": "system", + "content": "You are a voice input assistant. Transcribe the user's speech. Output JSON: {\"text\": \"transcribed text\"}." 
+ }, + { + "role": "user", + "content": [ + { "type": "text", "text": user_context.to_string() }, + { "type": "input_audio", "input_audio": { "data": data_url } } + ] + } + ], + "response_format": { "type": "json_object" } + }); + + let data = post_chat_completions(api_key, base_url, body).await?; + let content = data["choices"][0]["message"]["content"] + .as_str() + .ok_or_else(|| "no response content".to_string())?; + + let parsed: serde_json::Value = serde_json::from_str(content) + .map_err(|e| format!("failed to parse voice control JSON: {e}"))?; + + parsed["text"] + .as_str() + .map(|s| s.to_string()) + .ok_or_else(|| "no text field in voice control response".to_string()) +} + +// --- Capture orchestration (UI event loop) --------------------------------- + +/// What the UI should do with a finished capture. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum VoiceCaptureOutcome { + /// Insert the transcribed text into the composer at the cursor. + Insert(String), + /// Submit this text as a message (auto-send). + Send(String), +} + +/// Perform a complete record + transcribe cycle. +/// +/// Runs in the UI event loop (see [`AppAction::VoiceCapture`]) so provider +/// credentials come from the live [`Config`] rather than state cached on +/// [`App`]. Recording happens on a blocking thread; transcription uses the +/// shared async HTTP client. Every failure path returns a localized message +/// so callers can surface it as a status line. 
+pub async fn capture_and_transcribe( + app: &mut App, + config: &Config, +) -> Result<VoiceCaptureOutcome, String> { + let locale = app.ui_locale; + + if !is_available() { + return Err(tr(locale, MessageId::VoiceErrNoRecorder).to_string()); + } + let api_key = config + .deepseek_api_key() + .map_err(|_| tr(locale, MessageId::VoiceErrNoAuth).to_string())?; + let base_url = config.deepseek_base_url(); + + app.status_message = Some(tr(locale, MessageId::VoiceRecording).to_string()); + let (samples, _duration) = tokio::task::spawn_blocking(record_audio) + .await + .ok() + .flatten() + .ok_or_else(|| tr(locale, MessageId::VoiceErrTooShort).to_string())?; + + app.status_message = Some(tr(locale, MessageId::VoiceProcessing).to_string()); + let text = if app.voice_control_enabled { + process_voice_control(&api_key, &base_url, &samples, &app.composer.input).await + } else { + transcribe(&api_key, &base_url, &samples).await + } + .map_err(|e| format!("{}: {e}", tr(locale, MessageId::VoiceErrNetwork)))?; + + let clean = text.trim(); + if app.voice_send_enabled { + let (remainder, wants_send) = split_send_suffix(clean); + if wants_send { + // A bare "send it" submits whatever is already in the composer. + let outgoing = if remainder.is_empty() { + let existing = app.composer.input.trim().to_string(); + if !existing.is_empty() { + app.clear_input(); + } + existing + } else { + remainder.to_string() + }; + if outgoing.is_empty() { + return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string()); + } + return Ok(VoiceCaptureOutcome::Send(outgoing)); + } + } + if clean.is_empty() { + return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string()); + } + Ok(VoiceCaptureOutcome::Insert(clean.to_string())) +} + +// --- Command handlers ------------------------------------------------------ + +/// Handle the `/voice` command: toggle voice input. Toggling on requests a +/// one-shot recording + transcription via [`AppAction::VoiceCapture`]. 
+pub fn voice(app: &mut App) -> CommandResult { + let locale = app.ui_locale; + + if app.voice_enabled { + app.voice_enabled = false; + return CommandResult::message(tr(locale, MessageId::VoiceDisabled)); + } + if !is_available() { + return CommandResult::error(tr(locale, MessageId::VoiceErrNoRecorder)); + } + app.voice_enabled = true; + CommandResult::with_message_and_action( + tr(locale, MessageId::VoiceEnabled), + AppAction::VoiceCapture, + ) +} + +/// Handle the `/voice-send` command: toggle auto-send after transcription. +pub fn voice_send(app: &mut App) -> CommandResult { + let locale = app.ui_locale; + app.voice_send_enabled = !app.voice_send_enabled; + + let msg = if app.voice_send_enabled { + tr(locale, MessageId::VoiceSendEnabled) + } else { + tr(locale, MessageId::VoiceSendDisabled) + }; + CommandResult::message(msg) +} + +/// Handle the `/voice-control` command: toggle AI-assisted dictation. +pub fn voice_control(app: &mut App) -> CommandResult { + let locale = app.ui_locale; + app.voice_control_enabled = !app.voice_control_enabled; + + let msg = if app.voice_control_enabled { + tr(locale, MessageId::VoiceControlEnabled) + } else { + tr(locale, MessageId::VoiceControlDisabled) + }; + CommandResult::message(msg) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn wav_encoding_produces_valid_header() { + let samples = vec![0i16; 16000]; // 1 second of silence + let wav = encode_wav(&samples); + assert_eq!(&wav[0..4], b"RIFF"); + assert_eq!(&wav[8..12], b"WAVE"); + assert_eq!(&wav[12..16], b"fmt "); + // data size = 16000 * 2 = 32000 + assert_eq!(&wav[4..8], &(36 + 32000u32).to_le_bytes()); + } + + #[test] + fn wav_encoding_empty_is_minimal() { + let wav = encode_wav(&[]); + assert_eq!(wav.len(), 44); + assert_eq!(&wav[4..8], &36u32.to_le_bytes()); + } + + #[test] + fn send_suffix_detected_and_stripped() { + assert_eq!(split_send_suffix("send it"), ("", true)); + assert_eq!(split_send_suffix("Send It!"), ("", true)); + 
assert_eq!(split_send_suffix("发送"), ("", true)); + assert_eq!(split_send_suffix("發送。"), ("", true)); + assert_eq!( + split_send_suffix("ship the fix, send it"), + ("ship the fix", true) + ); + assert_eq!( + split_send_suffix("修复这个问题,发送"), + ("修复这个问题", true) + ); + } + + #[test] + fn send_suffix_leaves_plain_text_alone() { + assert_eq!(split_send_suffix("send it now"), ("send it now", false)); + assert_eq!( + split_send_suffix("帮我发送一封邮件"), + ("帮我发送一封邮件", false) + ); + assert_eq!(split_send_suffix("发送邮件"), ("发送邮件", false)); + assert_eq!( + split_send_suffix("resend it to the queue"), + ("resend it to the queue", false) + ); + } + + #[test] + fn recorder_detection_does_not_crash() { + // Just verify the function runs without panicking + let _ = is_available(); + } +} diff --git a/crates/tui/src/commands/mod.rs b/crates/tui/src/commands/mod.rs index 8e6072d2..01aad281 100644 --- a/crates/tui/src/commands/mod.rs +++ b/crates/tui/src/commands/mod.rs @@ -17,6 +17,9 @@ pub use traits::CommandInfo; // Long-standing public paths that predate the group layout. pub use groups::project::share; +// Voice capture plumbing shared with the hotbar and the UI event loop. +pub use groups::core::voice; + use crate::tui::app::{App, AppAction}; /// Result of executing a command @@ -664,6 +667,49 @@ mod tests { assert!(result.message.unwrap().contains("off")); } + #[test] + fn voice_send_and_voice_control_commands_toggle_state() { + let mut app = create_test_app(); + assert!(!app.voice_send_enabled); + assert!(!app.voice_control_enabled); + + for invocation in ["/voicesend", "/voice-send", "/yuyinsend", "/语音发送"] { + let result = execute(invocation, &mut app); + assert!(!result.is_error, "{invocation} should toggle cleanly"); + assert!(result.action.is_none()); + assert!(result.message.is_some()); + } + // Four toggles land back at disabled. 
+ assert!(!app.voice_send_enabled); + + let result = execute("/voicecontrol", &mut app); + assert!(!result.is_error); + assert!(app.voice_control_enabled); + let result = execute("/voice-control", &mut app); + assert!(!result.is_error); + assert!(!app.voice_control_enabled); + } + + /// `/voice` defers the actual capture to the UI event loop via + /// `AppAction::VoiceCapture`, so executing it never records audio. + /// On hosts without a recorder it must fail gracefully instead. + #[test] + fn voice_command_toggles_on_and_off_or_fails_gracefully() { + let mut app = create_test_app(); + let result = execute("/voice", &mut app); + if app.voice_enabled { + assert!(!result.is_error); + assert!(matches!(result.action, Some(AppAction::VoiceCapture))); + let off = execute("/voice", &mut app); + assert!(!off.is_error); + assert!(off.action.is_none()); + assert!(!app.voice_enabled); + } else { + assert!(result.is_error); + assert!(result.action.is_none()); + } + } + #[test] fn execute_sidebar_toggles_visibility() { let mut app = create_test_app(); diff --git a/crates/tui/src/localization.rs b/crates/tui/src/localization.rs index da179577..a829615b 100644 --- a/crates/tui/src/localization.rs +++ b/crates/tui/src/localization.rs @@ -636,6 +636,24 @@ pub enum MessageId { ToolFamilyVerify, ToolFamilyThink, ToolFamilyGeneric, + // Voice commands (/voice, /voice-send, /voice-control) + CmdVoiceDescription, + CmdVoiceSendDescription, + CmdVoiceControlDescription, + VoiceEnabled, + VoiceDisabled, + VoiceSendEnabled, + VoiceSendDisabled, + VoiceControlEnabled, + VoiceControlDisabled, + VoiceErrNoAuth, + VoiceErrNoRecorder, + VoiceErrNetwork, + VoiceErrEmptySend, + VoiceErrTooShort, + VoiceRecording, + VoiceProcessing, + VoiceTranscribed, } #[allow(dead_code)] @@ -1041,6 +1059,23 @@ pub const ALL_MESSAGE_IDS: &[MessageId] = &[ MessageId::ToolFamilyVerify, MessageId::ToolFamilyThink, MessageId::ToolFamilyGeneric, + MessageId::CmdVoiceDescription, + MessageId::CmdVoiceSendDescription, 
+ MessageId::CmdVoiceControlDescription, + MessageId::VoiceEnabled, + MessageId::VoiceDisabled, + MessageId::VoiceSendEnabled, + MessageId::VoiceSendDisabled, + MessageId::VoiceControlEnabled, + MessageId::VoiceControlDisabled, + MessageId::VoiceErrNoAuth, + MessageId::VoiceErrNoRecorder, + MessageId::VoiceErrNetwork, + MessageId::VoiceErrEmptySend, + MessageId::VoiceErrTooShort, + MessageId::VoiceRecording, + MessageId::VoiceProcessing, + MessageId::VoiceTranscribed, ]; pub fn tr(locale: Locale, id: MessageId) -> &'static str { @@ -1774,6 +1809,32 @@ fn english(id: MessageId) -> &'static str { MessageId::ToolFamilyVerify => "verify", MessageId::ToolFamilyThink => "think", MessageId::ToolFamilyGeneric => "tool", + // Voice commands + MessageId::CmdVoiceDescription => { + "Toggle voice input: record speech and transcribe into the composer" + } + MessageId::CmdVoiceSendDescription => { + "Toggle voice auto-send: submit when the transcript ends with \"send it\"" + } + MessageId::CmdVoiceControlDescription => { + "Toggle voice control: AI-assisted dictation aware of the composer text" + } + MessageId::VoiceEnabled => "Voice input enabled. Speak to record.", + MessageId::VoiceDisabled => "Voice input disabled.", + MessageId::VoiceSendEnabled => "Voice auto-send enabled.", + MessageId::VoiceSendDisabled => "Voice auto-send disabled.", + MessageId::VoiceControlEnabled => "Voice control enabled.", + MessageId::VoiceControlDisabled => "Voice control disabled.", + MessageId::VoiceErrNoAuth => "Voice: no API key configured for the active provider", + MessageId::VoiceErrNoRecorder => { + "Voice: no recording tool found. Install sox, arecord, or rec." + } + MessageId::VoiceErrNetwork => "Voice: transcription request failed", + MessageId::VoiceErrEmptySend => "Voice: nothing to send", + MessageId::VoiceErrTooShort => "Voice: no speech detected, recording too short", + MessageId::VoiceRecording => "🎙 Recording... 
speak now", + MessageId::VoiceProcessing => "🎙 Transcribing...", + MessageId::VoiceTranscribed => "🎙 Transcribed", } } @@ -2375,6 +2436,32 @@ fn vietnamese(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "xác minh", MessageId::ToolFamilyThink => "suy nghĩ", MessageId::ToolFamilyGeneric => "công cụ", + // Voice commands + MessageId::CmdVoiceDescription => { + "Bật/tắt nhập liệu bằng giọng nói: ghi âm và chuyển thành văn bản" + } + MessageId::CmdVoiceSendDescription => { + "Bật/tắt tự gửi bằng giọng nói: gửi khi bản ghi kết thúc bằng \"send it\"" + } + MessageId::CmdVoiceControlDescription => { + "Bật/tắt điều khiển giọng nói: đọc chính tả có AI hỗ trợ" + } + MessageId::VoiceEnabled => "Đã bật nhập liệu bằng giọng nói. Hãy nói để ghi âm.", + MessageId::VoiceDisabled => "Đã tắt nhập liệu bằng giọng nói.", + MessageId::VoiceSendEnabled => "Đã bật tự gửi bằng giọng nói.", + MessageId::VoiceSendDisabled => "Đã tắt tự gửi bằng giọng nói.", + MessageId::VoiceControlEnabled => "Đã bật điều khiển giọng nói.", + MessageId::VoiceControlDisabled => "Đã tắt điều khiển giọng nói.", + MessageId::VoiceErrNoAuth => "Giọng nói: nhà cung cấp hiện tại chưa có khóa API", + MessageId::VoiceErrNoRecorder => { + "Giọng nói: không tìm thấy công cụ ghi âm. Hãy cài sox, arecord hoặc rec." + } + MessageId::VoiceErrNetwork => "Giọng nói: yêu cầu chuyển giọng nói thất bại", + MessageId::VoiceErrEmptySend => "Giọng nói: không có nội dung để gửi", + MessageId::VoiceErrTooShort => "Giọng nói: không phát hiện giọng nói, bản ghi quá ngắn", + MessageId::VoiceRecording => "🎙 Đang ghi âm... 
hãy nói", + MessageId::VoiceProcessing => "🎙 Đang chuyển thành văn bản...", + MessageId::VoiceTranscribed => "🎙 Đã chuyển xong", }) } @@ -2530,6 +2617,28 @@ fn traditional_chinese(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "驗證", MessageId::ToolFamilyThink => "思考", MessageId::ToolFamilyGeneric => "工具", + // Voice commands + MessageId::CmdVoiceDescription => "切換語音輸入:錄製語音並轉錄為文字", + MessageId::CmdVoiceSendDescription => { + "切換語音自動傳送:轉錄以「發送」或「send it」結尾時自動提交" + } + MessageId::CmdVoiceControlDescription => { + "切換語音控制:AI 輔助的語音聽寫(結合當前輸入內容)" + } + MessageId::VoiceEnabled => "語音輸入已開啟,開始說話即可錄製", + MessageId::VoiceDisabled => "語音輸入已關閉", + MessageId::VoiceSendEnabled => "語音自動傳送已開啟", + MessageId::VoiceSendDisabled => "語音自動傳送已關閉", + MessageId::VoiceControlEnabled => "語音控制已開啟", + MessageId::VoiceControlDisabled => "語音控制已關閉", + MessageId::VoiceErrNoAuth => "語音:目前供應商未設定 API 金鑰", + MessageId::VoiceErrNoRecorder => "語音:未找到錄音工具,請安裝 sox、arecord 或 rec", + MessageId::VoiceErrNetwork => "語音:轉錄請求失敗", + MessageId::VoiceErrEmptySend => "語音:沒有可傳送的內容", + MessageId::VoiceErrTooShort => "語音:未偵測到有效語音,錄製時間過短", + MessageId::VoiceRecording => "🎙 正在錄音...請說話", + MessageId::VoiceProcessing => "🎙 正在轉錄...", + MessageId::VoiceTranscribed => "🎙 轉錄完成", other => chinese_simplified(other)?, }) } @@ -3090,6 +3199,32 @@ fn japanese(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "検証", MessageId::ToolFamilyThink => "思考", MessageId::ToolFamilyGeneric => "ツール", + // Voice commands + MessageId::CmdVoiceDescription => "音声入力の切替:音声を録音してテキストに変換", + MessageId::CmdVoiceSendDescription => { + "音声自動送信の切替:転写が「send it」で終わると自動送信" + } + MessageId::CmdVoiceControlDescription => { + "音声コントロールの切替:入力欄を考慮した AI 音声ディクテーション" + } + MessageId::VoiceEnabled => "音声入力を有効にしました。話すと録音されます。", + MessageId::VoiceDisabled => "音声入力を無効にしました。", + MessageId::VoiceSendEnabled => "音声自動送信を有効にしました。", + MessageId::VoiceSendDisabled => "音声自動送信を無効にしました。", + MessageId::VoiceControlEnabled => 
"音声コントロールを有効にしました。", + MessageId::VoiceControlDisabled => "音声コントロールを無効にしました。", + MessageId::VoiceErrNoAuth => { + "音声:アクティブなプロバイダーに API キーが設定されていません" + } + MessageId::VoiceErrNoRecorder => { + "音声:録音ツールが見つかりません。sox、arecord、rec のいずれかをインストールしてください" + } + MessageId::VoiceErrNetwork => "音声:文字起こしリクエストに失敗しました", + MessageId::VoiceErrEmptySend => "音声:送信する内容がありません", + MessageId::VoiceErrTooShort => "音声:音声が検出されませんでした。録音が短すぎます", + MessageId::VoiceRecording => "🎙 録音中...お話しください", + MessageId::VoiceProcessing => "🎙 文字起こし中...", + MessageId::VoiceTranscribed => "🎙 文字起こし完了", }) } @@ -3585,6 +3720,28 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "验证", MessageId::ToolFamilyThink => "思考", MessageId::ToolFamilyGeneric => "工具", + // Voice commands + MessageId::CmdVoiceDescription => "切换语音输入:录制语音并转录为文字", + MessageId::CmdVoiceSendDescription => { + "切换语音自动发送:转录以「发送」或「send it」结尾时自动提交" + } + MessageId::CmdVoiceControlDescription => { + "切换语音控制:AI 辅助的语音听写(结合当前输入内容)" + } + MessageId::VoiceEnabled => "语音输入已开启,开始说话即可录制", + MessageId::VoiceDisabled => "语音输入已关闭", + MessageId::VoiceSendEnabled => "语音自动发送已开启", + MessageId::VoiceSendDisabled => "语音自动发送已关闭", + MessageId::VoiceControlEnabled => "语音控制已开启", + MessageId::VoiceControlDisabled => "语音控制已关闭", + MessageId::VoiceErrNoAuth => "语音:当前提供商未配置 API 密钥", + MessageId::VoiceErrNoRecorder => "语音:未找到录音工具,请安装 sox、arecord 或 rec", + MessageId::VoiceErrNetwork => "语音:转录请求失败", + MessageId::VoiceErrEmptySend => "语音:没有可发送的内容", + MessageId::VoiceErrTooShort => "语音:未检测到有效语音,录制时间过短", + MessageId::VoiceRecording => "🎙 正在录音...请说话", + MessageId::VoiceProcessing => "🎙 正在转录...", + MessageId::VoiceTranscribed => "🎙 转录完成", }) } @@ -4170,6 +4327,32 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "verificar", MessageId::ToolFamilyThink => "pensar", MessageId::ToolFamilyGeneric => "ferramenta", + // Voice commands + MessageId::CmdVoiceDescription => { + "Alternar entrada de 
voz: gravar fala e transcrever para texto" + } + MessageId::CmdVoiceSendDescription => { + "Alternar envio automático por voz: envia quando a transcrição termina com \"send it\"" + } + MessageId::CmdVoiceControlDescription => { + "Alternar controle por voz: ditado assistido por IA" + } + MessageId::VoiceEnabled => "Entrada de voz ativada. Fale para gravar.", + MessageId::VoiceDisabled => "Entrada de voz desativada.", + MessageId::VoiceSendEnabled => "Envio automático por voz ativado.", + MessageId::VoiceSendDisabled => "Envio automático por voz desativado.", + MessageId::VoiceControlEnabled => "Controle por voz ativado.", + MessageId::VoiceControlDisabled => "Controle por voz desativado.", + MessageId::VoiceErrNoAuth => "Voz: nenhuma chave de API configurada para o provedor ativo", + MessageId::VoiceErrNoRecorder => { + "Voz: nenhuma ferramenta de gravação encontrada. Instale sox, arecord ou rec." + } + MessageId::VoiceErrNetwork => "Voz: falha na solicitação de transcrição", + MessageId::VoiceErrEmptySend => "Voz: nada para enviar", + MessageId::VoiceErrTooShort => "Voz: nenhuma fala detectada, gravação muito curta", + MessageId::VoiceRecording => "🎙 Gravando... fale agora", + MessageId::VoiceProcessing => "🎙 Transcrevendo...", + MessageId::VoiceTranscribed => "🎙 Transcrito", }) } @@ -4765,6 +4948,34 @@ fn spanish_latin_america(id: MessageId) -> Option<&'static str> { MessageId::ToolFamilyVerify => "verificar", MessageId::ToolFamilyThink => "pensar", MessageId::ToolFamilyGeneric => "herramienta", + // Voice commands + MessageId::CmdVoiceDescription => { + "Alternar entrada de voz: grabar voz y transcribir a texto" + } + MessageId::CmdVoiceSendDescription => { + "Alternar envío automático por voz: envía cuando la transcripción termina con \"send it\"" + } + MessageId::CmdVoiceControlDescription => { + "Alternar control por voz: dictado asistido por IA" + } + MessageId::VoiceEnabled => "Entrada de voz activada. 
Habla para grabar.", + MessageId::VoiceDisabled => "Entrada de voz desactivada.", + MessageId::VoiceSendEnabled => "Envío automático por voz activado.", + MessageId::VoiceSendDisabled => "Envío automático por voz desactivado.", + MessageId::VoiceControlEnabled => "Control por voz activado.", + MessageId::VoiceControlDisabled => "Control por voz desactivado.", + MessageId::VoiceErrNoAuth => { + "Voz: no hay clave de API configurada para el proveedor activo" + } + MessageId::VoiceErrNoRecorder => { + "Voz: no se encontró herramienta de grabación. Instala sox, arecord o rec." + } + MessageId::VoiceErrNetwork => "Voz: falló la solicitud de transcripción", + MessageId::VoiceErrEmptySend => "Voz: nada que enviar", + MessageId::VoiceErrTooShort => "Voz: no se detectó voz, grabación demasiado corta", + MessageId::VoiceRecording => "🎙 Grabando... habla ahora", + MessageId::VoiceProcessing => "🎙 Transcribiendo...", + MessageId::VoiceTranscribed => "🎙 Transcrito", }) } diff --git a/crates/tui/src/tui/app.rs b/crates/tui/src/tui/app.rs index fb6a3ad4..d8bed4c9 100644 --- a/crates/tui/src/tui/app.rs +++ b/crates/tui/src/tui/app.rs @@ -1481,6 +1481,14 @@ pub struct App { pub cost_currency: CostCurrency, pub composer_density: ComposerDensity, pub composer_border: bool, + /// Voice input state — toggled by `/voice` and the voice hotbar action. + pub voice_enabled: bool, + /// Auto-send after transcription when the transcript ends with an + /// explicit send instruction ("send it" / "发送"). Toggled by `/voice-send`. + pub voice_send_enabled: bool, + /// AI-assisted dictation that sees the current composer text. + /// Toggled by `/voice-control`. 
+ pub voice_control_enabled: bool, pub transcript_spacing: TranscriptSpacing, pub sidebar_width_percent: u16, pub sidebar_focus: SidebarFocus, @@ -2275,6 +2283,9 @@ impl App { cost_currency, composer_density, composer_border, + voice_enabled: false, + voice_send_enabled: false, + voice_control_enabled: false, transcript_spacing, sidebar_width_percent, sidebar_focus, @@ -5323,6 +5334,11 @@ pub enum AppAction { SwitchWorkspace { workspace: PathBuf, }, + /// Record from the microphone and route the transcription into the + /// composer (or auto-send it). Emitted by `/voice` and the voice hotbar + /// action; handled in the UI event loop where the live `Config` supplies + /// provider credentials. + VoiceCapture, /// Export and share the current session as a web URL. ShareSession { history_len: usize, diff --git a/crates/tui/src/tui/command_palette.rs b/crates/tui/src/tui/command_palette.rs index 5ed222cb..64447c77 100644 --- a/crates/tui/src/tui/command_palette.rs +++ b/crates/tui/src/tui/command_palette.rs @@ -1141,7 +1141,7 @@ mod tests { assert!(command_labels.contains(&"/config")); assert!(command_labels.contains(&"/links")); - assert!(!command_labels.contains(&"/voice")); + assert!(command_labels.contains(&"/voice")); assert!(!command_labels.contains(&"/set")); assert!(!command_labels.contains(&"/deepseek")); } diff --git a/crates/tui/src/tui/hotbar/actions.rs b/crates/tui/src/tui/hotbar/actions.rs index 0a76c493..8e801663 100644 --- a/crates/tui/src/tui/hotbar/actions.rs +++ b/crates/tui/src/tui/hotbar/actions.rs @@ -181,7 +181,7 @@ impl HotbarAction for AppHotbarAction { fn is_active(&self, app: &App) -> bool { match self.kind { - AppHotbarKind::VoiceToggle => false, + AppHotbarKind::VoiceToggle => app.voice_enabled, AppHotbarKind::SessionCompact => app.is_compacting, AppHotbarKind::Mode(mode) => app.mode == mode, AppHotbarKind::ReasoningCycle => { @@ -197,9 +197,12 @@ impl HotbarAction for AppHotbarAction { fn dispatch(&self, app: &mut App) -> Result { match 
self.kind { AppHotbarKind::VoiceToggle => { - app.status_message = - Some("Voice input is not available in this terminal session yet.".to_string()); - Ok(HotbarDispatch::Handled) + let result = crate::commands::voice::voice(app); + app.status_message = result.message; + match result.action { + Some(action) => Ok(HotbarDispatch::AppAction(action)), + None => Ok(HotbarDispatch::Handled), + } } AppHotbarKind::SessionCompact => { if app.is_compacting { @@ -539,19 +542,36 @@ mod tests { } #[test] - fn voice_toggle_is_safe_until_voice_input_lands() { + fn voice_toggle_dispatches_the_voice_command() { let registry = HotbarActionRegistry::with_builtins(); let voice = registry.get("voice.toggle").expect("voice action"); let mut app = test_app(); assert!(!voice.is_active(&app)); - assert_eq!( - voice.dispatch(&mut app).expect("dispatch voice"), - HotbarDispatch::Handled - ); - assert_eq!( + // The toggle is wired to the /voice command. With a recorder on the + // host it arms voice input and defers capture to the UI event loop; + // without one it fails gracefully with a localized error. No audio + // is recorded in either case. + let result = voice.dispatch(&mut app).expect("dispatch voice"); + assert!(app.status_message.is_some()); + // The old placeholder message must be gone — voice is implemented. + assert_ne!( app.status_message.as_deref(), Some("Voice input is not available in this terminal session yet.") ); + if app.voice_enabled { + assert_eq!( + result, + HotbarDispatch::AppAction(crate::tui::app::AppAction::VoiceCapture) + ); + assert!(voice.is_active(&app)); + // A second press toggles voice input back off. 
+ let off = voice.dispatch(&mut app).expect("dispatch voice off"); + assert_eq!(off, HotbarDispatch::Handled); + assert!(!app.voice_enabled); + assert!(!voice.is_active(&app)); + } else { + assert_eq!(result, HotbarDispatch::Handled); + } } } diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index add27d97..bcf2330d 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -6431,6 +6431,28 @@ async fn apply_command_result( let queued = build_queued_message(app, content); submit_or_steer_message(app, config, engine_handle, queued).await?; } + AppAction::VoiceCapture => { + use commands::voice::VoiceCaptureOutcome; + match commands::voice::capture_and_transcribe(app, config).await { + Ok(VoiceCaptureOutcome::Insert(text)) => { + app.insert_str(&text); + app.status_message = Some(format!( + "{}: {text}", + tr(app.ui_locale, MessageId::VoiceTranscribed) + )); + } + Ok(VoiceCaptureOutcome::Send(content)) => { + app.status_message = + Some(tr(app.ui_locale, MessageId::VoiceTranscribed).to_string()); + let queued = build_queued_message(app, content); + submit_or_steer_message(app, config, engine_handle, queued).await?; + } + Err(err) => { + app.voice_enabled = false; + app.status_message = Some(err); + } + } + } AppAction::ListSubAgents => { let _ = engine_handle.send(Op::ListSubAgents).await; }