Harvest PR #3051: voice input commands and hotbar integration

Port /voice, /voice-send, and /voice-control into the command strategy registry as groups/core/voice.rs. The handlers only flip App state (voice_enabled, voice_send_enabled, voice_control_enabled) and emit the new AppAction::VoiceCapture; the UI event loop performs the actual record + transcribe cycle so credentials come from the live Config (deepseek_api_key/deepseek_base_url) instead of auth fields cached on App, and no audio is ever recorded by the registry smoke tests.

- voice.toggle hotbar action dispatches the real /voice command and reports voice_enabled as its active state, replacing the placeholder.
- Recording uses sox/rec/arecord with RMS-based silence detection; transcription posts input_audio blocks to the provider chat completions API (async reqwest — the blocking client would panic inside the tokio event loop).
- Transcripts insert at the composer cursor via App::insert_str. With /voice-send enabled, a transcript ending in "send it" / 发送 strips the suffix and submits; a bare "send it" submits the current composer content. With /voice-control enabled, transcription runs through the AI dictation pipeline that sees the composer text.
- Failures (no recorder, no API key, short recording, network) surface as localized status messages and disarm voice input.
- Localized command help and status strings for all seven shipped locales; /voice now appears in the command palette.

Harvested from PR #3051 by @huqiantao

Co-authored-by: huqiantao <huqiantao@users.noreply.github.com>
Co-authored-by: Hunter B <hmbown@gmail.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 14:43:27 -07:00
parent 662a459ee5
commit 9b31621b19
8 changed files with 903 additions and 11 deletions
@@ -10,6 +10,7 @@ mod hooks;
 mod provider;
 mod queue;
 mod stash;
+pub mod voice;

 pub(in crate::commands) use self::core::reset_conversation_state;

@@ -43,6 +44,9 @@ impl CommandGroup for CoreCommands {
            Box::new(FunctionCommand::new(&PROFILE_INFO, run_profile)),
            Box::new(FunctionCommand::new(&RLM_INFO, run_rlm)),
            Box::new(FunctionCommand::new(&TRANSLATE_INFO, run_translate)),
+            Box::new(FunctionCommand::new(&VOICE_INFO, run_voice)),
+            Box::new(FunctionCommand::new(&VOICE_SEND_INFO, run_voice_send)),
+            Box::new(FunctionCommand::new(&VOICE_CONTROL_INFO, run_voice_control)),
        ]
    }
 }
@@ -167,6 +171,24 @@ static TRANSLATE_INFO: CommandInfo = CommandInfo {
    usage: "/translate",
    description_id: MessageId::CmdTranslateDescription,
 };
+/// Registry metadata for the `/voice` command, including its pinyin and
+/// Chinese aliases.
+static VOICE_INFO: CommandInfo = CommandInfo {
+    name: "voice",
+    aliases: &["yuyin", "语音"],
+    usage: "/voice",
+    description_id: MessageId::CmdVoiceDescription,
+};
+/// Registry metadata for the `/voicesend` command. The hyphenated
+/// `voice-send` spelling is accepted as an alias.
+static VOICE_SEND_INFO: CommandInfo = CommandInfo {
+    name: "voicesend",
+    aliases: &["voice-send", "yuyinsend", "语音发送"],
+    usage: "/voicesend",
+    description_id: MessageId::CmdVoiceSendDescription,
+};
+/// Registry metadata for the `/voicecontrol` command. The hyphenated
+/// `voice-control` spelling is accepted as an alias.
+static VOICE_CONTROL_INFO: CommandInfo = CommandInfo {
+    name: "voicecontrol",
+    aliases: &["voice-control", "yuyincontrol", "语音控制"],
+    usage: "/voicecontrol",
+    description_id: MessageId::CmdVoiceControlDescription,
+};

 fn run_registered(app: &mut App, name: &str, arg: Option<&str>) -> CommandResult {
    dispatch(app, name, arg).expect("registered core command should dispatch")
@@ -232,6 +254,15 @@ fn run_rlm(app: &mut App, arg: Option<&str>) -> CommandResult {
 fn run_translate(app: &mut App, arg: Option<&str>) -> CommandResult {
    run_registered(app, "translate", arg)
 }
+/// Registry entry point for `/voice`; forwards to `run_registered` under the
+/// canonical name `voice`.
+fn run_voice(app: &mut App, arg: Option<&str>) -> CommandResult {
+    run_registered(app, "voice", arg)
+}
+/// Registry entry point for `/voicesend`; forwards to `run_registered` under
+/// the canonical name `voicesend`.
+fn run_voice_send(app: &mut App, arg: Option<&str>) -> CommandResult {
+    run_registered(app, "voicesend", arg)
+}
+/// Registry entry point for `/voicecontrol`; forwards to `run_registered`
+/// under the canonical name `voicecontrol`.
+fn run_voice_control(app: &mut App, arg: Option<&str>) -> CommandResult {
+    run_registered(app, "voicecontrol", arg)
+}

 pub(in crate::commands) fn dispatch(
    app: &mut App,
@@ -259,6 +290,11 @@ pub(in crate::commands) fn dispatch(
        "profile" | "dangan" => core::profile_switch(app, arg),
        "rlm" | "recursive" | "digui" => rlm(app, arg),
        "translate" | "translation" | "transale" => core::translate(app),
+        "voice" | "yuyin" | "语音" => voice::voice(app),
+        "voicesend" | "voice-send" | "yuyinsend" | "语音发送" => voice::voice_send(app),
+        "voicecontrol" | "voice-control" | "yuyincontrol" | "语音控制" => {
+            voice::voice_control(app)
+        }
        _ => return None,
    };
    Some(result)
@@ -0,0 +1,541 @@
+//! Voice input commands — `/voice`, `/voice-send`, `/voice-control`.
+//!
+//! Records audio from the default microphone, sends it to the configured
+//! provider's API for transcription, and inserts the transcribed text into
+//! the composer. The interaction model mirrors MiMo Code's voice UX:
+//!
+//!   `/voice`         — toggle voice input on/off (records when toggled on)
+//!   `/voice-send`    — toggle auto-send when the transcript ends with
+//!                      "send it" / "发送"
+//!   `/voice-control` — toggle AI-assisted dictation that sees the current
+//!                      composer text
+//!
+//! The slash commands only flip state and emit [`AppAction::VoiceCapture`];
+//! the actual capture runs in the UI event loop where the live [`Config`]
+//! supplies provider credentials. That keeps the handlers side-effect free
+//! (the registry smoke tests execute every command) and avoids caching
+//! auth material on [`App`].
+//!
+//! ## Recording
+//!
+//! Uses platform-specific command-line tools (sox, rec, arecord) to capture
+//! 16kHz mono 16-bit PCM audio. Records until a silence gap is detected or
+//! the maximum duration is reached (default 10 s).
+
+use std::process::{Command, Stdio};
+use std::sync::LazyLock;
+use std::time::Duration;
+
+use regex::Regex;
+
+use crate::commands::CommandResult;
+use crate::config::Config;
+use crate::localization::{MessageId, tr};
+use crate::tui::app::{App, AppAction};
+
+/// Transcription model requested from the provider's chat-completions API.
+const ASR_MODEL: &str = "mimo-v2.5-asr";
+/// Model used for the AI-assisted voice-control pipeline.
+const VOICE_CONTROL_MODEL: &str = "mimo-v2.5";
+
+// --- Recorder detection ----------------------------------------------------
+
+/// A command-line recording tool and the arguments that make it stream audio.
+///
+/// Instances are static descriptions only; nothing is executed until
+/// `detect_recorder` probes the command or `record_audio` spawns it.
+#[derive(Debug, Clone)]
+struct Recorder {
+    /// Executable name, resolved through the process search path.
+    cmd: &'static str,
+    /// CLI arguments for piping raw 16kHz mono S16_LE PCM to stdout.
+    pipe_args: &'static [&'static str],
+}
+
+/// Find the first usable recorder for the current platform.
+///
+/// Candidates are tried in preference order (macOS: `sox`, `rec`; Linux:
+/// `arecord`, `sox`; Windows: `sox`) by running `<cmd> --version` with all
+/// standard streams detached. A candidate counts as usable when the probe
+/// process could be started and waited on; its exit code is ignored.
+///
+/// Returns `None` when no candidate can be launched or the platform has no
+/// candidates at all.
+fn detect_recorder() -> Option<Recorder> {
+    let candidates: &[Recorder] = if cfg!(target_os = "macos") {
+        &[
+            Recorder {
+                cmd: "sox",
+                pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
+            },
+            Recorder {
+                cmd: "rec",
+                pipe_args: &["-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
+            },
+        ]
+    } else if cfg!(target_os = "linux") {
+        &[
+            Recorder {
+                cmd: "arecord",
+                pipe_args: &["-f", "S16_LE", "-r", "16000", "-c", "1", "-t", "raw"],
+            },
+            Recorder {
+                cmd: "sox",
+                pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
+            },
+        ]
+    } else if cfg!(target_os = "windows") {
+        &[Recorder {
+            cmd: "sox",
+            pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
+        }]
+    } else {
+        &[]
+    };
+
+    candidates
+        .iter()
+        .find(|r| {
+            // `status()` waits for the probe to exit. A bare `spawn()` would
+            // drop the `Child` without reaping it, leaving a zombie process
+            // behind on every availability check.
+            Command::new(r.cmd)
+                .arg("--version")
+                .stdin(Stdio::null())
+                .stdout(Stdio::null())
+                .stderr(Stdio::null())
+                .status()
+                .is_ok()
+        })
+        .cloned()
+}
+
+/// Check whether voice recording is available on this system.
+///
+/// Returns `true` when `detect_recorder` finds a launchable recording tool.
+/// Each call probes the candidates again; the result is not cached.
+pub fn is_available() -> bool {
+    detect_recorder().is_some()
+}
+
+// --- WAV encoding ----------------------------------------------------------
+
+/// Wrap raw 16kHz mono S16_LE PCM samples in a 44-byte WAV header.
+///
+/// The output is the RIFF / `fmt ` / `data` header followed by every sample
+/// in little-endian byte order.
+fn encode_wav(samples: &[i16]) -> Vec<u8> {
+    const SAMPLE_RATE: u32 = 16000;
+
+    let payload_len = (samples.len() * 2) as u32;
+    let mut wav = Vec::with_capacity(44 + payload_len as usize);
+
+    // RIFF container; the size field covers everything after itself.
+    wav.extend_from_slice(b"RIFF");
+    wav.extend_from_slice(&(36 + payload_len).to_le_bytes());
+    wav.extend_from_slice(b"WAVE");
+
+    // Format chunk: 16-byte PCM description.
+    wav.extend_from_slice(b"fmt ");
+    wav.extend_from_slice(&16u32.to_le_bytes()); // chunk size
+    wav.extend_from_slice(&1u16.to_le_bytes()); // format tag: PCM
+    wav.extend_from_slice(&1u16.to_le_bytes()); // channels: mono
+    wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes());
+    wav.extend_from_slice(&(SAMPLE_RATE * 2).to_le_bytes()); // bytes per second
+    wav.extend_from_slice(&2u16.to_le_bytes()); // bytes per frame
+    wav.extend_from_slice(&16u16.to_le_bytes()); // bits per sample
+
+    // Data chunk: length prefix, then the samples themselves.
+    wav.extend_from_slice(b"data");
+    wav.extend_from_slice(&payload_len.to_le_bytes());
+    wav.extend(samples.iter().flat_map(|s| s.to_le_bytes()));
+
+    wav
+}
+
+// --- Recording -------------------------------------------------------------
+
+/// Maximum recording duration in seconds before auto-stopping.
+const MAX_RECORD_SECS: u64 = 10;
+/// Minimum segment duration in seconds to consider as valid speech.
+const MIN_SEGMENT_SECS: f64 = 0.3;
+
+/// Record audio from the default microphone.
+///
+/// Spawns the detected recorder, reads its stdout in 10 ms chunks, and keeps
+/// samples from the first chunk classified as speech onward. Recording stops
+/// after one second of continuous non-speech following speech, once
+/// `MAX_RECORD_SECS` has elapsed, or when the recorder's output ends.
+///
+/// Returns raw 16kHz mono S16_LE PCM samples together with the wall-clock
+/// time spent. Returns `None` if no recorder is available, the recorder
+/// could not be spawned, or fewer than `MIN_SEGMENT_SECS` worth of samples
+/// were kept (which includes the case where no speech was detected).
+///
+/// This function blocks the calling thread for the whole recording.
+fn record_audio() -> Option<(Vec<i16>, Duration)> {
+    let recorder = detect_recorder()?;
+    let start = std::time::Instant::now();
+
+    let mut child = Command::new(recorder.cmd)
+        .args(recorder.pipe_args)
+        .stdin(Stdio::null())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::null())
+        .spawn()
+        .ok()?;
+
+    let stdout = child.stdout.take()?;
+    let mut reader = std::io::BufReader::new(stdout);
+    let mut all_samples: Vec<i16> = Vec::with_capacity(16000 * MAX_RECORD_SECS as usize);
+
+    // Read until timeout or silence
+    let mut buf = [0u8; 320]; // 10ms of 16kHz S16_LE
+    let max_duration = Duration::from_secs(MAX_RECORD_SECS);
+    let mut silence_samples = 0u32;
+    let mut had_speech = false;
+    let speech_threshold: i16 = 500; // RMS-based speech detection threshold
+    let silence_duration_samples = 16000u32; // 1 second of silence to stop
+
+    loop {
+        use std::io::Read;
+        match reader.read_exact(&mut buf) {
+            Ok(()) => {
+                let chunk: Vec<i16> = buf
+                    .chunks_exact(2)
+                    .map(|b| i16::from_le_bytes([b[0], b[1]]))
+                    .collect();
+
+                // Simple RMS-based VAD
+                let rms = (chunk.iter().map(|&s| (s as f64) * (s as f64)).sum::<f64>()
+                    / chunk.len() as f64)
+                    .sqrt();
+                let is_speech = rms > speech_threshold as f64;
+
+                // Silence is only counted once speech has started, and any
+                // speech chunk resets the counter.
+                if is_speech {
+                    had_speech = true;
+                    silence_samples = 0;
+                } else if had_speech {
+                    silence_samples += chunk.len() as u32;
+                }
+
+                // Audio before the first speech chunk is discarded; silence
+                // after speech has started is kept.
+                if had_speech {
+                    all_samples.extend_from_slice(&chunk);
+                }
+
+                // NOTE(review): the deadline is only evaluated after a chunk
+                // has been read, so a recorder that stalls without producing
+                // data or closing stdout would keep `read_exact` blocked past
+                // `MAX_RECORD_SECS` — confirm this is acceptable.
+                if start.elapsed() > max_duration {
+                    let _ = child.kill();
+                    break;
+                }
+                if had_speech && silence_samples >= silence_duration_samples {
+                    let _ = child.kill();
+                    break;
+                }
+            }
+            // The recorder closed its stdout; a trailing partial chunk is dropped.
+            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
+            Err(_) => {
+                let _ = child.kill();
+                break;
+            }
+        }
+    }
+
+    // Reap the recorder so it does not linger after being killed or exiting.
+    let _ = child.wait();
+    let elapsed = start.elapsed();
+
+    let min_samples = (MIN_SEGMENT_SECS * 16000.0) as usize;
+    if all_samples.len() < min_samples {
+        return None;
+    }
+
+    Some((all_samples, elapsed))
+}
+
+// --- Auto-send suffix ------------------------------------------------------
+
+/// Matches an explicit send instruction at the end of transcribed text:
+/// "send it" (any spacing/case) or 发送/發送, with trailing punctuation.
+static SEND_SUFFIX_RE: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(r"(?i)(?:^|[\s,，.。!！?？]+)(?:send\s*it|发送|發送)[\s.。!！?？]*$").unwrap()
+});
+
+/// Separate a trailing send instruction from the rest of a transcript.
+///
+/// Returns the trimmed message text and `true` when [`SEND_SUFFIX_RE`]
+/// matches, e.g. `"ship the fix, send it"` → `("ship the fix", true)`.
+/// Without a match the trimmed input is returned together with `false`.
+fn split_send_suffix(text: &str) -> (&str, bool) {
+    let Some(suffix) = SEND_SUFFIX_RE.find(text) else {
+        return (text.trim(), false);
+    };
+    // The match is anchored at the end, so the message is whatever precedes it.
+    let message = &text[..suffix.start()];
+    (message.trim(), true)
+}
+
+// --- Transcription ---------------------------------------------------------
+
+/// Encode `data` with the `base64` crate's `STANDARD` engine.
+fn base64_encode(data: &[u8]) -> String {
+    use base64::Engine;
+    base64::engine::general_purpose::STANDARD.encode(data)
+}
+
+/// Build the chat-completions endpoint for `base_url`.
+///
+/// Trailing slashes on `base_url` are dropped first so the result never
+/// contains `//chat/completions`.
+fn chat_completions_url(base_url: &str) -> String {
+    format!("{}/chat/completions", base_url.trim_end_matches('/'))
+}
+
+/// POST `body` as JSON to the chat-completions endpoint under `base_url`,
+/// authenticating with `api_key` as a bearer token.
+///
+/// Returns the decoded JSON response. Errors are plain strings describing a
+/// transport failure, a non-success HTTP status, or a response body that
+/// could not be parsed as JSON. The request times out after 30 seconds.
+async fn post_chat_completions(
+    api_key: &str,
+    base_url: &str,
+    body: serde_json::Value,
+) -> Result<serde_json::Value, String> {
+    let endpoint = chat_completions_url(base_url);
+    let client = crate::tls::reqwest_client();
+    let request = client
+        .post(endpoint)
+        .header("Content-Type", "application/json")
+        .header("Authorization", format!("Bearer {api_key}"))
+        .timeout(Duration::from_secs(30))
+        .json(&body);
+
+    let response = match request.send().await {
+        Ok(response) => response,
+        Err(e) => return Err(format!("request failed: {e}")),
+    };
+
+    let status = response.status();
+    if !status.is_success() {
+        return Err(format!("API returned status {status}"));
+    }
+
+    match response.json::<serde_json::Value>().await {
+        Ok(value) => Ok(value),
+        Err(e) => Err(format!("failed to parse response: {e}")),
+    }
+}
+
+/// Transcribe recorded audio through the provider's chat-completions API.
+///
+/// The samples are WAV-encoded, base64-wrapped into a `data:` URL, and sent
+/// as a single `input_audio` content block to [`ASR_MODEL`]. Returns the
+/// trimmed text of the first choice, or an error string when the request
+/// fails or the response carries no text content.
+async fn transcribe(
+    api_key: &str,
+    base_url: &str,
+    audio_samples: &[i16],
+) -> Result<String, String> {
+    let encoded = base64_encode(&encode_wav(audio_samples));
+    let data_url = format!("data:audio/wav;base64,{encoded}");
+
+    let audio_block = serde_json::json!({
+        "type": "input_audio",
+        "input_audio": {
+            "data": data_url
+        }
+    });
+    let request = serde_json::json!({
+        "model": ASR_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": [audio_block]
+            }
+        ],
+        "asr_options": {
+            "language": "auto"
+        }
+    });
+
+    let response = post_chat_completions(api_key, base_url, request).await?;
+    match response["choices"][0]["message"]["content"].as_str() {
+        Some(transcript) => Ok(transcript.trim().to_string()),
+        None => Err("no transcription in response".to_string()),
+    }
+}
+
+/// Run recorded audio through the voice-control pipeline: AI-assisted
+/// dictation that is also shown the current composer text, mirroring MiMo
+/// Code's `processVoiceControl`. Used when `/voice-control` is enabled.
+///
+/// The request goes to [`VOICE_CONTROL_MODEL`] and asks for a JSON object
+/// reply. Returns the reply's `text` field, or an error string when the
+/// request fails, the reply is not valid JSON, or `text` is missing.
+async fn process_voice_control(
+    api_key: &str,
+    base_url: &str,
+    audio_samples: &[i16],
+    current_text: &str,
+) -> Result<String, String> {
+    let encoded = base64_encode(&encode_wav(audio_samples));
+    let data_url = format!("data:audio/wav;base64,{encoded}");
+
+    // The composer state travels as a serialized JSON string in a text block.
+    let context_text = serde_json::json!({
+        "current_text": current_text,
+        "cursor": "end",
+    })
+    .to_string();
+
+    let system_message = serde_json::json!({
+        "role": "system",
+        "content": "You are a voice input assistant. Transcribe the user's speech. Output JSON: {\"text\": \"transcribed text\"}."
+    });
+    let user_message = serde_json::json!({
+        "role": "user",
+        "content": [
+            { "type": "text", "text": context_text },
+            { "type": "input_audio", "input_audio": { "data": data_url } }
+        ]
+    });
+    let request = serde_json::json!({
+        "model": VOICE_CONTROL_MODEL,
+        "messages": [system_message, user_message],
+        "response_format": { "type": "json_object" }
+    });
+
+    let response = post_chat_completions(api_key, base_url, request).await?;
+    let Some(content) = response["choices"][0]["message"]["content"].as_str() else {
+        return Err("no response content".to_string());
+    };
+
+    // The model's message content is itself a JSON document.
+    let reply: serde_json::Value = match serde_json::from_str(content) {
+        Ok(reply) => reply,
+        Err(e) => return Err(format!("failed to parse voice control JSON: {e}")),
+    };
+
+    match reply["text"].as_str() {
+        Some(text) => Ok(text.to_string()),
+        None => Err("no text field in voice control response".to_string()),
+    }
+}
+
+// --- Capture orchestration (UI event loop) ---------------------------------
+
+/// What the UI should do with a finished capture.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum VoiceCaptureOutcome {
+    /// Insert the transcribed text into the composer at the cursor.
+    Insert(String),
+    /// Submit this text as a message (auto-send).
+    Send(String),
+}
+
+/// Perform a complete record + transcribe cycle.
+///
+/// Runs in the UI event loop (see [`AppAction::VoiceCapture`]) so provider
+/// credentials come from the live [`Config`] rather than state cached on
+/// [`App`]. Recording happens on a blocking thread; transcription uses the
+/// shared async HTTP client. Every failure path returns a localized message
+/// so callers can surface it as a status line.
+///
+/// On success the outcome is [`VoiceCaptureOutcome::Send`] when auto-send is
+/// enabled and the transcript ended with a send instruction, otherwise
+/// [`VoiceCaptureOutcome::Insert`] with the trimmed transcript.
+///
+/// Side effects on `app`: `status_message` is updated before recording and
+/// before transcription, and the composer input is cleared when a bare send
+/// instruction submits the existing composer text.
+pub async fn capture_and_transcribe(
+    app: &mut App,
+    config: &Config,
+) -> Result<VoiceCaptureOutcome, String> {
+    let locale = app.ui_locale;
+
+    // Checked up front so a missing tool gets its own message; a `None` from
+    // `record_audio` below is always reported as "too short".
+    if !is_available() {
+        return Err(tr(locale, MessageId::VoiceErrNoRecorder).to_string());
+    }
+    let api_key = config
+        .deepseek_api_key()
+        .map_err(|_| tr(locale, MessageId::VoiceErrNoAuth).to_string())?;
+    let base_url = config.deepseek_base_url();
+
+    app.status_message = Some(tr(locale, MessageId::VoiceRecording).to_string());
+    // `.ok().flatten()` folds a failed blocking task and an empty recording
+    // into the same `VoiceErrTooShort` error.
+    let (samples, _duration) = tokio::task::spawn_blocking(record_audio)
+        .await
+        .ok()
+        .flatten()
+        .ok_or_else(|| tr(locale, MessageId::VoiceErrTooShort).to_string())?;
+
+    app.status_message = Some(tr(locale, MessageId::VoiceProcessing).to_string());
+    let text = if app.voice_control_enabled {
+        process_voice_control(&api_key, &base_url, &samples, &app.composer.input).await
+    } else {
+        transcribe(&api_key, &base_url, &samples).await
+    }
+    .map_err(|e| format!("{}: {e}", tr(locale, MessageId::VoiceErrNetwork)))?;
+
+    let clean = text.trim();
+    if app.voice_send_enabled {
+        let (remainder, wants_send) = split_send_suffix(clean);
+        if wants_send {
+            // A bare "send it" submits whatever is already in the composer.
+            // NOTE(review): when the transcript has its own text, only that
+            // text is sent; existing composer content is neither included
+            // nor cleared — confirm this is intended.
+            let outgoing = if remainder.is_empty() {
+                let existing = app.composer.input.trim().to_string();
+                if !existing.is_empty() {
+                    app.clear_input();
+                }
+                existing
+            } else {
+                remainder.to_string()
+            };
+            if outgoing.is_empty() {
+                return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string());
+            }
+            return Ok(VoiceCaptureOutcome::Send(outgoing));
+        }
+    }
+    // An empty transcript reuses the "nothing to send" message.
+    if clean.is_empty() {
+        return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string());
+    }
+    Ok(VoiceCaptureOutcome::Insert(clean.to_string()))
+}
+
+// --- Command handlers ------------------------------------------------------
+
+/// Handle the `/voice` command by toggling voice input.
+///
+/// Turning it off only clears `voice_enabled`. Turning it on first checks
+/// that a recorder exists, then sets `voice_enabled` and requests a one-shot
+/// recording + transcription via [`AppAction::VoiceCapture`]. Without a
+/// recorder the state is left untouched and an error result is returned.
+pub fn voice(app: &mut App) -> CommandResult {
+    let locale = app.ui_locale;
+
+    if !app.voice_enabled {
+        if is_available() {
+            app.voice_enabled = true;
+            return CommandResult::with_message_and_action(
+                tr(locale, MessageId::VoiceEnabled),
+                AppAction::VoiceCapture,
+            );
+        }
+        return CommandResult::error(tr(locale, MessageId::VoiceErrNoRecorder));
+    }
+
+    app.voice_enabled = false;
+    CommandResult::message(tr(locale, MessageId::VoiceDisabled))
+}
+
+/// Handle the `/voice-send` command: flip `voice_send_enabled` and report
+/// the new state with a localized message.
+pub fn voice_send(app: &mut App) -> CommandResult {
+    let locale = app.ui_locale;
+    let now_enabled = !app.voice_send_enabled;
+    app.voice_send_enabled = now_enabled;
+
+    let message_id = if now_enabled {
+        MessageId::VoiceSendEnabled
+    } else {
+        MessageId::VoiceSendDisabled
+    };
+    CommandResult::message(tr(locale, message_id))
+}
+
+/// Handle the `/voice-control` command: flip `voice_control_enabled` and
+/// report the new state with a localized message.
+pub fn voice_control(app: &mut App) -> CommandResult {
+    let locale = app.ui_locale;
+    let now_enabled = !app.voice_control_enabled;
+    app.voice_control_enabled = now_enabled;
+
+    let message_id = if now_enabled {
+        MessageId::VoiceControlEnabled
+    } else {
+        MessageId::VoiceControlDisabled
+    };
+    CommandResult::message(tr(locale, message_id))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn wav_encoding_produces_valid_header() {
+        let samples = vec![0i16; 16000]; // 1 second of silence
+        let wav = encode_wav(&samples);
+        assert_eq!(&wav[0..4], b"RIFF");
+        assert_eq!(&wav[8..12], b"WAVE");
+        assert_eq!(&wav[12..16], b"fmt ");
+        // data size = 16000 * 2 = 32000
+        assert_eq!(&wav[4..8], &(36 + 32000u32).to_le_bytes());
+    }
+
+    #[test]
+    fn wav_encoding_empty_is_minimal() {
+        let wav = encode_wav(&[]);
+        assert_eq!(wav.len(), 44);
+        assert_eq!(&wav[4..8], &36u32.to_le_bytes());
+    }
+
+    #[test]
+    fn send_suffix_detected_and_stripped() {
+        assert_eq!(split_send_suffix("send it"), ("", true));
+        assert_eq!(split_send_suffix("Send It!"), ("", true));
+        assert_eq!(split_send_suffix("发送"), ("", true));
+        assert_eq!(split_send_suffix("發送。"), ("", true));
+        assert_eq!(
+            split_send_suffix("ship the fix, send it"),
+            ("ship the fix", true)
+        );
+        assert_eq!(
+            split_send_suffix("修复这个问题，发送"),
+            ("修复这个问题", true)
+        );
+    }
+
+    #[test]
+    fn send_suffix_leaves_plain_text_alone() {
+        assert_eq!(split_send_suffix("send it now"), ("send it now", false));
+        assert_eq!(
+            split_send_suffix("帮我发送一封邮件"),
+            ("帮我发送一封邮件", false)
+        );
+        assert_eq!(split_send_suffix("发送邮件"), ("发送邮件", false));
+        assert_eq!(
+            split_send_suffix("resend it to the queue"),
+            ("resend it to the queue", false)
+        );
+    }
+
+    #[test]
+    fn recorder_detection_does_not_crash() {
+        // Just verify the function runs without panicking
+        let _ = is_available();
+    }
+}
@@ -17,6 +17,9 @@ pub use traits::CommandInfo;
 // Long-standing public paths that predate the group layout.
 pub use groups::project::share;

+// Voice capture plumbing shared with the hotbar and the UI event loop.
+pub use groups::core::voice;
+
 use crate::tui::app::{App, AppAction};

 /// Result of executing a command
@@ -664,6 +667,49 @@ mod tests {
        assert!(result.message.unwrap().contains("off"));
    }

+    /// Every spelling of the voice-send command toggles the same flag
+    /// without emitting an action, and both spellings of voice-control
+    /// toggle `voice_control_enabled` on and back off.
+    #[test]
+    fn voice_send_and_voice_control_commands_toggle_state() {
+        let mut app = create_test_app();
+        assert!(!app.voice_send_enabled);
+        assert!(!app.voice_control_enabled);
+
+        for invocation in ["/voicesend", "/voice-send", "/yuyinsend", "/语音发送"] {
+            let result = execute(invocation, &mut app);
+            assert!(!result.is_error, "{invocation} should toggle cleanly");
+            assert!(result.action.is_none());
+            assert!(result.message.is_some());
+        }
+        // Four toggles land back at disabled.
+        assert!(!app.voice_send_enabled);
+
+        let result = execute("/voicecontrol", &mut app);
+        assert!(!result.is_error);
+        assert!(app.voice_control_enabled);
+        let result = execute("/voice-control", &mut app);
+        assert!(!result.is_error);
+        assert!(!app.voice_control_enabled);
+    }
+
+    /// `/voice` defers the actual capture to the UI event loop via
+    /// `AppAction::VoiceCapture`, so executing it never records audio.
+    /// On hosts without a recorder it must fail gracefully instead.
+    #[test]
+    fn voice_command_toggles_on_and_off_or_fails_gracefully() {
+        let mut app = create_test_app();
+        let result = execute("/voice", &mut app);
+        if app.voice_enabled {
+            assert!(!result.is_error);
+            assert!(matches!(result.action, Some(AppAction::VoiceCapture)));
+            let off = execute("/voice", &mut app);
+            assert!(!off.is_error);
+            assert!(off.action.is_none());
+            assert!(!app.voice_enabled);
+        } else {
+            assert!(result.is_error);
+            assert!(result.action.is_none());
+        }
+    }
+
    #[test]
    fn execute_sidebar_toggles_visibility() {
        let mut app = create_test_app();
@@ -636,6 +636,24 @@ pub enum MessageId {
    ToolFamilyVerify,
    ToolFamilyThink,
    ToolFamilyGeneric,
+    // Voice commands (/voice, /voice-send, /voice-control)
+    CmdVoiceDescription,
+    CmdVoiceSendDescription,
+    CmdVoiceControlDescription,
+    VoiceEnabled,
+    VoiceDisabled,
+    VoiceSendEnabled,
+    VoiceSendDisabled,
+    VoiceControlEnabled,
+    VoiceControlDisabled,
+    VoiceErrNoAuth,
+    VoiceErrNoRecorder,
+    VoiceErrNetwork,
+    VoiceErrEmptySend,
+    VoiceErrTooShort,
+    VoiceRecording,
+    VoiceProcessing,
+    VoiceTranscribed,
 }

 #[allow(dead_code)]
@@ -1041,6 +1059,23 @@ pub const ALL_MESSAGE_IDS: &[MessageId] = &[
    MessageId::ToolFamilyVerify,
    MessageId::ToolFamilyThink,
    MessageId::ToolFamilyGeneric,
+    MessageId::CmdVoiceDescription,
+    MessageId::CmdVoiceSendDescription,
+    MessageId::CmdVoiceControlDescription,
+    MessageId::VoiceEnabled,
+    MessageId::VoiceDisabled,
+    MessageId::VoiceSendEnabled,
+    MessageId::VoiceSendDisabled,
+    MessageId::VoiceControlEnabled,
+    MessageId::VoiceControlDisabled,
+    MessageId::VoiceErrNoAuth,
+    MessageId::VoiceErrNoRecorder,
+    MessageId::VoiceErrNetwork,
+    MessageId::VoiceErrEmptySend,
+    MessageId::VoiceErrTooShort,
+    MessageId::VoiceRecording,
+    MessageId::VoiceProcessing,
+    MessageId::VoiceTranscribed,
 ];

 pub fn tr(locale: Locale, id: MessageId) -> &'static str {
@@ -1774,6 +1809,32 @@ fn english(id: MessageId) -> &'static str {
        MessageId::ToolFamilyVerify => "verify",
        MessageId::ToolFamilyThink => "think",
        MessageId::ToolFamilyGeneric => "tool",
+        // Voice commands
+        MessageId::CmdVoiceDescription => {
+            "Toggle voice input: record speech and transcribe into the composer"
+        }
+        MessageId::CmdVoiceSendDescription => {
+            "Toggle voice auto-send: submit when the transcript ends with \"send it\""
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "Toggle voice control: AI-assisted dictation aware of the composer text"
+        }
+        MessageId::VoiceEnabled => "Voice input enabled. Speak to record.",
+        MessageId::VoiceDisabled => "Voice input disabled.",
+        MessageId::VoiceSendEnabled => "Voice auto-send enabled.",
+        MessageId::VoiceSendDisabled => "Voice auto-send disabled.",
+        MessageId::VoiceControlEnabled => "Voice control enabled.",
+        MessageId::VoiceControlDisabled => "Voice control disabled.",
+        MessageId::VoiceErrNoAuth => "Voice: no API key configured for the active provider",
+        MessageId::VoiceErrNoRecorder => {
+            "Voice: no recording tool found. Install sox, arecord, or rec."
+        }
+        MessageId::VoiceErrNetwork => "Voice: transcription request failed",
+        MessageId::VoiceErrEmptySend => "Voice: nothing to send",
+        MessageId::VoiceErrTooShort => "Voice: no speech detected, recording too short",
+        MessageId::VoiceRecording => "🎙 Recording... speak now",
+        MessageId::VoiceProcessing => "🎙 Transcribing...",
+        MessageId::VoiceTranscribed => "🎙 Transcribed",
    }
 }

@@ -2375,6 +2436,32 @@ fn vietnamese(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "xác minh",
        MessageId::ToolFamilyThink => "suy nghĩ",
        MessageId::ToolFamilyGeneric => "công cụ",
+        // Voice commands
+        MessageId::CmdVoiceDescription => {
+            "Bật/tắt nhập liệu bằng giọng nói: ghi âm và chuyển thành văn bản"
+        }
+        MessageId::CmdVoiceSendDescription => {
+            "Bật/tắt tự gửi bằng giọng nói: gửi khi bản ghi kết thúc bằng \"send it\""
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "Bật/tắt điều khiển giọng nói: đọc chính tả có AI hỗ trợ"
+        }
+        MessageId::VoiceEnabled => "Đã bật nhập liệu bằng giọng nói. Hãy nói để ghi âm.",
+        MessageId::VoiceDisabled => "Đã tắt nhập liệu bằng giọng nói.",
+        MessageId::VoiceSendEnabled => "Đã bật tự gửi bằng giọng nói.",
+        MessageId::VoiceSendDisabled => "Đã tắt tự gửi bằng giọng nói.",
+        MessageId::VoiceControlEnabled => "Đã bật điều khiển giọng nói.",
+        MessageId::VoiceControlDisabled => "Đã tắt điều khiển giọng nói.",
+        MessageId::VoiceErrNoAuth => "Giọng nói: nhà cung cấp hiện tại chưa có khóa API",
+        MessageId::VoiceErrNoRecorder => {
+            "Giọng nói: không tìm thấy công cụ ghi âm. Hãy cài sox, arecord hoặc rec."
+        }
+        MessageId::VoiceErrNetwork => "Giọng nói: yêu cầu chuyển giọng nói thất bại",
+        MessageId::VoiceErrEmptySend => "Giọng nói: không có nội dung để gửi",
+        MessageId::VoiceErrTooShort => "Giọng nói: không phát hiện giọng nói, bản ghi quá ngắn",
+        MessageId::VoiceRecording => "🎙 Đang ghi âm... hãy nói",
+        MessageId::VoiceProcessing => "🎙 Đang chuyển thành văn bản...",
+        MessageId::VoiceTranscribed => "🎙 Đã chuyển xong",
    })
 }

@@ -2530,6 +2617,28 @@ fn traditional_chinese(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "驗證",
        MessageId::ToolFamilyThink => "思考",
        MessageId::ToolFamilyGeneric => "工具",
+        // Voice commands
+        MessageId::CmdVoiceDescription => "切換語音輸入：錄製語音並轉錄為文字",
+        MessageId::CmdVoiceSendDescription => {
+            "切換語音自動傳送：轉錄以「發送」或「send it」結尾時自動提交"
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "切換語音控制：AI 輔助的語音聽寫（結合當前輸入內容）"
+        }
+        MessageId::VoiceEnabled => "語音輸入已開啟，開始說話即可錄製",
+        MessageId::VoiceDisabled => "語音輸入已關閉",
+        MessageId::VoiceSendEnabled => "語音自動傳送已開啟",
+        MessageId::VoiceSendDisabled => "語音自動傳送已關閉",
+        MessageId::VoiceControlEnabled => "語音控制已開啟",
+        MessageId::VoiceControlDisabled => "語音控制已關閉",
+        MessageId::VoiceErrNoAuth => "語音：目前供應商未設定 API 金鑰",
+        MessageId::VoiceErrNoRecorder => "語音：未找到錄音工具，請安裝 sox、arecord 或 rec",
+        MessageId::VoiceErrNetwork => "語音：轉錄請求失敗",
+        MessageId::VoiceErrEmptySend => "語音：沒有可傳送的內容",
+        MessageId::VoiceErrTooShort => "語音：未偵測到有效語音，錄製時間過短",
+        MessageId::VoiceRecording => "🎙 正在錄音...請說話",
+        MessageId::VoiceProcessing => "🎙 正在轉錄...",
+        MessageId::VoiceTranscribed => "🎙 轉錄完成",
        other => chinese_simplified(other)?,
    })
 }
@@ -3090,6 +3199,32 @@ fn japanese(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "検証",
        MessageId::ToolFamilyThink => "思考",
        MessageId::ToolFamilyGeneric => "ツール",
+        // Voice commands
+        MessageId::CmdVoiceDescription => "音声入力の切替：音声を録音してテキストに変換",
+        MessageId::CmdVoiceSendDescription => {
+            "音声自動送信の切替：転写が「send it」で終わると自動送信"
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "音声コントロールの切替：入力欄を考慮した AI 音声ディクテーション"
+        }
+        MessageId::VoiceEnabled => "音声入力を有効にしました。話すと録音されます。",
+        MessageId::VoiceDisabled => "音声入力を無効にしました。",
+        MessageId::VoiceSendEnabled => "音声自動送信を有効にしました。",
+        MessageId::VoiceSendDisabled => "音声自動送信を無効にしました。",
+        MessageId::VoiceControlEnabled => "音声コントロールを有効にしました。",
+        MessageId::VoiceControlDisabled => "音声コントロールを無効にしました。",
+        MessageId::VoiceErrNoAuth => {
+            "音声：アクティブなプロバイダーに API キーが設定されていません"
+        }
+        MessageId::VoiceErrNoRecorder => {
+            "音声：録音ツールが見つかりません。sox、arecord、rec のいずれかをインストールしてください"
+        }
+        MessageId::VoiceErrNetwork => "音声：文字起こしリクエストに失敗しました",
+        MessageId::VoiceErrEmptySend => "音声：送信する内容がありません",
+        MessageId::VoiceErrTooShort => "音声：音声が検出されませんでした。録音が短すぎます",
+        MessageId::VoiceRecording => "🎙 録音中...お話しください",
+        MessageId::VoiceProcessing => "🎙 文字起こし中...",
+        MessageId::VoiceTranscribed => "🎙 文字起こし完了",
    })
 }

@@ -3585,6 +3720,28 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "验证",
        MessageId::ToolFamilyThink => "思考",
        MessageId::ToolFamilyGeneric => "工具",
+        // Voice commands
+        MessageId::CmdVoiceDescription => "切换语音输入：录制语音并转录为文字",
+        MessageId::CmdVoiceSendDescription => {
+            "切换语音自动发送：转录以「发送」或「send it」结尾时自动提交"
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "切换语音控制：AI 辅助的语音听写（结合当前输入内容）"
+        }
+        MessageId::VoiceEnabled => "语音输入已开启，开始说话即可录制",
+        MessageId::VoiceDisabled => "语音输入已关闭",
+        MessageId::VoiceSendEnabled => "语音自动发送已开启",
+        MessageId::VoiceSendDisabled => "语音自动发送已关闭",
+        MessageId::VoiceControlEnabled => "语音控制已开启",
+        MessageId::VoiceControlDisabled => "语音控制已关闭",
+        MessageId::VoiceErrNoAuth => "语音：当前提供商未配置 API 密钥",
+        MessageId::VoiceErrNoRecorder => "语音：未找到录音工具，请安装 sox、arecord 或 rec",
+        MessageId::VoiceErrNetwork => "语音：转录请求失败",
+        MessageId::VoiceErrEmptySend => "语音：没有可发送的内容",
+        MessageId::VoiceErrTooShort => "语音：未检测到有效语音，录制时间过短",
+        MessageId::VoiceRecording => "🎙 正在录音...请说话",
+        MessageId::VoiceProcessing => "🎙 正在转录...",
+        MessageId::VoiceTranscribed => "🎙 转录完成",
    })
 }

@@ -4170,6 +4327,32 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "verificar",
        MessageId::ToolFamilyThink => "pensar",
        MessageId::ToolFamilyGeneric => "ferramenta",
+        // Voice commands
+        MessageId::CmdVoiceDescription => {
+            "Alternar entrada de voz: gravar fala e transcrever para texto"
+        }
+        MessageId::CmdVoiceSendDescription => {
+            "Alternar envio automático por voz: envia quando a transcrição termina com \"send it\""
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "Alternar controle por voz: ditado assistido por IA"
+        }
+        MessageId::VoiceEnabled => "Entrada de voz ativada. Fale para gravar.",
+        MessageId::VoiceDisabled => "Entrada de voz desativada.",
+        MessageId::VoiceSendEnabled => "Envio automático por voz ativado.",
+        MessageId::VoiceSendDisabled => "Envio automático por voz desativado.",
+        MessageId::VoiceControlEnabled => "Controle por voz ativado.",
+        MessageId::VoiceControlDisabled => "Controle por voz desativado.",
+        MessageId::VoiceErrNoAuth => "Voz: nenhuma chave de API configurada para o provedor ativo",
+        MessageId::VoiceErrNoRecorder => {
+            "Voz: nenhuma ferramenta de gravação encontrada. Instale sox, arecord ou rec."
+        }
+        MessageId::VoiceErrNetwork => "Voz: falha na solicitação de transcrição",
+        MessageId::VoiceErrEmptySend => "Voz: nada para enviar",
+        MessageId::VoiceErrTooShort => "Voz: nenhuma fala detectada, gravação muito curta",
+        MessageId::VoiceRecording => "🎙 Gravando... fale agora",
+        MessageId::VoiceProcessing => "🎙 Transcrevendo...",
+        MessageId::VoiceTranscribed => "🎙 Transcrito",
    })
 }

@@ -4765,6 +4948,34 @@ fn spanish_latin_america(id: MessageId) -> Option<&'static str> {
        MessageId::ToolFamilyVerify => "verificar",
        MessageId::ToolFamilyThink => "pensar",
        MessageId::ToolFamilyGeneric => "herramienta",
+        // Voice commands
+        MessageId::CmdVoiceDescription => {
+            "Alternar entrada de voz: grabar voz y transcribir a texto"
+        }
+        MessageId::CmdVoiceSendDescription => {
+            "Alternar envío automático por voz: envía cuando la transcripción termina con \"send it\""
+        }
+        MessageId::CmdVoiceControlDescription => {
+            "Alternar control por voz: dictado asistido por IA"
+        }
+        MessageId::VoiceEnabled => "Entrada de voz activada. Habla para grabar.",
+        MessageId::VoiceDisabled => "Entrada de voz desactivada.",
+        MessageId::VoiceSendEnabled => "Envío automático por voz activado.",
+        MessageId::VoiceSendDisabled => "Envío automático por voz desactivado.",
+        MessageId::VoiceControlEnabled => "Control por voz activado.",
+        MessageId::VoiceControlDisabled => "Control por voz desactivado.",
+        MessageId::VoiceErrNoAuth => {
+            "Voz: no hay clave de API configurada para el proveedor activo"
+        }
+        MessageId::VoiceErrNoRecorder => {
+            "Voz: no se encontró herramienta de grabación. Instala sox, arecord o rec."
+        }
+        MessageId::VoiceErrNetwork => "Voz: falló la solicitud de transcripción",
+        MessageId::VoiceErrEmptySend => "Voz: nada que enviar",
+        MessageId::VoiceErrTooShort => "Voz: no se detectó voz, grabación demasiado corta",
+        MessageId::VoiceRecording => "🎙 Grabando... habla ahora",
+        MessageId::VoiceProcessing => "🎙 Transcribiendo...",
+        MessageId::VoiceTranscribed => "🎙 Transcrito",
    })
 }

@@ -1481,6 +1481,14 @@ pub struct App {
    pub cost_currency: CostCurrency,
    pub composer_density: ComposerDensity,
    pub composer_border: bool,
+    /// Voice input state — toggled by `/voice` and the voice hotbar action.
+    pub voice_enabled: bool,
+    /// Auto-send after transcription when the transcript ends with an
+    /// explicit send instruction ("send it" / "发送"). Toggled by `/voice-send`.
+    pub voice_send_enabled: bool,
+    /// AI-assisted dictation that sees the current composer text.
+    /// Toggled by `/voice-control`.
+    pub voice_control_enabled: bool,
    pub transcript_spacing: TranscriptSpacing,
    pub sidebar_width_percent: u16,
    pub sidebar_focus: SidebarFocus,
@@ -2275,6 +2283,9 @@ impl App {
            cost_currency,
            composer_density,
            composer_border,
+            voice_enabled: false,
+            voice_send_enabled: false,
+            voice_control_enabled: false,
            transcript_spacing,
            sidebar_width_percent,
            sidebar_focus,
@@ -5323,6 +5334,11 @@ pub enum AppAction {
    SwitchWorkspace {
        workspace: PathBuf,
    },
+    /// Record from the microphone and route the transcription into the
+    /// composer (or auto-send it). Emitted by `/voice` and the voice hotbar
+    /// action; handled in the UI event loop where the live `Config` supplies
+    /// provider credentials.
+    VoiceCapture,
    /// Export and share the current session as a web URL.
    ShareSession {
        history_len: usize,
@@ -1141,7 +1141,7 @@ mod tests {

        assert!(command_labels.contains(&"/config"));
        assert!(command_labels.contains(&"/links"));
-        assert!(!command_labels.contains(&"/voice"));
+        assert!(command_labels.contains(&"/voice"));
        assert!(!command_labels.contains(&"/set"));
        assert!(!command_labels.contains(&"/deepseek"));
    }
@@ -181,7 +181,7 @@ impl HotbarAction for AppHotbarAction {

    fn is_active(&self, app: &App) -> bool {
        match self.kind {
-            AppHotbarKind::VoiceToggle => false,
+            AppHotbarKind::VoiceToggle => app.voice_enabled,
            AppHotbarKind::SessionCompact => app.is_compacting,
            AppHotbarKind::Mode(mode) => app.mode == mode,
            AppHotbarKind::ReasoningCycle => {
@@ -197,9 +197,12 @@ impl HotbarAction for AppHotbarAction {
    fn dispatch(&self, app: &mut App) -> Result<HotbarDispatch> {
        match self.kind {
            AppHotbarKind::VoiceToggle => {
-                app.status_message =
-                    Some("Voice input is not available in this terminal session yet.".to_string());
-                Ok(HotbarDispatch::Handled)
+                let result = crate::commands::voice::voice(app);
+                app.status_message = result.message;
+                match result.action {
+                    Some(action) => Ok(HotbarDispatch::AppAction(action)),
+                    None => Ok(HotbarDispatch::Handled),
+                }
            }
            AppHotbarKind::SessionCompact => {
                if app.is_compacting {
@@ -539,19 +542,36 @@ mod tests {
    }

    #[test]
-    fn voice_toggle_is_safe_until_voice_input_lands() {
+    /// Exercises the built-in `voice.toggle` hotbar action: it starts
+    /// inactive, dispatching it always leaves a status message other than
+    /// the retired placeholder, and the remaining assertions branch on
+    /// whether that dispatch set `app.voice_enabled`.
+    fn voice_toggle_dispatches_the_voice_command() {
        let registry = HotbarActionRegistry::with_builtins();
        let voice = registry.get("voice.toggle").expect("voice action");
        let mut app = test_app();

        assert!(!voice.is_active(&app));
-        assert_eq!(
-            voice.dispatch(&mut app).expect("dispatch voice"),
-            HotbarDispatch::Handled
-        );
-        assert_eq!(
+        // The toggle is wired to the /voice command. With a recorder on the
+        // host it arms voice input and defers capture to the UI event loop;
+        // without one it fails gracefully with a localized error. No audio
+        // is recorded in either case.
+        let result = voice.dispatch(&mut app).expect("dispatch voice");
+        assert!(app.status_message.is_some());
+        // The old placeholder message must be gone — voice is implemented.
+        assert_ne!(
            app.status_message.as_deref(),
            Some("Voice input is not available in this terminal session yet.")
        );
+        // Armed path: the dispatch must hand `VoiceCapture` back to the
+        // caller and the action must report itself as active.
+        if app.voice_enabled {
+            assert_eq!(
+                result,
+                HotbarDispatch::AppAction(crate::tui::app::AppAction::VoiceCapture)
+            );
+            assert!(voice.is_active(&app));
+            // A second press toggles voice input back off.
+            let off = voice.dispatch(&mut app).expect("dispatch voice off");
+            assert_eq!(off, HotbarDispatch::Handled);
+            assert!(!app.voice_enabled);
+            assert!(!voice.is_active(&app));
+        } else {
+            // Not armed: the dispatch is reported as `Handled` rather than
+            // returning an `AppAction` for the caller to run.
+            assert_eq!(result, HotbarDispatch::Handled);
+        }
    }
 }
@@ -6431,6 +6431,28 @@ async fn apply_command_result(
                let queued = build_queued_message(app, content);
                submit_or_steer_message(app, config, engine_handle, queued).await?;
            }
+            AppAction::VoiceCapture => {
+                use commands::voice::VoiceCaptureOutcome;
+                match commands::voice::capture_and_transcribe(app, config).await {
+                    Ok(VoiceCaptureOutcome::Insert(text)) => {
+                        app.insert_str(&text);
+                        app.status_message = Some(format!(
+                            "{}: {text}",
+                            tr(app.ui_locale, MessageId::VoiceTranscribed)
+                        ));
+                    }
+                    Ok(VoiceCaptureOutcome::Send(content)) => {
+                        app.status_message =
+                            Some(tr(app.ui_locale, MessageId::VoiceTranscribed).to_string());
+                        let queued = build_queued_message(app, content);
+                        submit_or_steer_message(app, config, engine_handle, queued).await?;
+                    }
+                    Err(err) => {
+                        app.voice_enabled = false;
+                        app.status_message = Some(err);
+                    }
+                }
+            }
            AppAction::ListSubAgents => {
                let _ = engine_handle.send(Op::ListSubAgents).await;
            }