Harvest PR #3051: voice input commands and hotbar integration

Port /voice, /voice-send, and /voice-control into the command strategy
registry as groups/core/voice.rs. The handlers only flip App state
(voice_enabled, voice_send_enabled, voice_control_enabled) and emit the
new AppAction::VoiceCapture; the UI event loop performs the actual
record + transcribe cycle so credentials come from the live Config
(deepseek_api_key/deepseek_base_url) instead of auth fields cached on
App, and no audio is ever recorded by the registry smoke tests.

- voice.toggle hotbar action dispatches the real /voice command and
  reports voice_enabled as its active state, replacing the placeholder.
- Recording uses sox/rec/arecord with RMS-based silence detection;
  transcription posts input_audio blocks to the provider chat
  completions API (async reqwest — the blocking client would panic
  inside the tokio event loop).
- Transcripts insert at the composer cursor via App::insert_str. With
  /voice-send enabled, a transcript ending in "send it" / 发送 strips
  the suffix and submits; a bare "send it" submits the current composer
  content. With /voice-control enabled, transcription runs through the
  AI dictation pipeline that sees the composer text.
- Failures (no recorder, no API key, short recording, network) surface
  as localized status messages and disarm voice input.
- Localized command help and status strings for all seven shipped
  locales; /voice now appears in the command palette.

Harvested from PR #3051 by @huqiantao

Co-authored-by: huqiantao <huqiantao@users.noreply.github.com>
Co-authored-by: Hunter B <hmbown@gmail.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
CodeWhale Agent
2026-06-12 14:43:27 -07:00
parent 662a459ee5
commit 9b31621b19
8 changed files with 903 additions and 11 deletions
@@ -10,6 +10,7 @@ mod hooks;
mod provider;
mod queue;
mod stash;
pub mod voice;
pub(in crate::commands) use self::core::reset_conversation_state;
@@ -43,6 +44,9 @@ impl CommandGroup for CoreCommands {
Box::new(FunctionCommand::new(&PROFILE_INFO, run_profile)),
Box::new(FunctionCommand::new(&RLM_INFO, run_rlm)),
Box::new(FunctionCommand::new(&TRANSLATE_INFO, run_translate)),
Box::new(FunctionCommand::new(&VOICE_INFO, run_voice)),
Box::new(FunctionCommand::new(&VOICE_SEND_INFO, run_voice_send)),
Box::new(FunctionCommand::new(&VOICE_CONTROL_INFO, run_voice_control)),
]
}
}
@@ -167,6 +171,24 @@ static TRANSLATE_INFO: CommandInfo = CommandInfo {
usage: "/translate",
description_id: MessageId::CmdTranslateDescription,
};
/// Registry metadata for the `/voice` command: canonical name, alternate
/// spellings, usage string, and the id of its localized description.
static VOICE_INFO: CommandInfo = CommandInfo {
name: "voice",
// The `dispatch` match arm for this command lists the same spellings;
// an alias added here needs a matching pattern there.
aliases: &["yuyin", "语音"],
usage: "/voice",
description_id: MessageId::CmdVoiceDescription,
};
/// Registry metadata for the voice auto-send toggle.
///
/// The canonical name has no hyphen (`voicesend`); the hyphenated
/// `voice-send` spelling is accepted as an alias.
static VOICE_SEND_INFO: CommandInfo = CommandInfo {
name: "voicesend",
// The `dispatch` match arm for this command lists the same spellings;
// an alias added here needs a matching pattern there.
aliases: &["voice-send", "yuyinsend", "语音发送"],
usage: "/voicesend",
description_id: MessageId::CmdVoiceSendDescription,
};
/// Registry metadata for the voice-control (AI-assisted dictation) toggle.
///
/// The canonical name has no hyphen (`voicecontrol`); the hyphenated
/// `voice-control` spelling is accepted as an alias.
static VOICE_CONTROL_INFO: CommandInfo = CommandInfo {
name: "voicecontrol",
// The `dispatch` match arm for this command lists the same spellings;
// an alias added here needs a matching pattern there.
aliases: &["voice-control", "yuyincontrol", "语音控制"],
usage: "/voicecontrol",
description_id: MessageId::CmdVoiceControlDescription,
};
fn run_registered(app: &mut App, name: &str, arg: Option<&str>) -> CommandResult {
dispatch(app, name, arg).expect("registered core command should dispatch")
@@ -232,6 +254,15 @@ fn run_rlm(app: &mut App, arg: Option<&str>) -> CommandResult {
/// Registry handler for `/translate`: forwards to `run_registered` under the
/// canonical name `"translate"`, passing the argument through unchanged.
fn run_translate(app: &mut App, arg: Option<&str>) -> CommandResult {
run_registered(app, "translate", arg)
}
/// Registry handler for `/voice`: forwards to `run_registered` under the
/// canonical name `"voice"`, passing the argument through unchanged.
fn run_voice(app: &mut App, arg: Option<&str>) -> CommandResult {
run_registered(app, "voice", arg)
}
/// Registry handler for the voice auto-send toggle: forwards to
/// `run_registered` under the canonical name `"voicesend"`.
fn run_voice_send(app: &mut App, arg: Option<&str>) -> CommandResult {
run_registered(app, "voicesend", arg)
}
/// Registry handler for the voice-control toggle: forwards to
/// `run_registered` under the canonical name `"voicecontrol"`.
fn run_voice_control(app: &mut App, arg: Option<&str>) -> CommandResult {
run_registered(app, "voicecontrol", arg)
}
pub(in crate::commands) fn dispatch(
app: &mut App,
@@ -259,6 +290,11 @@ pub(in crate::commands) fn dispatch(
"profile" | "dangan" => core::profile_switch(app, arg),
"rlm" | "recursive" | "digui" => rlm(app, arg),
"translate" | "translation" | "transale" => core::translate(app),
"voice" | "yuyin" | "语音" => voice::voice(app),
"voicesend" | "voice-send" | "yuyinsend" | "语音发送" => voice::voice_send(app),
"voicecontrol" | "voice-control" | "yuyincontrol" | "语音控制" => {
voice::voice_control(app)
}
_ => return None,
};
Some(result)
@@ -0,0 +1,541 @@
//! Voice input commands — `/voice`, `/voice-send`, `/voice-control`.
//!
//! Records audio from the default microphone, sends it to the configured
//! provider's API for transcription, and inserts the transcribed text into
//! the composer. The interaction model mirrors MiMo Code's voice UX:
//!
//! `/voice` — toggle voice input on/off (records when toggled on)
//! `/voice-send` — toggle auto-send when the transcript ends with
//! "send it" / "发送"
//! `/voice-control` — toggle AI-assisted dictation that sees the current
//! composer text
//!
//! The slash commands only flip state and emit [`AppAction::VoiceCapture`];
//! the actual capture runs in the UI event loop where the live [`Config`]
//! supplies provider credentials. That keeps the handlers side-effect free
//! (the registry smoke tests execute every command) and avoids caching
//! auth material on [`App`].
//!
//! ## Recording
//!
//! Uses platform-specific command-line tools (sox, rec, arecord) to capture
//! 16 kHz mono 16-bit PCM audio. Recording stops after one second of silence
//! following detected speech, or when the maximum duration
//! (`MAX_RECORD_SECS`, 10 s) is reached.
use std::process::{Command, Stdio};
use std::sync::LazyLock;
use std::time::Duration;
use regex::Regex;
use crate::commands::CommandResult;
use crate::config::Config;
use crate::localization::{MessageId, tr};
use crate::tui::app::{App, AppAction};
/// Transcription model requested from the provider's chat-completions API.
///
/// Sent as the `model` field of the request body built by `transcribe`.
// NOTE(review): the API key and base URL used for this request come from
// `Config::deepseek_api_key` / `Config::deepseek_base_url`, while this id
// looks MiMo-specific — confirm the configured endpoint actually serves it.
const ASR_MODEL: &str = "mimo-v2.5-asr";
/// Model used for the AI-assisted voice-control pipeline.
///
/// Sent as the `model` field of the request body built by
/// `process_voice_control`.
const VOICE_CONTROL_MODEL: &str = "mimo-v2.5";
// --- Recorder detection ----------------------------------------------------
/// Platform-specific recorder definitions.
///
/// Pairs an external capture program with the arguments that make it stream
/// raw PCM on stdout. `detect_recorder` builds the per-OS candidate list and
/// `record_audio` spawns the chosen entry.
#[derive(Debug, Clone)]
struct Recorder {
/// Program name handed to `Command::new`.
cmd: &'static str,
/// CLI arguments for piping raw 16kHz mono S16_LE PCM to stdout.
pipe_args: &'static [&'static str],
}
/// Find the first usable audio recorder for the current platform.
///
/// Candidates are probed in priority order by running `<cmd> --version` with
/// all standard streams detached. A candidate counts as usable when the
/// program can be launched at all; its exit code is deliberately ignored.
///
/// Returns `None` on platforms without a known recorder, or when none of the
/// candidates can be started.
fn detect_recorder() -> Option<Recorder> {
    let candidates: &[Recorder] = if cfg!(target_os = "macos") {
        &[
            Recorder {
                cmd: "sox",
                pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
            },
            Recorder {
                cmd: "rec",
                pipe_args: &["-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
            },
        ]
    } else if cfg!(target_os = "linux") {
        &[
            Recorder {
                cmd: "arecord",
                pipe_args: &["-f", "S16_LE", "-r", "16000", "-c", "1", "-t", "raw"],
            },
            Recorder {
                cmd: "sox",
                pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
            },
        ]
    } else if cfg!(target_os = "windows") {
        &[Recorder {
            cmd: "sox",
            pipe_args: &["-d", "-r", "16000", "-c", "1", "-b", "16", "-t", "raw", "-"],
        }]
    } else {
        &[]
    };

    candidates
        .iter()
        .find(|candidate| {
            // `status()` launches the probe *and waits for it to exit*. Using
            // `spawn()` here would drop the `Child` without reaping it, which
            // leaves one zombie process behind per probe on Unix (the
            // standard library does not wait on drop).
            Command::new(candidate.cmd)
                .arg("--version")
                .stdin(Stdio::null())
                .stdout(Stdio::null())
                .stderr(Stdio::null())
                .status()
                .is_ok()
        })
        .cloned()
}
/// Check whether voice recording is available on this system.
///
/// Thin wrapper over `detect_recorder`. Nothing is cached: every call re-runs
/// the probe, which launches an external process per candidate until one
/// starts successfully.
pub fn is_available() -> bool {
detect_recorder().is_some()
}
// --- WAV encoding ----------------------------------------------------------
/// Wrap 16 kHz mono signed 16-bit samples in a canonical 44-byte-header WAV
/// container (RIFF / `fmt ` / `data`), little-endian throughout.
///
/// An empty slice yields a header-only, 44-byte buffer.
fn encode_wav(samples: &[i16]) -> Vec<u8> {
    const SAMPLE_RATE: u32 = 16000;
    const HEADER_LEN: usize = 44;

    let payload_len = (samples.len() * 2) as u32;
    let mut wav = Vec::with_capacity(HEADER_LEN + payload_len as usize);

    // RIFF container: the size field counts everything after itself.
    wav.extend_from_slice(b"RIFF");
    wav.extend_from_slice(&(36 + payload_len).to_le_bytes());
    wav.extend_from_slice(b"WAVE");

    // Format chunk describing uncompressed mono 16-bit PCM.
    wav.extend_from_slice(b"fmt ");
    wav.extend_from_slice(&16u32.to_le_bytes()); // size of the fmt payload
    wav.extend_from_slice(&1u16.to_le_bytes()); // audio format: PCM
    wav.extend_from_slice(&1u16.to_le_bytes()); // channel count
    wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes());
    wav.extend_from_slice(&(SAMPLE_RATE * 2).to_le_bytes()); // bytes per second
    wav.extend_from_slice(&2u16.to_le_bytes()); // bytes per frame
    wav.extend_from_slice(&16u16.to_le_bytes()); // bits per sample

    // Sample payload.
    wav.extend_from_slice(b"data");
    wav.extend_from_slice(&payload_len.to_le_bytes());
    wav.extend(samples.iter().flat_map(|sample| sample.to_le_bytes()));

    wav
}
// --- Recording -------------------------------------------------------------
/// Maximum recording duration in seconds before auto-stopping.
///
/// `record_audio` compares this against wall-clock time measured from just
/// before the recorder process is spawned, and only after a chunk has been
/// read successfully.
const MAX_RECORD_SECS: u64 = 10;
/// Minimum segment duration in seconds to consider as valid speech.
///
/// Converted to a sample count at 16 kHz (4800 samples); captures with fewer
/// retained samples make `record_audio` return `None`.
const MIN_SEGMENT_SECS: f64 = 0.3;
/// Capture one utterance from the default microphone.
///
/// Spawns the detected recorder, reads its stdout in 10 ms frames and applies
/// a simple RMS energy gate: nothing is kept until the first frame that
/// counts as speech, after which every frame (including quiet ones) is
/// retained. Capture ends after one second of continuous quiet following
/// speech, when `MAX_RECORD_SECS` has elapsed, or when the recorder's output
/// ends or fails.
///
/// Returns the retained 16kHz mono S16_LE samples together with the total
/// elapsed time, or `None` if no recorder is available, the recorder could
/// not be started, or fewer than `MIN_SEGMENT_SECS` worth of samples were
/// retained.
fn record_audio() -> Option<(Vec<i16>, Duration)> {
    use std::io::Read;

    // RMS level above which a frame is treated as speech.
    const SPEECH_RMS_THRESHOLD: f64 = 500.0;
    // One second of quiet (in samples at 16 kHz) after speech ends the capture.
    const TRAILING_SILENCE_SAMPLES: u32 = 16000;

    let recorder = detect_recorder()?;
    let started_at = std::time::Instant::now();
    let mut process = Command::new(recorder.cmd)
        .args(recorder.pipe_args)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .ok()?;
    let mut pcm_stream = std::io::BufReader::new(process.stdout.take()?);

    let time_limit = Duration::from_secs(MAX_RECORD_SECS);
    let mut captured: Vec<i16> = Vec::with_capacity(16000 * MAX_RECORD_SECS as usize);
    let mut frame_bytes = [0u8; 320]; // 160 samples = 10 ms at 16 kHz
    let mut quiet_run = 0u32;
    let mut speech_seen = false;

    loop {
        if let Err(err) = pcm_stream.read_exact(&mut frame_bytes) {
            // A clean end of stream means the recorder already exited; any
            // other failure leaves it running, so stop it explicitly.
            if err.kind() != std::io::ErrorKind::UnexpectedEof {
                let _ = process.kill();
            }
            break;
        }

        let frame: Vec<i16> = frame_bytes
            .chunks_exact(2)
            .map(|pair| i16::from_le_bytes([pair[0], pair[1]]))
            .collect();

        // Energy gate: root-mean-square amplitude of this frame.
        let energy: f64 = frame
            .iter()
            .map(|&sample| (sample as f64) * (sample as f64))
            .sum();
        let rms = (energy / frame.len() as f64).sqrt();

        if rms > SPEECH_RMS_THRESHOLD {
            speech_seen = true;
            quiet_run = 0;
        } else if speech_seen {
            quiet_run += frame.len() as u32;
        }

        // Leading quiet is discarded; everything after first speech is kept.
        if speech_seen {
            captured.extend_from_slice(&frame);
        }

        let timed_out = started_at.elapsed() > time_limit;
        if timed_out || (speech_seen && quiet_run >= TRAILING_SILENCE_SAMPLES) {
            let _ = process.kill();
            break;
        }
    }

    // Reap the recorder regardless of how the loop ended.
    let _ = process.wait();
    let elapsed = started_at.elapsed();

    let required = (MIN_SEGMENT_SECS * 16000.0) as usize;
    if captured.len() >= required {
        Some((captured, elapsed))
    } else {
        None
    }
}
// --- Auto-send suffix ------------------------------------------------------
/// Matches an explicit send instruction at the end of transcribed text:
/// "send it" (any spacing/case) or 发送/發送, optionally followed by
/// whitespace and sentence-ending punctuation.
///
/// The instruction must either be the whole transcript or be separated from
/// the preceding words by whitespace or punctuation, so a word that merely
/// contains it (e.g. "resend it", "帮我发送") does not trigger a send.
///
/// Both ASCII and full-width CJK punctuation are accepted, as separators and
/// as trailing marks. Chinese transcripts are normally punctuated with the
/// full-width forms, and without them a transcript such as
/// "修复这个问题" + full-width comma + "发送" + "。" would never match.
static SEND_SUFFIX_RE: LazyLock<Regex> = LazyLock::new(|| {
    // Full-width marks are written as code-point escapes so they cannot be
    // confused with their ASCII look-alikes:
    //   \x{FF0C} full-width comma      \x{3001} ideographic (enumeration) comma
    //   \x{FF1B} full-width semicolon  \x{FF01} full-width exclamation mark
    //   \x{FF1F} full-width question mark
    Regex::new(
        r"(?i)(?:^|[\s,.。!?\x{FF0C}\x{3001}\x{FF1B}\x{FF01}\x{FF1F}]+)(?:send\s*it|发送|發送)[\s.。!?\x{FF01}\x{FF1F}]*$",
    )
    .expect("send-suffix pattern is a valid, constant regex")
});
/// Separate a transcript from a trailing spoken send instruction.
///
/// Returns the trimmed message text and a flag telling whether the transcript
/// ended with an instruction recognized by `SEND_SUFFIX_RE`. When it did, the
/// instruction (and the separator before it) is removed from the returned
/// text: `"ship the fix, send it"` → `("ship the fix", true)`.
fn split_send_suffix(text: &str) -> (&str, bool) {
    let Some(instruction) = SEND_SUFFIX_RE.find(text) else {
        return (text.trim(), false);
    };
    let message = &text[..instruction.start()];
    (message.trim(), true)
}
// --- Transcription ---------------------------------------------------------
/// Encode `data` with the `base64` crate's `STANDARD` general-purpose engine.
fn base64_encode(data: &[u8]) -> String {
    use base64::Engine as _;
    use base64::engine::general_purpose::STANDARD;

    STANDARD.encode(data)
}
/// Build the chat-completions endpoint for `base_url`.
///
/// Any run of trailing slashes on `base_url` is dropped first, so the result
/// never contains a doubled slash before `chat/completions`.
fn chat_completions_url(base_url: &str) -> String {
    const ENDPOINT: &str = "/chat/completions";

    let root = base_url.trim_end_matches('/');
    let mut url = String::with_capacity(root.len() + ENDPOINT.len());
    url.push_str(root);
    url.push_str(ENDPOINT);
    url
}
/// POST `body` as JSON to the chat-completions endpoint under `base_url`,
/// authenticating with `api_key` as a bearer token, and return the decoded
/// JSON response.
///
/// The request is given a 30-second timeout.
///
/// # Errors
///
/// Returns a plain-text description when the request cannot be sent, the
/// server answers with a non-success status, or the response body is not
/// valid JSON.
async fn post_chat_completions(
    api_key: &str,
    base_url: &str,
    body: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let endpoint = chat_completions_url(base_url);
    let bearer = format!("Bearer {api_key}");

    let client = crate::tls::reqwest_client();
    let request = client
        .post(endpoint)
        .header("Content-Type", "application/json")
        .header("Authorization", bearer)
        .timeout(Duration::from_secs(30))
        .json(&body);

    let response = match request.send().await {
        Ok(response) => response,
        Err(e) => return Err(format!("request failed: {e}")),
    };

    let status = response.status();
    if !status.is_success() {
        return Err(format!("API returned status {status}"));
    }

    match response.json().await {
        Ok(decoded) => Ok(decoded),
        Err(e) => Err(format!("failed to parse response: {e}")),
    }
}
/// Plain speech-to-text: upload the recording and return what was said.
///
/// The samples are wrapped as WAV, base64-encoded into a `data:` URL and sent
/// to the chat-completions endpoint as a single `input_audio` content block,
/// requesting `ASR_MODEL` with automatic language detection.
///
/// Returns the trimmed text of the first choice.
///
/// # Errors
///
/// Propagates request failures from `post_chat_completions`, and fails when
/// the response carries no string content in its first choice.
async fn transcribe(
    api_key: &str,
    base_url: &str,
    audio_samples: &[i16],
) -> Result<String, String> {
    let encoded_audio = base64_encode(&encode_wav(audio_samples));
    let audio_block = serde_json::json!({
        "type": "input_audio",
        "input_audio": {
            "data": format!("data:audio/wav;base64,{encoded_audio}")
        }
    });
    let request = serde_json::json!({
        "model": ASR_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [audio_block]
            }
        ],
        "asr_options": {
            "language": "auto"
        }
    });

    let response = post_chat_completions(api_key, base_url, request).await?;
    match response["choices"][0]["message"]["content"].as_str() {
        Some(transcript) => Ok(transcript.trim().to_string()),
        None => Err("no transcription in response".to_string()),
    }
}
/// AI-assisted dictation used when `/voice-control` is enabled, mirroring
/// MiMo Code's `processVoiceControl`.
///
/// Alongside the recording, the model receives a small JSON context holding
/// the current composer text (with the cursor reported as `"end"`), and is
/// asked to answer with a JSON object of the form `{"text": "..."}`.
///
/// Returns the `text` field of that object, untrimmed.
///
/// # Errors
///
/// Propagates request failures from `post_chat_completions`, and fails when
/// the first choice has no string content, the content is not valid JSON, or
/// the JSON has no string `text` field.
async fn process_voice_control(
    api_key: &str,
    base_url: &str,
    audio_samples: &[i16],
    current_text: &str,
) -> Result<String, String> {
    let encoded_audio = base64_encode(&encode_wav(audio_samples));
    let audio_url = format!("data:audio/wav;base64,{encoded_audio}");

    // Composer state is passed to the model as a serialized JSON text block.
    let composer_context = serde_json::json!({
        "current_text": current_text,
        "cursor": "end",
    })
    .to_string();

    let request = serde_json::json!({
        "model": VOICE_CONTROL_MODEL,
        "messages": [
            {
                "role": "system",
                "content": "You are a voice input assistant. Transcribe the user's speech. Output JSON: {\"text\": \"transcribed text\"}."
            },
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": composer_context },
                    { "type": "input_audio", "input_audio": { "data": audio_url } }
                ]
            }
        ],
        "response_format": { "type": "json_object" }
    });

    let reply = post_chat_completions(api_key, base_url, request).await?;
    let Some(raw) = reply["choices"][0]["message"]["content"].as_str() else {
        return Err("no response content".to_string());
    };

    let structured: serde_json::Value = match serde_json::from_str(raw) {
        Ok(value) => value,
        Err(e) => return Err(format!("failed to parse voice control JSON: {e}")),
    };

    match structured["text"].as_str() {
        Some(dictated) => Ok(dictated.to_string()),
        None => Err("no text field in voice control response".to_string()),
    }
}
// --- Capture orchestration (UI event loop) ---------------------------------
/// What the UI should do with a finished capture.
///
/// Produced by `capture_and_transcribe`; the payload is always non-empty.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VoiceCaptureOutcome {
/// Insert the transcribed text into the composer at the cursor.
Insert(String),
/// Submit this text as a message (auto-send).
///
/// Holds either the transcript with the spoken send instruction removed
/// or, for a bare send instruction, the text previously in the composer.
Send(String),
}
/// Perform a complete record + transcribe cycle.
///
/// Runs in the UI event loop (see [`AppAction::VoiceCapture`]) so provider
/// credentials come from the live [`Config`] rather than state cached on
/// [`App`]. Recording happens on a blocking thread; transcription uses the
/// shared async HTTP client. Every failure path returns a localized message
/// so callers can surface it as a status line.
pub async fn capture_and_transcribe(
app: &mut App,
config: &Config,
) -> Result<VoiceCaptureOutcome, String> {
let locale = app.ui_locale;
if !is_available() {
return Err(tr(locale, MessageId::VoiceErrNoRecorder).to_string());
}
let api_key = config
.deepseek_api_key()
.map_err(|_| tr(locale, MessageId::VoiceErrNoAuth).to_string())?;
let base_url = config.deepseek_base_url();
app.status_message = Some(tr(locale, MessageId::VoiceRecording).to_string());
let (samples, _duration) = tokio::task::spawn_blocking(record_audio)
.await
.ok()
.flatten()
.ok_or_else(|| tr(locale, MessageId::VoiceErrTooShort).to_string())?;
app.status_message = Some(tr(locale, MessageId::VoiceProcessing).to_string());
let text = if app.voice_control_enabled {
process_voice_control(&api_key, &base_url, &samples, &app.composer.input).await
} else {
transcribe(&api_key, &base_url, &samples).await
}
.map_err(|e| format!("{}: {e}", tr(locale, MessageId::VoiceErrNetwork)))?;
let clean = text.trim();
if app.voice_send_enabled {
let (remainder, wants_send) = split_send_suffix(clean);
if wants_send {
// A bare "send it" submits whatever is already in the composer.
let outgoing = if remainder.is_empty() {
let existing = app.composer.input.trim().to_string();
if !existing.is_empty() {
app.clear_input();
}
existing
} else {
remainder.to_string()
};
if outgoing.is_empty() {
return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string());
}
return Ok(VoiceCaptureOutcome::Send(outgoing));
}
}
if clean.is_empty() {
return Err(tr(locale, MessageId::VoiceErrEmptySend).to_string());
}
Ok(VoiceCaptureOutcome::Insert(clean.to_string()))
}
// --- Command handlers ------------------------------------------------------
/// Handle the `/voice` command: toggle voice input. Toggling on requests a
/// one-shot recording + transcription via [`AppAction::VoiceCapture`].
pub fn voice(app: &mut App) -> CommandResult {
let locale = app.ui_locale;
if app.voice_enabled {
app.voice_enabled = false;
return CommandResult::message(tr(locale, MessageId::VoiceDisabled));
}
if !is_available() {
return CommandResult::error(tr(locale, MessageId::VoiceErrNoRecorder));
}
app.voice_enabled = true;
CommandResult::with_message_and_action(
tr(locale, MessageId::VoiceEnabled),
AppAction::VoiceCapture,
)
}
/// `/voice-send` handler: flip the auto-send flag and report the new state
/// as a localized message.
pub fn voice_send(app: &mut App) -> CommandResult {
    let now_enabled = !app.voice_send_enabled;
    app.voice_send_enabled = now_enabled;

    let status = if now_enabled {
        MessageId::VoiceSendEnabled
    } else {
        MessageId::VoiceSendDisabled
    };
    CommandResult::message(tr(app.ui_locale, status))
}
/// `/voice-control` handler: flip the AI-assisted dictation flag and report
/// the new state as a localized message.
pub fn voice_control(app: &mut App) -> CommandResult {
    let now_enabled = !app.voice_control_enabled;
    app.voice_control_enabled = now_enabled;

    let status = if now_enabled {
        MessageId::VoiceControlEnabled
    } else {
        MessageId::VoiceControlDisabled
    };
    CommandResult::message(tr(app.ui_locale, status))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One second of silence must carry a RIFF/WAVE/fmt header whose RIFF
    /// size field accounts for the 32000-byte payload.
    #[test]
    fn wav_encoding_produces_valid_header() {
        let one_second_of_silence = [0i16; 16000];
        let encoded = encode_wav(&one_second_of_silence);

        assert_eq!(&encoded[0..4], b"RIFF");
        assert_eq!(&encoded[8..12], b"WAVE");
        assert_eq!(&encoded[12..16], b"fmt ");

        let payload_bytes = 32000u32; // 16000 samples * 2 bytes
        assert_eq!(&encoded[4..8], &(36 + payload_bytes).to_le_bytes());
    }

    /// No samples still produces the bare 44-byte header.
    #[test]
    fn wav_encoding_empty_is_minimal() {
        let encoded = encode_wav(&[]);

        assert_eq!(encoded.len(), 44);
        assert_eq!(&encoded[4..8], &36u32.to_le_bytes());
    }

    /// Transcripts that end with a send instruction: (input, expected remainder).
    #[test]
    fn send_suffix_detected_and_stripped() {
        let cases = [
            ("send it", ""),
            ("Send It!", ""),
            ("发送", ""),
            ("發送。", ""),
            ("ship the fix, send it", "ship the fix"),
            ("修复这个问题,发送", "修复这个问题"),
        ];

        for &(transcript, remainder) in &cases {
            assert_eq!(split_send_suffix(transcript), (remainder, true));
        }
    }

    /// Transcripts that merely mention sending must come back untouched.
    #[test]
    fn send_suffix_leaves_plain_text_alone() {
        let untouched = [
            "send it now",
            "帮我发送一封邮件",
            "发送邮件",
            "resend it to the queue",
        ];

        for &transcript in &untouched {
            assert_eq!(split_send_suffix(transcript), (transcript, false));
        }
    }

    /// Smoke test only: the result depends on the host, the call must not panic.
    #[test]
    fn recorder_detection_does_not_crash() {
        let _ = is_available();
    }
}
+46
View File
@@ -17,6 +17,9 @@ pub use traits::CommandInfo;
// Long-standing public paths that predate the group layout.
pub use groups::project::share;
// Voice capture plumbing shared with the hotbar and the UI event loop.
pub use groups::core::voice;
use crate::tui::app::{App, AppAction};
/// Result of executing a command
@@ -664,6 +667,49 @@ mod tests {
assert!(result.message.unwrap().contains("off"));
}
/// Every spelling of the auto-send command toggles cleanly with no action,
/// and both spellings of the voice-control command flip its flag.
#[test]
fn voice_send_and_voice_control_commands_toggle_state() {
    let mut app = create_test_app();
    assert!(!app.voice_send_enabled);
    assert!(!app.voice_control_enabled);

    let send_spellings = ["/voicesend", "/voice-send", "/yuyinsend", "/语音发送"];
    for invocation in send_spellings.iter().copied() {
        let result = execute(invocation, &mut app);
        assert!(!result.is_error, "{invocation} should toggle cleanly");
        assert!(result.action.is_none());
        assert!(result.message.is_some());
    }
    // An even number of toggles (four) ends where it started: disabled.
    assert!(!app.voice_send_enabled);

    // First spelling switches voice control on, the alias switches it off.
    let switched_on = execute("/voicecontrol", &mut app);
    assert!(!switched_on.is_error);
    assert!(app.voice_control_enabled);

    let switched_off = execute("/voice-control", &mut app);
    assert!(!switched_off.is_error);
    assert!(!app.voice_control_enabled);
}
/// Executing `/voice` must never record audio: the capture itself is
/// deferred to the UI event loop through `AppAction::VoiceCapture`.
/// Hosts with no recorder installed must get a clean error instead.
#[test]
fn voice_command_toggles_on_and_off_or_fails_gracefully() {
    let mut app = create_test_app();
    let first = execute("/voice", &mut app);

    if !app.voice_enabled {
        // No recorder on this host: the command reports an error and
        // requests nothing.
        assert!(first.is_error);
        assert!(first.action.is_none());
        return;
    }

    // Recorder present: enabling requests a capture...
    assert!(!first.is_error);
    assert!(matches!(first.action, Some(AppAction::VoiceCapture)));

    // ...and a second invocation switches voice input back off.
    let second = execute("/voice", &mut app);
    assert!(!second.is_error);
    assert!(second.action.is_none());
    assert!(!app.voice_enabled);
}
#[test]
fn execute_sidebar_toggles_visibility() {
let mut app = create_test_app();
+211
View File
@@ -636,6 +636,24 @@ pub enum MessageId {
ToolFamilyVerify,
ToolFamilyThink,
ToolFamilyGeneric,
// Voice commands (/voice, /voice-send, /voice-control)
CmdVoiceDescription,
CmdVoiceSendDescription,
CmdVoiceControlDescription,
VoiceEnabled,
VoiceDisabled,
VoiceSendEnabled,
VoiceSendDisabled,
VoiceControlEnabled,
VoiceControlDisabled,
VoiceErrNoAuth,
VoiceErrNoRecorder,
VoiceErrNetwork,
VoiceErrEmptySend,
VoiceErrTooShort,
VoiceRecording,
VoiceProcessing,
VoiceTranscribed,
}
#[allow(dead_code)]
@@ -1041,6 +1059,23 @@ pub const ALL_MESSAGE_IDS: &[MessageId] = &[
MessageId::ToolFamilyVerify,
MessageId::ToolFamilyThink,
MessageId::ToolFamilyGeneric,
MessageId::CmdVoiceDescription,
MessageId::CmdVoiceSendDescription,
MessageId::CmdVoiceControlDescription,
MessageId::VoiceEnabled,
MessageId::VoiceDisabled,
MessageId::VoiceSendEnabled,
MessageId::VoiceSendDisabled,
MessageId::VoiceControlEnabled,
MessageId::VoiceControlDisabled,
MessageId::VoiceErrNoAuth,
MessageId::VoiceErrNoRecorder,
MessageId::VoiceErrNetwork,
MessageId::VoiceErrEmptySend,
MessageId::VoiceErrTooShort,
MessageId::VoiceRecording,
MessageId::VoiceProcessing,
MessageId::VoiceTranscribed,
];
pub fn tr(locale: Locale, id: MessageId) -> &'static str {
@@ -1774,6 +1809,32 @@ fn english(id: MessageId) -> &'static str {
MessageId::ToolFamilyVerify => "verify",
MessageId::ToolFamilyThink => "think",
MessageId::ToolFamilyGeneric => "tool",
// Voice commands
MessageId::CmdVoiceDescription => {
"Toggle voice input: record speech and transcribe into the composer"
}
MessageId::CmdVoiceSendDescription => {
"Toggle voice auto-send: submit when the transcript ends with \"send it\""
}
MessageId::CmdVoiceControlDescription => {
"Toggle voice control: AI-assisted dictation aware of the composer text"
}
MessageId::VoiceEnabled => "Voice input enabled. Speak to record.",
MessageId::VoiceDisabled => "Voice input disabled.",
MessageId::VoiceSendEnabled => "Voice auto-send enabled.",
MessageId::VoiceSendDisabled => "Voice auto-send disabled.",
MessageId::VoiceControlEnabled => "Voice control enabled.",
MessageId::VoiceControlDisabled => "Voice control disabled.",
MessageId::VoiceErrNoAuth => "Voice: no API key configured for the active provider",
MessageId::VoiceErrNoRecorder => {
"Voice: no recording tool found. Install sox, arecord, or rec."
}
MessageId::VoiceErrNetwork => "Voice: transcription request failed",
MessageId::VoiceErrEmptySend => "Voice: nothing to send",
MessageId::VoiceErrTooShort => "Voice: no speech detected, recording too short",
MessageId::VoiceRecording => "🎙 Recording... speak now",
MessageId::VoiceProcessing => "🎙 Transcribing...",
MessageId::VoiceTranscribed => "🎙 Transcribed",
}
}
@@ -2375,6 +2436,32 @@ fn vietnamese(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "xác minh",
MessageId::ToolFamilyThink => "suy nghĩ",
MessageId::ToolFamilyGeneric => "công cụ",
// Voice commands
MessageId::CmdVoiceDescription => {
"Bật/tắt nhập liệu bằng giọng nói: ghi âm và chuyển thành văn bản"
}
MessageId::CmdVoiceSendDescription => {
"Bật/tắt tự gửi bằng giọng nói: gửi khi bản ghi kết thúc bằng \"send it\""
}
MessageId::CmdVoiceControlDescription => {
"Bật/tắt điều khiển giọng nói: đọc chính tả có AI hỗ trợ"
}
MessageId::VoiceEnabled => "Đã bật nhập liệu bằng giọng nói. Hãy nói để ghi âm.",
MessageId::VoiceDisabled => "Đã tắt nhập liệu bằng giọng nói.",
MessageId::VoiceSendEnabled => "Đã bật tự gửi bằng giọng nói.",
MessageId::VoiceSendDisabled => "Đã tắt tự gửi bằng giọng nói.",
MessageId::VoiceControlEnabled => "Đã bật điều khiển giọng nói.",
MessageId::VoiceControlDisabled => "Đã tắt điều khiển giọng nói.",
MessageId::VoiceErrNoAuth => "Giọng nói: nhà cung cấp hiện tại chưa có khóa API",
MessageId::VoiceErrNoRecorder => {
"Giọng nói: không tìm thấy công cụ ghi âm. Hãy cài sox, arecord hoặc rec."
}
MessageId::VoiceErrNetwork => "Giọng nói: yêu cầu chuyển giọng nói thất bại",
MessageId::VoiceErrEmptySend => "Giọng nói: không có nội dung để gửi",
MessageId::VoiceErrTooShort => "Giọng nói: không phát hiện giọng nói, bản ghi quá ngắn",
MessageId::VoiceRecording => "🎙 Đang ghi âm... hãy nói",
MessageId::VoiceProcessing => "🎙 Đang chuyển thành văn bản...",
MessageId::VoiceTranscribed => "🎙 Đã chuyển xong",
})
}
@@ -2530,6 +2617,28 @@ fn traditional_chinese(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "驗證",
MessageId::ToolFamilyThink => "思考",
MessageId::ToolFamilyGeneric => "工具",
// Voice commands
MessageId::CmdVoiceDescription => "切換語音輸入:錄製語音並轉錄為文字",
MessageId::CmdVoiceSendDescription => {
"切換語音自動傳送:轉錄以「發送」或「send it」結尾時自動提交"
}
MessageId::CmdVoiceControlDescription => {
"切換語音控制:AI 輔助的語音聽寫(結合當前輸入內容)"
}
MessageId::VoiceEnabled => "語音輸入已開啟,開始說話即可錄製",
MessageId::VoiceDisabled => "語音輸入已關閉",
MessageId::VoiceSendEnabled => "語音自動傳送已開啟",
MessageId::VoiceSendDisabled => "語音自動傳送已關閉",
MessageId::VoiceControlEnabled => "語音控制已開啟",
MessageId::VoiceControlDisabled => "語音控制已關閉",
MessageId::VoiceErrNoAuth => "語音:目前供應商未設定 API 金鑰",
MessageId::VoiceErrNoRecorder => "語音:未找到錄音工具,請安裝 sox、arecord 或 rec",
MessageId::VoiceErrNetwork => "語音:轉錄請求失敗",
MessageId::VoiceErrEmptySend => "語音:沒有可傳送的內容",
MessageId::VoiceErrTooShort => "語音:未偵測到有效語音,錄製時間過短",
MessageId::VoiceRecording => "🎙 正在錄音...請說話",
MessageId::VoiceProcessing => "🎙 正在轉錄...",
MessageId::VoiceTranscribed => "🎙 轉錄完成",
other => chinese_simplified(other)?,
})
}
@@ -3090,6 +3199,32 @@ fn japanese(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "検証",
MessageId::ToolFamilyThink => "思考",
MessageId::ToolFamilyGeneric => "ツール",
// Voice commands
MessageId::CmdVoiceDescription => "音声入力の切替:音声を録音してテキストに変換",
MessageId::CmdVoiceSendDescription => {
"音声自動送信の切替:転写が「send it」で終わると自動送信"
}
MessageId::CmdVoiceControlDescription => {
"音声コントロールの切替:入力欄を考慮した AI 音声ディクテーション"
}
MessageId::VoiceEnabled => "音声入力を有効にしました。話すと録音されます。",
MessageId::VoiceDisabled => "音声入力を無効にしました。",
MessageId::VoiceSendEnabled => "音声自動送信を有効にしました。",
MessageId::VoiceSendDisabled => "音声自動送信を無効にしました。",
MessageId::VoiceControlEnabled => "音声コントロールを有効にしました。",
MessageId::VoiceControlDisabled => "音声コントロールを無効にしました。",
MessageId::VoiceErrNoAuth => {
"音声:アクティブなプロバイダーに API キーが設定されていません"
}
MessageId::VoiceErrNoRecorder => {
"音声:録音ツールが見つかりません。sox、arecord、rec のいずれかをインストールしてください"
}
MessageId::VoiceErrNetwork => "音声:文字起こしリクエストに失敗しました",
MessageId::VoiceErrEmptySend => "音声:送信する内容がありません",
MessageId::VoiceErrTooShort => "音声:音声が検出されませんでした。録音が短すぎます",
MessageId::VoiceRecording => "🎙 録音中...お話しください",
MessageId::VoiceProcessing => "🎙 文字起こし中...",
MessageId::VoiceTranscribed => "🎙 文字起こし完了",
})
}
@@ -3585,6 +3720,28 @@ fn chinese_simplified(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "验证",
MessageId::ToolFamilyThink => "思考",
MessageId::ToolFamilyGeneric => "工具",
// Voice commands
MessageId::CmdVoiceDescription => "切换语音输入:录制语音并转录为文字",
MessageId::CmdVoiceSendDescription => {
"切换语音自动发送:转录以「发送」或「send it」结尾时自动提交"
}
MessageId::CmdVoiceControlDescription => {
"切换语音控制:AI 辅助的语音听写(结合当前输入内容)"
}
MessageId::VoiceEnabled => "语音输入已开启,开始说话即可录制",
MessageId::VoiceDisabled => "语音输入已关闭",
MessageId::VoiceSendEnabled => "语音自动发送已开启",
MessageId::VoiceSendDisabled => "语音自动发送已关闭",
MessageId::VoiceControlEnabled => "语音控制已开启",
MessageId::VoiceControlDisabled => "语音控制已关闭",
MessageId::VoiceErrNoAuth => "语音:当前提供商未配置 API 密钥",
MessageId::VoiceErrNoRecorder => "语音:未找到录音工具,请安装 sox、arecord 或 rec",
MessageId::VoiceErrNetwork => "语音:转录请求失败",
MessageId::VoiceErrEmptySend => "语音:没有可发送的内容",
MessageId::VoiceErrTooShort => "语音:未检测到有效语音,录制时间过短",
MessageId::VoiceRecording => "🎙 正在录音...请说话",
MessageId::VoiceProcessing => "🎙 正在转录...",
MessageId::VoiceTranscribed => "🎙 转录完成",
})
}
@@ -4170,6 +4327,32 @@ fn portuguese_brazil(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "verificar",
MessageId::ToolFamilyThink => "pensar",
MessageId::ToolFamilyGeneric => "ferramenta",
// Voice commands
MessageId::CmdVoiceDescription => {
"Alternar entrada de voz: gravar fala e transcrever para texto"
}
MessageId::CmdVoiceSendDescription => {
"Alternar envio automático por voz: envia quando a transcrição termina com \"send it\""
}
MessageId::CmdVoiceControlDescription => {
"Alternar controle por voz: ditado assistido por IA"
}
MessageId::VoiceEnabled => "Entrada de voz ativada. Fale para gravar.",
MessageId::VoiceDisabled => "Entrada de voz desativada.",
MessageId::VoiceSendEnabled => "Envio automático por voz ativado.",
MessageId::VoiceSendDisabled => "Envio automático por voz desativado.",
MessageId::VoiceControlEnabled => "Controle por voz ativado.",
MessageId::VoiceControlDisabled => "Controle por voz desativado.",
MessageId::VoiceErrNoAuth => "Voz: nenhuma chave de API configurada para o provedor ativo",
MessageId::VoiceErrNoRecorder => {
"Voz: nenhuma ferramenta de gravação encontrada. Instale sox, arecord ou rec."
}
MessageId::VoiceErrNetwork => "Voz: falha na solicitação de transcrição",
MessageId::VoiceErrEmptySend => "Voz: nada para enviar",
MessageId::VoiceErrTooShort => "Voz: nenhuma fala detectada, gravação muito curta",
MessageId::VoiceRecording => "🎙 Gravando... fale agora",
MessageId::VoiceProcessing => "🎙 Transcrevendo...",
MessageId::VoiceTranscribed => "🎙 Transcrito",
})
}
@@ -4765,6 +4948,34 @@ fn spanish_latin_america(id: MessageId) -> Option<&'static str> {
MessageId::ToolFamilyVerify => "verificar",
MessageId::ToolFamilyThink => "pensar",
MessageId::ToolFamilyGeneric => "herramienta",
// Voice commands
MessageId::CmdVoiceDescription => {
"Alternar entrada de voz: grabar voz y transcribir a texto"
}
MessageId::CmdVoiceSendDescription => {
"Alternar envío automático por voz: envía cuando la transcripción termina con \"send it\""
}
MessageId::CmdVoiceControlDescription => {
"Alternar control por voz: dictado asistido por IA"
}
MessageId::VoiceEnabled => "Entrada de voz activada. Habla para grabar.",
MessageId::VoiceDisabled => "Entrada de voz desactivada.",
MessageId::VoiceSendEnabled => "Envío automático por voz activado.",
MessageId::VoiceSendDisabled => "Envío automático por voz desactivado.",
MessageId::VoiceControlEnabled => "Control por voz activado.",
MessageId::VoiceControlDisabled => "Control por voz desactivado.",
MessageId::VoiceErrNoAuth => {
"Voz: no hay clave de API configurada para el proveedor activo"
}
MessageId::VoiceErrNoRecorder => {
"Voz: no se encontró herramienta de grabación. Instala sox, arecord o rec."
}
MessageId::VoiceErrNetwork => "Voz: falló la solicitud de transcripción",
MessageId::VoiceErrEmptySend => "Voz: nada que enviar",
MessageId::VoiceErrTooShort => "Voz: no se detectó voz, grabación demasiado corta",
MessageId::VoiceRecording => "🎙 Grabando... habla ahora",
MessageId::VoiceProcessing => "🎙 Transcribiendo...",
MessageId::VoiceTranscribed => "🎙 Transcrito",
})
}
+16
View File
@@ -1481,6 +1481,14 @@ pub struct App {
pub cost_currency: CostCurrency,
pub composer_density: ComposerDensity,
pub composer_border: bool,
/// Voice input state — toggled by `/voice` and the voice hotbar action.
pub voice_enabled: bool,
/// Auto-send after transcription when the transcript ends with an
/// explicit send instruction ("send it" / "发送"). Toggled by `/voice-send`.
pub voice_send_enabled: bool,
/// AI-assisted dictation that sees the current composer text.
/// Toggled by `/voice-control`.
pub voice_control_enabled: bool,
pub transcript_spacing: TranscriptSpacing,
pub sidebar_width_percent: u16,
pub sidebar_focus: SidebarFocus,
@@ -2275,6 +2283,9 @@ impl App {
cost_currency,
composer_density,
composer_border,
voice_enabled: false,
voice_send_enabled: false,
voice_control_enabled: false,
transcript_spacing,
sidebar_width_percent,
sidebar_focus,
@@ -5323,6 +5334,11 @@ pub enum AppAction {
SwitchWorkspace {
workspace: PathBuf,
},
/// Record from the microphone and route the transcription into the
/// composer (or auto-send it). Emitted by `/voice` and the voice hotbar
/// action; handled in the UI event loop where the live `Config` supplies
/// provider credentials.
VoiceCapture,
/// Export and share the current session as a web URL.
ShareSession {
history_len: usize,
+1 -1
View File
@@ -1141,7 +1141,7 @@ mod tests {
assert!(command_labels.contains(&"/config"));
assert!(command_labels.contains(&"/links"));
assert!(!command_labels.contains(&"/voice"));
assert!(command_labels.contains(&"/voice"));
assert!(!command_labels.contains(&"/set"));
assert!(!command_labels.contains(&"/deepseek"));
}
+30 -10
View File
@@ -181,7 +181,7 @@ impl HotbarAction for AppHotbarAction {
fn is_active(&self, app: &App) -> bool {
match self.kind {
AppHotbarKind::VoiceToggle => false,
AppHotbarKind::VoiceToggle => app.voice_enabled,
AppHotbarKind::SessionCompact => app.is_compacting,
AppHotbarKind::Mode(mode) => app.mode == mode,
AppHotbarKind::ReasoningCycle => {
@@ -197,9 +197,12 @@ impl HotbarAction for AppHotbarAction {
fn dispatch(&self, app: &mut App) -> Result<HotbarDispatch> {
match self.kind {
AppHotbarKind::VoiceToggle => {
app.status_message =
Some("Voice input is not available in this terminal session yet.".to_string());
Ok(HotbarDispatch::Handled)
let result = crate::commands::voice::voice(app);
app.status_message = result.message;
match result.action {
Some(action) => Ok(HotbarDispatch::AppAction(action)),
None => Ok(HotbarDispatch::Handled),
}
}
AppHotbarKind::SessionCompact => {
if app.is_compacting {
@@ -539,19 +542,36 @@ mod tests {
}
// Exercises the `voice.toggle` hotbar action end to end: the placeholder
// status text must be gone, and the dispatch result must agree with whatever
// `app.voice_enabled` ends up being after the press.
#[test]
fn voice_toggle_is_safe_until_voice_input_lands() {
fn voice_toggle_dispatches_the_voice_command() {
let registry = HotbarActionRegistry::with_builtins();
let voice = registry.get("voice.toggle").expect("voice action");
let mut app = test_app();
// Precondition: the action reports inactive before any dispatch.
assert!(!voice.is_active(&app));
assert_eq!(
voice.dispatch(&mut app).expect("dispatch voice"),
HotbarDispatch::Handled
);
assert_eq!(
// The toggle is wired to the /voice command. With a recorder on the
// host it arms voice input and defers capture to the UI event loop;
// without one it fails gracefully with a localized error. No audio
// is recorded in either case.
let result = voice.dispatch(&mut app).expect("dispatch voice");
assert!(app.status_message.is_some());
// The old placeholder message must be gone — voice is implemented.
assert_ne!(
app.status_message.as_deref(),
Some("Voice input is not available in this terminal session yet.")
);
// Armed path: dispatch must hand back the capture action and the hotbar
// action must now report itself as active.
if app.voice_enabled {
assert_eq!(
result,
HotbarDispatch::AppAction(crate::tui::app::AppAction::VoiceCapture)
);
assert!(voice.is_active(&app));
// A second press toggles voice input back off.
let off = voice.dispatch(&mut app).expect("dispatch voice off");
assert_eq!(off, HotbarDispatch::Handled);
assert!(!app.voice_enabled);
assert!(!voice.is_active(&app));
} else {
// Not armed: dispatch reports plain `Handled`, so no capture action is
// forwarded to the caller.
assert_eq!(result, HotbarDispatch::Handled);
}
}
}
+22
View File
@@ -6431,6 +6431,28 @@ async fn apply_command_result(
let queued = build_queued_message(app, content);
submit_or_steer_message(app, config, engine_handle, queued).await?;
}
// Run one voice capture. `capture_and_transcribe` receives the live `config`
// and yields either text to insert into the composer or content to submit.
AppAction::VoiceCapture => {
use commands::voice::VoiceCaptureOutcome;
match commands::voice::capture_and_transcribe(app, config).await {
// Insert the transcript through `App::insert_str` and echo it in the
// status line after the localized "transcribed" label.
Ok(VoiceCaptureOutcome::Insert(text)) => {
app.insert_str(&text);
app.status_message = Some(format!(
"{}: {text}",
tr(app.ui_locale, MessageId::VoiceTranscribed)
));
}
// Auto-send: set the status first, then wrap the content as a queued
// message and submit it; a submit error propagates to the caller via `?`.
Ok(VoiceCaptureOutcome::Send(content)) => {
app.status_message =
Some(tr(app.ui_locale, MessageId::VoiceTranscribed).to_string());
let queued = build_queued_message(app, content);
submit_or_steer_message(app, config, engine_handle, queued).await?;
}
// Any capture/transcription failure turns voice input off and shows the
// returned error text as the status message.
Err(err) => {
app.voice_enabled = false;
app.status_message = Some(err);
}
}
}
AppAction::ListSubAgents => {
let _ = engine_handle.send(Op::ListSubAgents).await;
}