Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add speech input support #1456

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions crates/goose-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ goose-mcp = { path = "../goose-mcp" }
mcp-client = { path = "../mcp-client" }
mcp-server = { path = "../mcp-server" }
mcp-core = { path = "../mcp-core" }
cpal = "0.15.2"
whisper-rs = "0.8.0"
vosk = "0.2.0"
clap = { version = "4.4", features = ["derive"] }
cliclack = "0.3.5"
console = "0.15.8"
Expand Down
58 changes: 58 additions & 0 deletions crates/goose-cli/src/session/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,49 @@ use anyhow::Result;
use rustyline::Editor;
use shlex;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::Mutex;

use super::completion::GooseCompleter;
use super::speech::SpeechRecognizer;

static SPEECH_RECOGNIZER: once_cell::sync::Lazy<Arc<Mutex<Option<SpeechRecognizer>>>> =
once_cell::sync::Lazy::new(|| Arc::new(Mutex::new(None)));

/// Poll the global speech recognizer for freshly transcribed text.
///
/// Returns `Ok(None)` when voice input has never been enabled, when it is
/// enabled but not currently recording, or when no new text is available;
/// propagates any transcription error from `get_text`.
fn check_voice_input() -> Result<Option<String>> {
    let guard = SPEECH_RECOGNIZER.lock().unwrap();
    match guard.as_ref() {
        // Only consult the recognizer while it is actively recording.
        Some(rec) if rec.is_listening() => rec.get_text(),
        _ => Ok(None),
    }
}

/// Toggle voice recording on or off (bound to Ctrl+V and the `/voice` command).
///
/// The first invocation lazily constructs the global `SpeechRecognizer` and
/// immediately begins recording; later invocations flip recording on/off.
///
/// # Errors
/// Propagates failures from recognizer construction or from starting the
/// audio capture stream.
pub fn toggle_voice_input() -> Result<()> {
    let mut guard = SPEECH_RECOGNIZER.lock().unwrap();
    if let Some(rec) = guard.as_mut() {
        if rec.is_listening() {
            rec.stop_listening();
            println!("Voice recording stopped.");
        } else {
            rec.start_listening()?;
            println!("Voice recording started.");
        }
    } else {
        // First toggle: set up recognition and start capturing right away.
        let mut rec = SpeechRecognizer::new()?;
        rec.start_listening()?;
        *guard = Some(rec);
        println!("Voice input enabled. Press Ctrl+V to start/stop recording.");
    }
    Ok(())
}

#[derive(Debug)]
pub enum InputResult {
Expand All @@ -12,6 +53,7 @@ pub enum InputResult {
AddExtension(String),
AddBuiltin(String),
ToggleTheme,
ToggleSpeech,
Retry,
ListPrompts(Option<String>),
PromptCommand(PromptCommandOptions),
Expand All @@ -27,12 +69,27 @@ pub struct PromptCommandOptions {
pub fn get_input(
editor: &mut Editor<GooseCompleter, rustyline::history::DefaultHistory>,
) -> Result<InputResult> {
// Check if we have any voice input
if let Some(text) = check_voice_input()? {
return Ok(InputResult::Message(text));
}
// Ensure Ctrl-J binding is set for newlines
editor.bind_sequence(
rustyline::KeyEvent(rustyline::KeyCode::Char('j'), rustyline::Modifiers::CTRL),
rustyline::EventHandler::Simple(rustyline::Cmd::Newline),
);

// Add Ctrl-V binding for voice recording toggle
editor.bind_sequence(
rustyline::KeyEvent(rustyline::KeyCode::Char('v'), rustyline::Modifiers::CTRL),
rustyline::EventHandler::Simple(rustyline::Cmd::Custom(Box::new(|editor| {
if let Err(e) = toggle_voice_input() {
println!("Error toggling voice input: {}", e);
}
Ok(())
}))),
);

let prompt = format!("{} ", console::style("( O)>").cyan().bold());
let input = match editor.readline(&prompt) {
Ok(text) => text,
Expand Down Expand Up @@ -72,6 +129,7 @@ fn handle_slash_command(input: &str) -> Option<InputResult> {
Some(InputResult::Retry)
}
"/t" => Some(InputResult::ToggleTheme),
"/voice" => Some(InputResult::ToggleSpeech),
"/prompts" => Some(InputResult::ListPrompts(None)),
s if s.starts_with("/prompts ") => {
// Parse arguments for /prompts command
Expand Down
97 changes: 97 additions & 0 deletions crates/goose-cli/src/session/speech.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
use anyhow::Result;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::{Arc, Mutex};
use vosk::{Model, Recognizer};

/// Captures microphone audio via cpal and transcribes it with a Vosk model,
/// exposing the recognized text to the CLI input loop.
pub struct SpeechRecognizer {
    // Loaded Vosk model (read from the "model" path in `new`).
    model: Model,
    // Receives raw f32 sample buffers forwarded by the capture callback;
    // `None` until `start_listening` has been called.
    audio_receiver: Option<Receiver<Vec<f32>>>,
    // Signals the capture side to stop; taken (set to `None`) by
    // `stop_listening`.
    stop_sender: Option<Sender<()>>,
    // Shared flag so callers can cheaply query recording state.
    is_listening: Arc<Mutex<bool>>,
    // NOTE(review): the cpal `Stream` built in `start_listening` is not
    // stored on this struct, so it is dropped when that method returns —
    // confirm whether audio capture actually continues afterwards; a field
    // (or a thread owning the stream) is likely needed.
}

impl SpeechRecognizer {
    /// Load the Vosk model from the `model` directory relative to the
    /// working directory.
    ///
    /// # Errors
    /// Returns an error when the model directory is missing or unreadable.
    pub fn new() -> Result<Self> {
        // vosk's `Model::new` returns `Option`, not `Result`, so map the
        // failure into an error instead of applying `?` to an `Option`.
        let model = Model::new("model")
            .ok_or_else(|| anyhow::anyhow!("failed to load Vosk model from ./model"))?;

        Ok(Self {
            model,
            audio_receiver: None,
            stop_sender: None,
            is_listening: Arc::new(Mutex::new(false)),
        })
    }

    /// Begin capturing microphone audio.
    ///
    /// The cpal `Stream` is `!Send` and must stay alive for capture to
    /// continue (the previous version dropped it at the end of this method,
    /// which stopped capture immediately). A dedicated thread therefore
    /// builds and owns the stream, and drops it — ending capture — when
    /// `stop_listening` signals via the stop channel. Captured f32 buffers
    /// are forwarded over a channel and consumed by `get_text`.
    ///
    /// # Errors
    /// Returns an error when no input device exists or the stream cannot be
    /// built or started; in that case `is_listening` stays `false`.
    pub fn start_listening(&mut self) -> Result<()> {
        let (audio_sender, audio_receiver) = channel();
        let (stop_sender, stop_receiver) = channel::<()>();
        // One-shot channel used by the capture thread to report whether the
        // stream started, so setup errors surface synchronously here.
        let (ready_sender, ready_receiver) = channel::<std::result::Result<(), String>>();

        std::thread::spawn(move || {
            // Everything is built on this thread because the resulting
            // `Stream` cannot cross thread boundaries.
            let setup = (|| -> Result<cpal::Stream> {
                let host = cpal::default_host();
                let device = host
                    .default_input_device()
                    .ok_or_else(|| anyhow::anyhow!("No input device found"))?;
                let config = device.default_input_config()?;
                let stream = device.build_input_stream(
                    &config.into(),
                    move |data: &[f32], _: &cpal::InputCallbackInfo| {
                        // The receiver disappears when the recognizer is
                        // torn down; just stop forwarding in that case.
                        let _ = audio_sender.send(data.to_vec());
                    },
                    move |err| eprintln!("Error in audio stream: {}", err),
                    None,
                )?;
                stream.play()?;
                Ok(stream)
            })();

            match setup {
                Ok(stream) => {
                    let _ = ready_sender.send(Ok(()));
                    // Block until stop is requested (or the sender side is
                    // dropped); dropping `stream` afterwards ends capture.
                    let _ = stop_receiver.recv();
                    drop(stream);
                }
                Err(e) => {
                    let _ = ready_sender.send(Err(e.to_string()));
                }
            }
        });

        // Propagate stream-setup failures to the caller.
        ready_receiver
            .recv()
            .map_err(|_| anyhow::anyhow!("audio capture thread exited during setup"))?
            .map_err(|e| anyhow::anyhow!(e))?;

        // Only mark as listening once the stream is confirmed running, so a
        // failed start does not leave a stale `true` flag.
        *self.is_listening.lock().unwrap() = true;
        self.audio_receiver = Some(audio_receiver);
        self.stop_sender = Some(stop_sender);

        Ok(())
    }

    /// Stop the capture thread (which drops the stream) and mark the
    /// recognizer as idle. Safe to call when not listening.
    pub fn stop_listening(&mut self) {
        if let Some(sender) = self.stop_sender.take() {
            // Ignore send failure: the capture thread may already be gone.
            let _ = sender.send(());
        }
        *self.is_listening.lock().unwrap() = false;
    }

    /// Whether audio is currently being recorded.
    pub fn is_listening(&self) -> bool {
        *self.is_listening.lock().unwrap()
    }

    /// Take one pending audio buffer, if any, and attempt a one-shot
    /// transcription of it.
    ///
    /// NOTE(review): this keeps the original's simplified scheme — a fresh
    /// recognizer per buffer with no accumulation until silence, and an
    /// assumed 16 kHz sample rate that should instead come from the actual
    /// capture config. Adequate only for short utterances; confirm before
    /// shipping.
    ///
    /// # Errors
    /// Returns an error when a Vosk recognizer cannot be created.
    pub fn get_text(&self) -> Result<Option<String>> {
        let Some(receiver) = &self.audio_receiver else {
            return Ok(None);
        };
        let Ok(audio_data) = receiver.try_recv() else {
            return Ok(None);
        };

        // vosk consumes signed 16-bit PCM; cpal delivered f32 in [-1, 1],
        // so clamp and rescale each sample.
        let samples: Vec<i16> = audio_data
            .iter()
            .map(|&s| (s.clamp(-1.0, 1.0) * i16::MAX as f32) as i16)
            .collect();

        // `Recognizer::new` returns `Option` (not `Result`), and
        // `accept_waveform` requires `&mut self` — both were wrong before.
        let mut recognizer = Recognizer::new(&self.model, 16000.0)
            .ok_or_else(|| anyhow::anyhow!("failed to create Vosk recognizer"))?;
        recognizer.accept_waveform(&samples);

        if let Some(single) = recognizer.final_result().single() {
            if !single.text.is_empty() {
                return Ok(Some(single.text.to_string()));
            }
        }
        Ok(None)
    }
}
Loading