gemini_adk_rs/live/
input_vad.rs1use std::time::Instant;
4
5use gemini_genai_rs::prelude::{VadConfig, VadEvent, VoiceActivityDetector};
6use gemini_genai_rs::vad::VadState;
7use serde::Serialize;
8
9#[derive(Debug, Clone, Serialize)]
11pub struct BackendVadSnapshot {
12 pub backend: &'static str,
14 pub sample_rate: u32,
16 pub frame_duration_ms: u32,
18 pub frame_size: usize,
20 pub state: &'static str,
22 pub speaking: bool,
24 pub last_probability: Option<f32>,
26 pub frames_processed: u64,
28 pub last_transition_ms_ago: Option<u64>,
30}
31
32pub struct BackendInputVad {
34 detector: VoiceActivityDetector,
35 config: VadConfig,
36 pending_samples: Vec<i16>,
37 frames_processed: u64,
38 last_transition_at: Option<Instant>,
39}
40
41impl BackendInputVad {
42 pub fn new(config: VadConfig) -> Self {
44 Self {
45 detector: VoiceActivityDetector::new(config.clone()),
46 config,
47 pending_samples: Vec::new(),
48 frames_processed: 0,
49 last_transition_at: None,
50 }
51 }
52
53 pub fn process_pcm_bytes(&mut self, bytes: &[u8]) -> Vec<VadEvent> {
55 self.pending_samples
56 .extend(bytes.chunks_exact(2).map(|pair| {
57 let raw = [pair[0], pair[1]];
58 i16::from_le_bytes(raw)
59 }));
60
61 let frame_size = self.config.frame_size();
62 if frame_size == 0 {
63 return Vec::new();
64 }
65
66 let mut events = Vec::new();
67 while self.pending_samples.len() >= frame_size {
68 let frame: Vec<i16> = self.pending_samples.drain(..frame_size).collect();
69 self.frames_processed += 1;
70 if let Some(event) = self.detector.process_frame(&frame) {
71 self.last_transition_at = Some(Instant::now());
72 events.push(event);
73 }
74 }
75 events
76 }
77
78 pub fn snapshot(&self) -> BackendVadSnapshot {
80 BackendVadSnapshot {
81 backend: self.detector.backend_name(),
82 sample_rate: self.config.sample_rate,
83 frame_duration_ms: self.config.frame_duration_ms,
84 frame_size: self.config.frame_size(),
85 state: state_name(self.detector.state()),
86 speaking: self.detector.is_speaking(),
87 last_probability: self.detector.last_probability(),
88 frames_processed: self.frames_processed,
89 last_transition_ms_ago: self
90 .last_transition_at
91 .map(|instant| instant.elapsed().as_millis() as u64),
92 }
93 }
94
95 #[cfg(test)]
96 pub fn is_speaking(&self) -> bool {
98 self.detector.is_speaking()
99 }
100}
101
102impl Default for BackendInputVad {
103 fn default() -> Self {
104 Self::new(VadConfig {
105 sample_rate: 16000,
106 frame_duration_ms: 30,
107 min_speech_frames: 2,
108 hangover_frames: 8,
109 ..VadConfig::default()
110 })
111 }
112}
113
114fn state_name(state: VadState) -> &'static str {
115 match state {
116 VadState::Silence => "silence",
117 VadState::PendingSpeech => "pending_speech",
118 VadState::Speech => "speech",
119 VadState::Hangover => "hangover",
120 }
121}
122
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `len` samples of a square wave that holds `amplitude` for two
    /// samples and `-amplitude` for the next two.
    fn speech_frame(len: usize, amplitude: i16) -> Vec<i16> {
        let mut samples = Vec::with_capacity(len);
        for index in 0..len {
            let positive_half = index % 4 < 2;
            samples.push(if positive_half { amplitude } else { -amplitude });
        }
        samples
    }

    /// Encodes samples as little-endian 16-bit PCM bytes.
    fn bytes(samples: &[i16]) -> Vec<u8> {
        let mut encoded = Vec::with_capacity(samples.len() * 2);
        for sample in samples {
            encoded.extend_from_slice(&sample.to_le_bytes());
        }
        encoded
    }

    #[test]
    fn buffers_arbitrary_chunks_into_vad_frames() {
        let config = VadConfig {
            sample_rate: 16000,
            frame_duration_ms: 20,
            min_speech_frames: 2,
            hangover_frames: 2,
            speech_zcr_range: (0.01, 0.9),
            ..VadConfig::default()
        };
        let mut vad = BackendInputVad::new(config);

        let speech = speech_frame(640, 10000);
        let (head, tail) = speech.split_at(100);

        // The first 100 samples are expected to be buffered without producing events.
        let early_events = vad.process_pcm_bytes(&bytes(head));
        assert!(early_events.is_empty());

        // Delivering the remaining samples should let the detector report speech.
        let events = vad.process_pcm_bytes(&bytes(tail));
        assert!(events.contains(&VadEvent::SpeechStart));
        assert!(vad.is_speaking());
    }
}