gemini_adk_rs/live/
input_vad.rs

1//! Backend input VAD for browser microphone PCM.
2
3use std::time::Instant;
4
5use gemini_genai_rs::prelude::{VadConfig, VadEvent, VoiceActivityDetector};
6use gemini_genai_rs::vad::VadState;
7use serde::Serialize;
8
9/// Snapshot of backend VAD state for devtools and diagnostics.
10#[derive(Debug, Clone, Serialize)]
11pub struct BackendVadSnapshot {
12    /// Active detector backend name.
13    pub backend: &'static str,
14    /// Input sample rate in Hz.
15    pub sample_rate: u32,
16    /// Detector frame duration in milliseconds.
17    pub frame_duration_ms: u32,
18    /// Detector frame size in samples.
19    pub frame_size: usize,
20    /// Current detector state.
21    pub state: &'static str,
22    /// Whether the detector is currently in a speech state.
23    pub speaking: bool,
24    /// Last normalized speech probability or binary decision.
25    pub last_probability: Option<f32>,
26    /// Number of complete frames processed by the backend detector.
27    pub frames_processed: u64,
28    /// Milliseconds since the last speech start/end transition.
29    pub last_transition_ms_ago: Option<u64>,
30}
31
32/// Incremental VAD over arbitrary PCM16 byte chunks.
33pub struct BackendInputVad {
34    detector: VoiceActivityDetector,
35    config: VadConfig,
36    pending_samples: Vec<i16>,
37    frames_processed: u64,
38    last_transition_at: Option<Instant>,
39}
40
41impl BackendInputVad {
42    /// Create a backend input VAD with explicit detector configuration.
43    pub fn new(config: VadConfig) -> Self {
44        Self {
45            detector: VoiceActivityDetector::new(config.clone()),
46            config,
47            pending_samples: Vec::new(),
48            frames_processed: 0,
49            last_transition_at: None,
50        }
51    }
52
53    /// Process arbitrary little-endian PCM16 bytes and return speech edge events.
54    pub fn process_pcm_bytes(&mut self, bytes: &[u8]) -> Vec<VadEvent> {
55        self.pending_samples
56            .extend(bytes.chunks_exact(2).map(|pair| {
57                let raw = [pair[0], pair[1]];
58                i16::from_le_bytes(raw)
59            }));
60
61        let frame_size = self.config.frame_size();
62        if frame_size == 0 {
63            return Vec::new();
64        }
65
66        let mut events = Vec::new();
67        while self.pending_samples.len() >= frame_size {
68            let frame: Vec<i16> = self.pending_samples.drain(..frame_size).collect();
69            self.frames_processed += 1;
70            if let Some(event) = self.detector.process_frame(&frame) {
71                self.last_transition_at = Some(Instant::now());
72                events.push(event);
73            }
74        }
75        events
76    }
77
78    /// Return a diagnostics snapshot suitable for UI/devtools display.
79    pub fn snapshot(&self) -> BackendVadSnapshot {
80        BackendVadSnapshot {
81            backend: self.detector.backend_name(),
82            sample_rate: self.config.sample_rate,
83            frame_duration_ms: self.config.frame_duration_ms,
84            frame_size: self.config.frame_size(),
85            state: state_name(self.detector.state()),
86            speaking: self.detector.is_speaking(),
87            last_probability: self.detector.last_probability(),
88            frames_processed: self.frames_processed,
89            last_transition_ms_ago: self
90                .last_transition_at
91                .map(|instant| instant.elapsed().as_millis() as u64),
92        }
93    }
94
95    #[cfg(test)]
96    /// Whether the detector is currently speaking.
97    pub fn is_speaking(&self) -> bool {
98        self.detector.is_speaking()
99    }
100}
101
102impl Default for BackendInputVad {
103    fn default() -> Self {
104        Self::new(VadConfig {
105            sample_rate: 16000,
106            frame_duration_ms: 30,
107            min_speech_frames: 2,
108            hangover_frames: 8,
109            ..VadConfig::default()
110        })
111    }
112}
113
114fn state_name(state: VadState) -> &'static str {
115    match state {
116        VadState::Silence => "silence",
117        VadState::PendingSpeech => "pending_speech",
118        VadState::Speech => "speech",
119        VadState::Hangover => "hangover",
120    }
121}
122
#[cfg(test)]
mod tests {
    use super::*;

    /// Build `len` samples of a period-4 square wave: two samples at
    /// `+amplitude` followed by two at `-amplitude`.
    fn speech_frame(len: usize, amplitude: i16) -> Vec<i16> {
        (0..len)
            .map(|index| match index % 4 {
                0 | 1 => amplitude,
                _ => -amplitude,
            })
            .collect()
    }

    /// Encode samples as little-endian PCM16 bytes.
    fn bytes(samples: &[i16]) -> Vec<u8> {
        let mut encoded = Vec::with_capacity(samples.len() * 2);
        for sample in samples {
            encoded.extend_from_slice(&sample.to_le_bytes());
        }
        encoded
    }

    #[test]
    fn buffers_arbitrary_chunks_into_vad_frames() {
        let config = VadConfig {
            sample_rate: 16000,
            frame_duration_ms: 20,
            min_speech_frames: 2,
            hangover_frames: 2,
            speech_zcr_range: (0.01, 0.9),
            ..VadConfig::default()
        };
        let mut vad = BackendInputVad::new(config);

        let speech = speech_frame(640, 10000);
        let (head, tail) = speech.split_at(100);

        // A short first chunk must not produce any events on its own.
        let early_events = vad.process_pcm_bytes(&bytes(head));
        assert!(early_events.is_empty());

        // Supplying the remainder must trigger the speech-start edge.
        let events = vad.process_pcm_bytes(&bytes(tail));
        assert!(events.contains(&VadEvent::SpeechStart));
        assert!(vad.is_speaking());
    }
}
159}