gemini_genai_rs/flow/
barge_in.rs

1//! Barge-in (interruption) detection and handling.
2//!
3//! Coordinates client-side VAD with jitter buffer flush and server signaling
4//! to achieve atomic barge-in with minimal latency.
5//!
//! When `tentative` mode is enabled (the default), the detector ducks first
//! and then resolves the tentative barge-in in one of two ways (steps 2 and 3
//! are alternatives, not consecutive stages):
//!
//! 1. **Duck** — On the first speech frame during `ModelSpeaking`, reduce
//!    playback volume instead of immediately silencing. This avoids jarring
//!    silence from false-positive VAD triggers (e.g., background noise).
//! 2. **Interrupt** — Once speech has been sustained for `min_speech_frames`,
//!    flush the jitter buffer and signal the server.
//! 3. **Restore** — If speech instead stops before reaching the confirmation
//!    threshold, restore the original playback volume (false positive resolved).
16
17use crate::buffer::AudioJitterBuffer;
18use crate::session::{SessionCommand, SessionPhase};
19
/// Tunable parameters for barge-in (user-interrupts-model) detection.
///
/// Every field is public, so callers can override individual values with
/// struct-update syntax on top of `BargeInConfig::default()`.
#[derive(Debug, Clone, PartialEq)]
pub struct BargeInConfig {
    /// Master switch. When `false`, `BargeInDetector::check` always returns
    /// `BargeInAction::None`.
    pub enabled: bool,
    /// Minimum energy (dBFS above noise floor) to trigger barge-in.
    ///
    /// NOTE(review): the detector in this module never reads this field;
    /// presumably it is consumed by the VAD stage upstream — confirm.
    pub energy_threshold_db: f64,
    /// Number of consecutive speech frames required before an interrupt is
    /// issued. Values of 0 or 1 interrupt on the very first speech frame.
    pub min_speech_frames: u32,
    /// Enable tentative barge-in: duck the playback volume on the first speech
    /// frame and only flush once speech is confirmed.
    pub tentative: bool,
    /// Volume multiplier reported in `BargeInAction::Duck` during the duck
    /// phase (0.0-1.0).
    pub duck_volume: f32,
}
34
35impl Default for BargeInConfig {
36    fn default() -> Self {
37        Self {
38            enabled: true,
39            energy_threshold_db: 15.0,
40            min_speech_frames: 2,
41            tentative: true,
42            duck_volume: 0.3,
43        }
44    }
45}
46
/// What the caller should do after feeding one VAD frame to the detector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum BargeInAction {
    /// Nothing to do — keep playing as before.
    None,
    /// Tentative barge-in: lower the playback volume.
    /// The payload is the volume multiplier (0.0 = silent, 1.0 = full).
    Duck(f32),
    /// Confirmed barge-in: flush the buffer and signal the server.
    Interrupt,
    /// The tentative barge-in was a false positive: bring the volume back up.
    Restore,
}
60
/// Private state machine backing tentative (duck-first) barge-in.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum DetectorState {
    /// No duck is outstanding.
    Idle,
    /// Playback has been ducked and we are waiting for either confirmation or
    /// silence. `frames` is the number of consecutive speech frames seen so far.
    Ducked { frames: u32 },
}
69
70/// Barge-in detector — checks whether user speech should interrupt model output.
71pub struct BargeInDetector {
72    config: BargeInConfig,
73    /// Count of consecutive speech frames during model output.
74    speech_frame_count: u32,
75    /// Internal state for tentative barge-in.
76    state: DetectorState,
77}
78
79impl BargeInDetector {
80    /// Create a new barge-in detector.
81    pub fn new(config: BargeInConfig) -> Self {
82        Self {
83            config,
84            speech_frame_count: 0,
85            state: DetectorState::Idle,
86        }
87    }
88
89    /// Check whether a VAD speech detection during model output should trigger barge-in.
90    ///
91    /// Call this when the VAD detects speech while the session is in `ModelSpeaking` phase.
92    ///
93    /// When tentative mode is enabled, the sequence is:
94    /// - First speech frame → `Duck(volume)` (reduce playback volume)
95    /// - Sustained speech reaching `min_speech_frames` → `Interrupt` (flush and signal)
96    /// - Silence before confirmation → `Restore` (false positive)
97    ///
98    /// When tentative mode is disabled, the legacy behavior applies:
99    /// - `None` until `min_speech_frames` consecutive frames → `Interrupt`
100    pub fn check(&mut self, current_phase: SessionPhase, vad_is_speaking: bool) -> BargeInAction {
101        if !self.config.enabled {
102            return BargeInAction::None;
103        }
104
105        // Only trigger barge-in during model output
106        if current_phase != SessionPhase::ModelSpeaking {
107            let action = self.restore_if_ducked();
108            self.speech_frame_count = 0;
109            self.state = DetectorState::Idle;
110            return action;
111        }
112
113        if self.config.tentative {
114            self.check_tentative(vad_is_speaking)
115        } else {
116            self.check_legacy(vad_is_speaking)
117        }
118    }
119
120    /// Legacy barge-in check: None until min_speech_frames consecutive frames → Interrupt.
121    fn check_legacy(&mut self, vad_is_speaking: bool) -> BargeInAction {
122        if vad_is_speaking {
123            self.speech_frame_count += 1;
124            if self.speech_frame_count >= self.config.min_speech_frames {
125                self.speech_frame_count = 0;
126                return BargeInAction::Interrupt;
127            }
128        } else {
129            self.speech_frame_count = 0;
130        }
131
132        BargeInAction::None
133    }
134
135    /// Tentative barge-in check: Duck → Interrupt or Restore.
136    fn check_tentative(&mut self, vad_is_speaking: bool) -> BargeInAction {
137        match self.state {
138            DetectorState::Idle => {
139                if vad_is_speaking {
140                    // First speech frame — duck audio and start counting.
141                    // We count this frame as frame 1.
142                    self.state = DetectorState::Ducked { frames: 1 };
143                    // Check if min_speech_frames == 1 (immediate interrupt).
144                    if self.config.min_speech_frames <= 1 {
145                        self.state = DetectorState::Idle;
146                        return BargeInAction::Interrupt;
147                    }
148                    BargeInAction::Duck(self.config.duck_volume)
149                } else {
150                    BargeInAction::None
151                }
152            }
153            DetectorState::Ducked { frames } => {
154                if vad_is_speaking {
155                    let new_frames = frames + 1;
156                    if new_frames >= self.config.min_speech_frames {
157                        // Confirmed speech — full interrupt.
158                        self.state = DetectorState::Idle;
159                        BargeInAction::Interrupt
160                    } else {
161                        self.state = DetectorState::Ducked { frames: new_frames };
162                        // Already ducked, no new action needed.
163                        BargeInAction::None
164                    }
165                } else {
166                    // Silence while ducked — false positive, restore volume.
167                    self.state = DetectorState::Idle;
168                    BargeInAction::Restore
169                }
170            }
171        }
172    }
173
174    /// If currently ducked, return Restore; otherwise None.
175    fn restore_if_ducked(&self) -> BargeInAction {
176        match self.state {
177            DetectorState::Ducked { .. } => BargeInAction::Restore,
178            DetectorState::Idle => BargeInAction::None,
179        }
180    }
181
182    /// Reset the detector state.
183    pub fn reset(&mut self) {
184        self.speech_frame_count = 0;
185        self.state = DetectorState::Idle;
186    }
187
188    /// Execute the barge-in sequence:
189    /// 1. Flush the jitter buffer (instant silence)
190    /// 2. Return the command to signal activity start
191    ///
192    /// The caller is responsible for sending the command and transitioning the FSM.
193    pub fn execute_barge_in(jitter_buffer: &mut AudioJitterBuffer) -> SessionCommand {
194        // Step 1: Instant silence — flush the playback buffer
195        jitter_buffer.flush();
196
197        // Step 2: Signal activity start to the server
198        SessionCommand::ActivityStart
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Non-tentative detector with the given confirmation threshold.
    fn legacy_detector(min_speech_frames: u32) -> BargeInDetector {
        BargeInDetector::new(BargeInConfig {
            min_speech_frames,
            tentative: false,
            ..Default::default()
        })
    }

    /// Tentative detector with the given confirmation threshold and duck volume.
    fn tentative_detector(min_speech_frames: u32, duck_volume: f32) -> BargeInDetector {
        BargeInDetector::new(BargeInConfig {
            min_speech_frames,
            tentative: true,
            duck_volume,
            ..Default::default()
        })
    }

    /// Feed one speech frame while the model is speaking.
    fn speech(detector: &mut BargeInDetector) -> BargeInAction {
        detector.check(SessionPhase::ModelSpeaking, true)
    }

    /// Feed one silent frame while the model is speaking.
    fn silence(detector: &mut BargeInDetector) -> BargeInAction {
        detector.check(SessionPhase::ModelSpeaking, false)
    }

    #[test]
    fn no_barge_in_when_disabled() {
        let mut detector = BargeInDetector::new(BargeInConfig {
            enabled: false,
            ..Default::default()
        });

        assert_eq!(speech(&mut detector), BargeInAction::None);
    }

    #[test]
    fn no_barge_in_when_not_model_speaking() {
        let mut detector = BargeInDetector::new(BargeInConfig::default());

        assert_eq!(
            detector.check(SessionPhase::Active, true),
            BargeInAction::None
        );
    }

    #[test]
    fn barge_in_after_min_frames() {
        let mut detector = legacy_detector(3);

        assert_eq!(speech(&mut detector), BargeInAction::None);
        assert_eq!(speech(&mut detector), BargeInAction::None);
        assert_eq!(speech(&mut detector), BargeInAction::Interrupt);
    }

    #[test]
    fn barge_in_resets_on_silence() {
        let mut detector = legacy_detector(3);

        speech(&mut detector);
        speech(&mut detector);
        // A silent frame wipes the accumulated count...
        silence(&mut detector);
        // ...so the next speech frame starts from scratch.
        assert_eq!(speech(&mut detector), BargeInAction::None);
    }

    #[test]
    fn tentative_barge_in_duck_then_interrupt() {
        let mut detector = tentative_detector(3, 0.3);

        // Frame 1: duck.
        assert_eq!(speech(&mut detector), BargeInAction::Duck(0.3));
        // Frame 2: already ducked, nothing new.
        assert_eq!(speech(&mut detector), BargeInAction::None);
        // Frame 3: threshold reached, interrupt.
        assert_eq!(speech(&mut detector), BargeInAction::Interrupt);
    }

    #[test]
    fn tentative_barge_in_duck_then_restore() {
        let mut detector = tentative_detector(3, 0.3);

        // Frame 1: duck.
        assert_eq!(speech(&mut detector), BargeInAction::Duck(0.3));
        // Silence before confirmation: restore.
        assert_eq!(silence(&mut detector), BargeInAction::Restore);
        // Idle again: further silence is a no-op.
        assert_eq!(silence(&mut detector), BargeInAction::None);
    }

    #[test]
    fn tentative_disabled_skips_duck() {
        let mut detector = BargeInDetector::new(BargeInConfig {
            min_speech_frames: 3,
            tentative: false,
            duck_volume: 0.3,
            ..Default::default()
        });

        // Legacy mode never ducks: early frames yield None...
        assert_eq!(speech(&mut detector), BargeInAction::None);
        assert_eq!(speech(&mut detector), BargeInAction::None);
        // ...and the threshold frame yields Interrupt directly.
        assert_eq!(speech(&mut detector), BargeInAction::Interrupt);
    }

    #[test]
    fn duck_volume_in_action() {
        let mut detector = tentative_detector(5, 0.5);

        assert_eq!(speech(&mut detector), BargeInAction::Duck(0.5));
    }

    #[test]
    fn tentative_restores_on_phase_change() {
        let mut detector = tentative_detector(5, 0.3);

        // Enter the ducked state.
        assert_eq!(speech(&mut detector), BargeInAction::Duck(0.3));
        // Leaving ModelSpeaking while ducked must undo the duck.
        assert_eq!(
            detector.check(SessionPhase::Active, true),
            BargeInAction::Restore
        );
    }

    #[test]
    fn tentative_immediate_interrupt_when_min_frames_one() {
        let mut detector = tentative_detector(1, 0.3);

        // A threshold of one frame bypasses the duck even in tentative mode.
        assert_eq!(speech(&mut detector), BargeInAction::Interrupt);
    }
}