gemini_adk_fluent_rs/compose/
eval.rs

1//! E — Evaluation composition.
2//!
3//! Compose evaluation criteria with `|` for agent quality assessment.
4
5use std::sync::Arc;
6
/// A named scoring rule that compares an agent's output with an expected value.
///
/// Clones share one scoring closure through an [`Arc`]; the closure itself is never copied.
#[derive(Clone)]
pub struct ECriterion {
    name: &'static str,
    #[allow(clippy::type_complexity)]
    checker: Arc<dyn Fn(&str, &str) -> f64 + Send + Sync>,
}

impl ECriterion {
    /// Build a criterion called `name` whose score is computed by `scorer(output, expected)`.
    fn new(
        name: &'static str,
        scorer: impl Fn(&str, &str) -> f64 + Send + Sync + 'static,
    ) -> Self {
        Self {
            name,
            checker: Arc::new(scorer),
        }
    }

    /// The label this criterion was created with.
    pub fn name(&self) -> &str {
        self.name
    }

    /// Run the scoring closure on `output` and `expected` and return its result unchanged.
    ///
    /// Scores are expected to lie in 0.0–1.0, but the value is not clamped or validated here.
    pub fn score(&self, output: &str, expected: &str) -> f64 {
        let scorer = &*self.checker;
        scorer(output, expected)
    }
}
33
34impl std::fmt::Debug for ECriterion {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.debug_struct("ECriterion")
37            .field("name", &self.name)
38            .finish()
39    }
40}
41
42/// Compose two criteria with `|`.
43impl std::ops::BitOr for ECriterion {
44    type Output = EComposite;
45
46    fn bitor(self, rhs: ECriterion) -> Self::Output {
47        EComposite {
48            criteria: vec![self, rhs],
49        }
50    }
51}
52
53/// A composite of evaluation criteria.
54#[derive(Clone)]
55pub struct EComposite {
56    /// The list of criteria in this composite.
57    pub criteria: Vec<ECriterion>,
58}
59
60impl EComposite {
61    /// Score the output against expected, returning per-criterion scores.
62    pub fn score_all(&self, output: &str, expected: &str) -> Vec<(&str, f64)> {
63        self.criteria
64            .iter()
65            .map(|c| (c.name(), c.score(output, expected)))
66            .collect()
67    }
68
69    /// Number of criteria.
70    pub fn len(&self) -> usize {
71        self.criteria.len()
72    }
73
74    /// Whether empty.
75    pub fn is_empty(&self) -> bool {
76        self.criteria.is_empty()
77    }
78}
79
80impl std::ops::BitOr<ECriterion> for EComposite {
81    type Output = EComposite;
82
83    fn bitor(mut self, rhs: ECriterion) -> Self::Output {
84        self.criteria.push(rhs);
85        self
86    }
87}
88
/// A single evaluation case — prompt + expected output.
///
/// Two cases compare equal when both their prompt and their expected response are equal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct EvalCase {
    /// The prompt to send to the agent.
    pub prompt: String,
    /// The expected response (for comparison).
    pub expected: String,
}
97
98/// An evaluation suite builder.
99#[derive(Clone, Debug)]
100pub struct EvalSuite {
101    /// The cases in this suite.
102    pub cases: Vec<EvalCase>,
103    /// The criteria to apply to each case.
104    pub criteria_names: Vec<String>,
105}
106
107impl EvalSuite {
108    /// Add a test case to the suite.
109    pub fn case(mut self, prompt: impl Into<String>, expected: impl Into<String>) -> Self {
110        self.cases.push(EvalCase {
111            prompt: prompt.into(),
112            expected: expected.into(),
113        });
114        self
115    }
116
117    /// Set criteria names for this suite.
118    pub fn criteria(mut self, names: &[&str]) -> Self {
119        self.criteria_names = names.iter().map(|s| s.to_string()).collect();
120        self
121    }
122
123    /// Number of cases.
124    pub fn len(&self) -> usize {
125        self.cases.len()
126    }
127
128    /// Whether empty.
129    pub fn is_empty(&self) -> bool {
130        self.cases.is_empty()
131    }
132}
133
134/// The `E` namespace — static factory methods for evaluation criteria.
135pub struct E;
136
137impl E {
138    /// Create an evaluation suite.
139    pub fn suite() -> EvalSuite {
140        EvalSuite {
141            cases: Vec::new(),
142            criteria_names: Vec::new(),
143        }
144    }
145
146    /// Exact response match criterion.
147    pub fn response_match() -> ECriterion {
148        ECriterion::new("response_match", |output, expected| {
149            if output.trim() == expected.trim() {
150                1.0
151            } else {
152                0.0
153            }
154        })
155    }
156
157    /// Substring containment criterion — scores 1.0 if output contains expected.
158    pub fn contains_match() -> ECriterion {
159        ECriterion::new("contains_match", |output, expected| {
160            if output.contains(expected) {
161                1.0
162            } else {
163                0.0
164            }
165        })
166    }
167
168    /// Safety criterion — placeholder that always passes.
169    pub fn safety() -> ECriterion {
170        ECriterion::new("safety", |_output, _expected| 1.0)
171    }
172
173    /// Semantic match criterion — placeholder (requires LLM judge at runtime).
174    pub fn semantic_match() -> ECriterion {
175        ECriterion::new("semantic_match", |_output, _expected| 0.5)
176    }
177
178    /// Hallucination detection criterion — placeholder.
179    pub fn hallucination() -> ECriterion {
180        ECriterion::new("hallucination", |_output, _expected| 0.5)
181    }
182
183    /// Trajectory evaluation — placeholder for tool call sequence validation.
184    pub fn trajectory() -> ECriterion {
185        ECriterion::new("trajectory", |_output, _expected| 0.5)
186    }
187
188    /// Custom evaluation criterion from a scoring function.
189    pub fn custom(
190        name: &'static str,
191        f: impl Fn(&str, &str) -> f64 + Send + Sync + 'static,
192    ) -> ECriterion {
193        ECriterion::new(name, f)
194    }
195
196    /// Load eval cases from a file path.
197    ///
198    /// The file should contain one case per pair of consecutive lines:
199    /// odd lines are prompts, even lines are expected responses.
200    /// Lines starting with `#` are comments and blank lines are skipped.
201    pub fn from_file(path: &str) -> EvalSuite {
202        let content = std::fs::read_to_string(path).unwrap_or_default();
203        let lines: Vec<&str> = content
204            .lines()
205            .map(|l| l.trim())
206            .filter(|l| !l.is_empty() && !l.starts_with('#'))
207            .collect();
208
209        let mut cases = Vec::new();
210        let mut i = 0;
211        while i + 1 < lines.len() {
212            cases.push(EvalCase {
213                prompt: lines[i].to_string(),
214                expected: lines[i + 1].to_string(),
215            });
216            i += 2;
217        }
218
219        EvalSuite {
220            cases,
221            criteria_names: Vec::new(),
222        }
223    }
224
225    /// Create a persona-based evaluator for user simulation.
226    ///
227    /// The persona describes a simulated user with a given name and description,
228    /// which can be used to generate realistic test interactions.
229    pub fn persona(name: &'static str, description: &'static str) -> ECriterion {
230        ECriterion::new(name, move |output, _expected| {
231            // Persona evaluator checks that the agent's output is appropriate
232            // for the described persona. Placeholder scoring: returns 0.5
233            // indicating neutral — real implementation requires an LLM judge
234            // parameterized with the persona description.
235            let _ = description;
236            if output.is_empty() {
237                0.0
238            } else {
239                0.5
240            }
241        })
242    }
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    // Built-in criteria.

    #[test]
    fn response_match_exact() {
        let c = E::response_match();
        assert_eq!(c.score("hello", "hello"), 1.0);
        assert_eq!(c.score("hello", "world"), 0.0);
    }

    #[test]
    fn contains_match_works() {
        let c = E::contains_match();
        assert_eq!(c.score("hello world", "world"), 1.0);
        assert_eq!(c.score("hello", "world"), 0.0);
    }

    // Composition with `|`.

    #[test]
    fn compose_with_bitor() {
        let composite = E::response_match() | E::safety() | E::semantic_match();
        assert_eq!(composite.len(), 3);
    }

    #[test]
    fn suite_builder() {
        let suite = E::suite()
            .case("What is 2+2?", "4")
            .case("Hello", "Hi")
            .criteria(&["response_match", "safety"]);
        assert_eq!(suite.len(), 2);
        assert_eq!(suite.criteria_names.len(), 2);
    }

    #[test]
    fn score_all_returns_results() {
        let composite = E::response_match() | E::contains_match();
        let scores = composite.score_all("hello world", "hello");
        assert_eq!(scores.len(), 2);
        assert_eq!(scores[0].0, "response_match");
        assert_eq!(scores[1].0, "contains_match");
    }

    // File loading.

    #[test]
    fn from_file_missing() {
        let suite = E::from_file("/nonexistent/path.txt");
        assert!(suite.is_empty());
    }

    #[test]
    fn from_file_parses_cases() {
        // Per-process file name so concurrent test runs that share the temp directory
        // cannot overwrite or delete each other's fixture.
        let file_name = format!("eval_test_cases_{}.txt", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        std::fs::write(&path, "# comment\nWhat is 2+2?\n4\n\nHello\nHi\n").unwrap();
        let suite = E::from_file(path.to_str().unwrap());
        // Remove the fixture before asserting so a failed assertion does not leave it behind.
        let _ = std::fs::remove_file(&path);
        assert_eq!(suite.len(), 2);
        assert_eq!(suite.cases[0].prompt, "What is 2+2?");
        assert_eq!(suite.cases[0].expected, "4");
        assert_eq!(suite.cases[1].prompt, "Hello");
        assert_eq!(suite.cases[1].expected, "Hi");
    }

    // Parameterized criteria.

    #[test]
    fn persona_criterion() {
        let c = E::persona(
            "impatient_user",
            "A user who is in a hurry and wants quick answers",
        );
        assert_eq!(c.name(), "impatient_user");
        assert_eq!(c.score("Here is your answer", ""), 0.5);
        assert_eq!(c.score("", ""), 0.0);
    }

    #[test]
    fn custom_criterion() {
        let c = E::custom(
            "length",
            |output, _expected| {
                if output.len() > 10 {
                    1.0
                } else {
                    0.0
                }
            },
        );
        assert_eq!(c.score("short", ""), 0.0);
        assert_eq!(c.score("a long enough output", ""), 1.0);
    }
}