diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index fb17fe80eb..83cb0e446c 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,8 @@
 
 from __future__ import annotations
 
+from collections import Counter
+from collections import namedtuple
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -27,6 +29,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .text_utils import normalize_text
 
 
 class RougeEvaluator(Evaluator):
@@ -46,7 +49,7 @@ def evaluate_invocations(
       conversation_scenario: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if expected_invocations is None:
-      raise ValueError("expected_invocations is required for this metric.")
+      raise ValueError('expected_invocations is required for this metric.')
     del conversation_scenario  # not used by this metric.
 
     total_score = 0.0
@@ -83,15 +86,18 @@
 
 def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
   if content and content.parts:
-    return "\n".join([part.text for part in content.parts if part.text])
+    return '\n'.join([part.text for part in content.parts if part.text])
 
-  return ""
+  return ''
 
 
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -108,12 +114,56 @@
     reference: The ground-truth text to compare against.
 
   Returns:
-    A dictionary containing the ROUGE-1 precision, recall, and f-measure.
+    A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  # Normalize both texts before scoring to handle Unicode variations
+  normalized_candidate = normalize_text(candidate)
+  normalized_reference = normalize_text(reference)
+
+  # Check if the text contains spaces (word-separated languages)
+  has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate
+
+  if has_spaces:
+    # Use standard word-level ROUGE for space-separated languages
+    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
+    scores = scorer.score(normalized_reference, normalized_candidate)
+    return scores['rouge1']
+  else:
+    # For non-space-separated languages, use character-level comparison
+    return _calculate_character_level_rouge(
+        normalized_candidate, normalized_reference
+    )
+
+
+def _calculate_character_level_rouge(candidate: str, reference: str):
+  """Calculates character-level ROUGE-1 score for non-space-separated text.
+
+  Args:
+    candidate: The candidate text (already normalized).
+    reference: The reference text (already normalized).
+
+  Returns:
+    A Score namedtuple with precision, recall, and fmeasure.
+ """ + + if not reference or not candidate: + return Score(precision=0.0, recall=0.0, fmeasure=0.0) + + # Count character occurrences + ref_chars = Counter(reference) + cand_chars = Counter(candidate) + + # Calculate overlapping characters + overlap = sum((ref_chars & cand_chars).values()) + + # Calculate precision and recall + precision = overlap / len(candidate) if len(candidate) > 0 else 0.0 + recall = overlap / len(reference) if len(reference) > 0 else 0.0 - # The score method returns a dictionary where keys are the ROUGE types - # and values are Score objects (tuples) with precision, recall, and fmeasure. - scores = scorer.score(reference, candidate) + # Calculate F-measure + if precision + recall > 0: + fmeasure = 2 * (precision * recall) / (precision + recall) + else: + fmeasure = 0.0 - return scores["rouge1"] + return Score(precision=precision, recall=recall, fmeasure=fmeasure) diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py new file mode 100644 index 0000000000..46ef562256 --- /dev/null +++ b/src/google/adk/evaluation/text_utils.py @@ -0,0 +1,34 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Text utilities for evaluation.""" + +from __future__ import annotations + +import unicodedata + + +def normalize_text(text: str) -> str: + """Normalize text using NFC normalization and strip whitespace. + + This ensures consistent text comparison across different Unicode + representations, which is particularly important for non-English text. + + Args: + text: The text to normalize. + + Returns: + The normalized text. + """ + return unicodedata.normalize("NFC", text).strip() diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py new file mode 100644 index 0000000000..9cdd0efa45 --- /dev/null +++ b/tests/unittests/evaluation/test_non_english_eval.py @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for final_response_match_v1.""" + +from __future__ import annotations + +import pytest + + +def test_normalization_applied_in_rouge(): + """Normalization should make identical Thai strings match.""" + from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores + from google.adk.evaluation.text_utils import normalize_text + + reference = "สวัสดี" + candidate = "สวัสดี" + + # Verify normalization directly + assert normalize_text(reference) == normalize_text(candidate) + + # Verify ROUGE score reflects a perfect match + score = _calculate_rouge_1_scores(candidate, reference) + + assert score.precision == pytest.approx(1.0) + assert score.recall == pytest.approx(1.0) + assert score.fmeasure == pytest.approx(1.0)