From 0f17a06d219ff38c9579381166d5f74ae5b87b9f Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Tue, 13 Jan 2026 00:15:54 +0100
Subject: [PATCH 1/3] Fixes #3111: ROUGE-1 scoring for non-space-separated
 text

- Created text_utils.py with normalize_text() for NFC Unicode normalization
- Modified _calculate_rouge_1_scores() to normalize texts before comparison
- Added automatic detection: word-level ROUGE for space-separated text,
  character-level scoring for non-space-separated text
- Implemented _calculate_character_level_rouge() using Counter to compute
  precision/recall/F-measure from character overlap
- Created test_non_english_eval.py for Thai text evaluation

ROUGE-1 previously returned 0.0 for non-space-separated text such as Thai.
It now provides proportional scores based on character frequency overlap.
---
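Reviewer note (illustrative only, not part of the commit): the character-level
scoring described above reduces to a multiset overlap of code points. A minimal
sketch of the arithmetic, using hypothetical Thai strings:

    from collections import Counter

    reference = "สวัสดีครับ"  # hypothetical reference, 10 code points
    candidate = "สวัสดี"      # hypothetical candidate, 6 code points

    # Each character counts at most min(reference count, candidate count).
    overlap = sum((Counter(reference) & Counter(candidate)).values())  # 6

    precision = overlap / len(candidate)                      # 6 / 6  = 1.0
    recall = overlap / len(reference)                         # 6 / 10 = 0.6
    fmeasure = 2 * precision * recall / (precision + recall)  # 0.75

Because this is a bag-of-characters comparison, word order does not affect the
score, which matches ROUGE-1's unigram design at character granularity.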
 .../adk/evaluation/final_response_match_v1.py | 60 ++++++++++++++++---
 src/google/adk/evaluation/text_utils.py       | 34 +++++++++++
 .../evaluation/test_non_english_eval.py       | 40 +++++++++++++
 3 files changed, 127 insertions(+), 7 deletions(-)
 create mode 100644 src/google/adk/evaluation/text_utils.py
 create mode 100644 tests/unittests/evaluation/test_non_english_eval.py

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index fb17fe80eb..3fd136fe39 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -27,6 +27,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
 
 
 class RougeEvaluator(Evaluator):
@@ -110,10 +111,55 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
     A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
-
-  # The score method returns a dictionary where keys are the ROUGE types
-  # and values are Score objects (tuples) with precision, recall, and fmeasure.
-  scores = scorer.score(reference, candidate)
-
-  return scores["rouge1"]
+  # Normalize both texts before scoring to handle Unicode variations
+  normalized_candidate = normalize_text(candidate)
+  normalized_reference = normalize_text(reference)
+
+  # Check if the text contains spaces (word-separated languages)
+  has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate
+
+  if has_spaces:
+    # Use standard word-level ROUGE for space-separated languages
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+    scores = scorer.score(normalized_reference, normalized_candidate)
+    return scores["rouge1"]
+  else:
+    # For non-space-separated languages, use character-level comparison
+    return _calculate_character_level_rouge(normalized_candidate, normalized_reference)
+
+
+def _calculate_character_level_rouge(candidate: str, reference: str):
+  """Calculates character-level ROUGE-1 score for non-space-separated text.
+
+  Args:
+    candidate: The candidate text (already normalized).
+    reference: The reference text (already normalized).
+
+  Returns:
+    A Score namedtuple with precision, recall, and fmeasure.
+  """
+  from collections import Counter, namedtuple
+
+  if not reference or not candidate:
+    Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+    return Score(precision=0.0, recall=0.0, fmeasure=0.0)
+
+  # Count character occurrences
+  ref_chars = Counter(reference)
+  cand_chars = Counter(candidate)
+
+  # Calculate overlapping characters
+  overlap = sum((ref_chars & cand_chars).values())
+
+  # Calculate precision and recall
+  precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
+  recall = overlap / len(reference) if len(reference) > 0 else 0.0
+
+  # Calculate F-measure
+  if precision + recall > 0:
+    fmeasure = 2 * (precision * recall) / (precision + recall)
+  else:
+    fmeasure = 0.0
+
+  Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+  return Score(precision=precision, recall=recall, fmeasure=fmeasure)
\ No newline at end of file
diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py
new file mode 100644
index 0000000000..c4ff75c675
--- /dev/null
+++ b/src/google/adk/evaluation/text_utils.py
@@ -0,0 +1,34 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text utilities for evaluation."""
+
+from __future__ import annotations
+
+import unicodedata
+
+
+def normalize_text(text: str) -> str:
+  """Normalize text using NFC normalization and strip whitespace.
+
+  This ensures consistent text comparison across different Unicode
+  representations, which is particularly important for non-English text.
+
+  Args:
+    text: The text to normalize.
+
+  Returns:
+    The normalized text.
+  """
+  return unicodedata.normalize("NFC", text).strip()
\ No newline at end of file
diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py
new file mode 100644
index 0000000000..23a8cd02ae
--- /dev/null
+++ b/tests/unittests/evaluation/test_non_english_eval.py
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for final_response_match_v1."""
+
+from __future__ import annotations
+
+
+def test_debug_normalization():
+  """Debug test to see if normalization is being applied."""
+  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
+  from google.adk.evaluation.text_utils import normalize_text
+
+  reference = "สวัสดี"
+  candidate = "สวัสดี"
+
+  # Check normalization directly
+  norm_ref = normalize_text(reference)
+  norm_cand = normalize_text(candidate)
+
+  print(f"Reference: {repr(reference)}")
+  print(f"Candidate: {repr(candidate)}")
+  print(f"Normalized reference: {repr(norm_ref)}")
+  print(f"Normalized candidate: {repr(norm_cand)}")
+  print(f"Are they equal after normalization? {norm_ref == norm_cand}")
+
+  # Now test the actual function
+  score = _calculate_rouge_1_scores(candidate, reference)
+  print(f"ROUGE score: {score}")
\ No newline at end of file
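Note (illustrative, not part of any commit): the NFC step in normalize_text()
matters because the same visible string can be encoded with either composed or
decomposed code points. A minimal standalone demonstration with Latin 'é' (the
same principle applies to other scripts):

    import unicodedata

    composed = "\u00e9"     # 'é' as a single code point
    decomposed = "e\u0301"  # 'e' followed by a combining acute accent

    assert composed != decomposed                                # raw strings differ
    assert unicodedata.normalize("NFC", decomposed) == composed  # NFC unifies them

Without this normalization, two visually identical responses could compare
unequal and score below a perfect match.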
From 249e4994b28a51c75121f6ab5154d4270a9b6885 Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Tue, 13 Jan 2026 12:55:33 +0100
Subject: [PATCH 2/3] Formatted final_response_match_v1 code and updated
 normalization tests to use pytest assertions instead of prints

---
 .../adk/evaluation/final_response_match_v1.py |  5 +--
 .../evaluation/test_non_english_eval.py       | 38 +++++++++----------
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 3fd136fe39..64b908c9ce 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -28,6 +28,7 @@
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
+from collections import Counter, namedtuple
 
 
 class RougeEvaluator(Evaluator):
@@ -92,6 +93,7 @@ def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
+Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
@@ -138,10 +140,8 @@ def _calculate_character_level_rouge(candidate: str, reference: str):
   Returns:
     A Score namedtuple with precision, recall, and fmeasure.
""" - from collections import Counter, namedtuple if not reference or not candidate: - Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) return Score(precision=0.0, recall=0.0, fmeasure=0.0) # Count character occurrences @@ -161,5 +161,4 @@ def _calculate_character_level_rouge(candidate: str, reference: str): else: fmeasure = 0.0 - Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) return Score(precision=precision, recall=recall, fmeasure=fmeasure) \ No newline at end of file diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py index 23a8cd02ae..3a89ddf784 100644 --- a/tests/unittests/evaluation/test_non_english_eval.py +++ b/tests/unittests/evaluation/test_non_english_eval.py @@ -15,26 +15,22 @@ """Tests for final_response_match_v1.""" from __future__ import annotations +import pytest +def test_normalization_applied_in_rouge(): + """Normalization should make identical Thai strings match.""" + from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores + from google.adk.evaluation.text_utils import normalize_text -def test_debug_normalization(): - """Debug test to see if normalization is being applied.""" - from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores - from google.adk.evaluation.text_utils import normalize_text - - reference = "สวัสดี" - candidate = "สวัสดี" - - # Check normalization directly - norm_ref = normalize_text(reference) - norm_cand = normalize_text(candidate) - - print(f"Reference: {repr(reference)}") - print(f"Candidate: {repr(candidate)}") - print(f"Normalized reference: {repr(norm_ref)}") - print(f"Normalized candidate: {repr(norm_cand)}") - print(f"Are they equal after normalization? 
From edfaa914cd9e09380006436b28b92d61589d4619 Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Wed, 14 Jan 2026 22:00:42 +0100
Subject: [PATCH 3/3] Fixed import sorting and code formatting

---
 .../adk/evaluation/final_response_match_v1.py | 43 +++++++++++--------
 src/google/adk/evaluation/text_utils.py       |  8 ++--
 .../evaluation/test_non_english_eval.py       | 26 +++++------
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 64b908c9ce..83cb0e446c 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,8 @@
 
 from __future__ import annotations
 
+from collections import Counter
+from collections import namedtuple
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -27,8 +29,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
-from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
-from collections import Counter, namedtuple
+from .text_utils import normalize_text
 
 
 class RougeEvaluator(Evaluator):
@@ -48,7 +49,7 @@ def evaluate_invocations(
       conversation_scenario: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if expected_invocations is None:
-      raise ValueError("expected_invocations is required for this metric.")
+      raise ValueError('expected_invocations is required for this metric.')
     del conversation_scenario  # not used by this metric.
 
     total_score = 0.0
@@ -85,16 +86,18 @@ def evaluate_invocations(
 
 def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
   if content and content.parts:
-    return "\n".join([part.text for part in content.parts if part.text])
+    return '\n'.join([part.text for part in content.parts if part.text])
 
-  return ""
+  return ''
 
 
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
+
 Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
 
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -111,7 +114,7 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
     reference: The ground-truth text to compare against.
 
   Returns:
-    A dictionary containing the ROUGE-1 precision, recall, and f-measure.
+    A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
""" # Normalize both texts before scoring to handle Unicode variations normalized_candidate = normalize_text(candidate) @@ -119,46 +122,48 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): # Check if the text contains spaces (word-separated languages) has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate - + if has_spaces: # Use standard word-level ROUGE for space-separated languages - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) scores = scorer.score(normalized_reference, normalized_candidate) - return scores["rouge1"] + return scores['rouge1'] else: # For non-space-separated languages, use character-level comparison - return _calculate_character_level_rouge(normalized_candidate, normalized_reference) + return _calculate_character_level_rouge( + normalized_candidate, normalized_reference + ) def _calculate_character_level_rouge(candidate: str, reference: str): """Calculates character-level ROUGE-1 score for non-space-separated text. - + Args: candidate: The candidate text (already normalized). reference: The reference text (already normalized). - + Returns: A Score namedtuple with precision, recall, and fmeasure. """ - + if not reference or not candidate: return Score(precision=0.0, recall=0.0, fmeasure=0.0) - + # Count character occurrences ref_chars = Counter(reference) cand_chars = Counter(candidate) - + # Calculate overlapping characters overlap = sum((ref_chars & cand_chars).values()) - + # Calculate precision and recall precision = overlap / len(candidate) if len(candidate) > 0 else 0.0 recall = overlap / len(reference) if len(reference) > 0 else 0.0 - + # Calculate F-measure if precision + recall > 0: fmeasure = 2 * (precision * recall) / (precision + recall) else: fmeasure = 0.0 - - return Score(precision=precision, recall=recall, fmeasure=fmeasure) \ No newline at end of file + + return Score(precision=precision, recall=recall, fmeasure=fmeasure) diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py index c4ff75c675..46ef562256 100644 --- a/src/google/adk/evaluation/text_utils.py +++ b/src/google/adk/evaluation/text_utils.py @@ -21,14 +21,14 @@ def normalize_text(text: str) -> str: """Normalize text using NFC normalization and strip whitespace. - + This ensures consistent text comparison across different Unicode representations, which is particularly important for non-English text. - + Args: text: The text to normalize. - + Returns: The normalized text. 
""" - return unicodedata.normalize("NFC", text).strip() \ No newline at end of file + return unicodedata.normalize("NFC", text).strip() diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py index 3a89ddf784..9cdd0efa45 100644 --- a/tests/unittests/evaluation/test_non_english_eval.py +++ b/tests/unittests/evaluation/test_non_english_eval.py @@ -15,22 +15,24 @@ """Tests for final_response_match_v1.""" from __future__ import annotations + import pytest + def test_normalization_applied_in_rouge(): - """Normalization should make identical Thai strings match.""" - from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores - from google.adk.evaluation.text_utils import normalize_text + """Normalization should make identical Thai strings match.""" + from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores + from google.adk.evaluation.text_utils import normalize_text - reference = "สวัสดี" - candidate = "สวัสดี" + reference = "สวัสดี" + candidate = "สวัสดี" - # Verify normalization directly - assert normalize_text(reference) == normalize_text(candidate) + # Verify normalization directly + assert normalize_text(reference) == normalize_text(candidate) - # Verify ROUGE score reflects a perfect match - score = _calculate_rouge_1_scores(candidate, reference) + # Verify ROUGE score reflects a perfect match + score = _calculate_rouge_1_scores(candidate, reference) - assert score.precision == pytest.approx(1.0) - assert score.recall == pytest.approx(1.0) - assert score.fmeasure == pytest.approx(1.0) \ No newline at end of file + assert score.precision == pytest.approx(1.0) + assert score.recall == pytest.approx(1.0) + assert score.fmeasure == pytest.approx(1.0)