From 0f17a06d219ff38c9579381166d5f74ae5b87b9f Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Tue, 13 Jan 2026 00:15:54 +0100
Subject: [PATCH 1/3] Fixes #3111: ROUGE-1 scoring for non-space-separated
 text

- Created text_utils.py with normalize_text() for NFC Unicode normalization
- Modified _calculate_rouge_1_scores() to normalize texts before comparison
- Added automatic detection: word-level ROUGE for space-separated text,
  character-level scoring for non-space-separated text
- Implemented _calculate_character_level_rouge() using Counter to compute
  precision/recall/F-measure from character overlap
- Created test_non_english_eval.py for Thai text evaluation

ROUGE-1 previously returned 0.0 for non-space-separated text such as Thai.
It now provides proportional scores based on character frequency overlap.
---
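Reviewer note (illustrative only, not part of the commit): the character-level
scoring described above reduces to a multiset overlap of code points. A minimal
sketch of the arithmetic, using hypothetical Thai strings:

    from collections import Counter

    reference = "สวัสดีครับ"  # hypothetical reference, 10 code points
    candidate = "สวัสดี"      # hypothetical candidate, 6 code points

    # Each character counts at most min(reference count, candidate count).
    overlap = sum((Counter(reference) & Counter(candidate)).values())  # 6

    precision = overlap / len(candidate)                      # 6 / 6  = 1.0
    recall = overlap / len(reference)                         # 6 / 10 = 0.6
    fmeasure = 2 * precision * recall / (precision + recall)  # 0.75

Because this is a bag-of-characters comparison, word order does not affect the
score, which matches ROUGE-1's unigram design at character granularity.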
 .../adk/evaluation/final_response_match_v1.py | 60 ++++++++++++++++---
 src/google/adk/evaluation/text_utils.py       | 34 +++++++++++
 .../evaluation/test_non_english_eval.py       | 40 +++++++++++++
 3 files changed, 127 insertions(+), 7 deletions(-)
 create mode 100644 src/google/adk/evaluation/text_utils.py
 create mode 100644 tests/unittests/evaluation/test_non_english_eval.py

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index fb17fe80eb..3fd136fe39 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -27,6 +27,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
 
 
 class RougeEvaluator(Evaluator):
@@ -110,10 +111,55 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
     A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
-
-  # The score method returns a dictionary where keys are the ROUGE types
-  # and values are Score objects (tuples) with precision, recall, and fmeasure.
-  scores = scorer.score(reference, candidate)
-
-  return scores["rouge1"]
+  # Normalize both texts before scoring to handle Unicode variations
+  normalized_candidate = normalize_text(candidate)
+  normalized_reference = normalize_text(reference)
+
+  # Check if the text contains spaces (word-separated languages)
+  has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate
+
+  if has_spaces:
+    # Use standard word-level ROUGE for space-separated languages
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+    scores = scorer.score(normalized_reference, normalized_candidate)
+    return scores["rouge1"]
+  else:
+    # For non-space-separated languages, use character-level comparison
+    return _calculate_character_level_rouge(normalized_candidate, normalized_reference)
+
+
+def _calculate_character_level_rouge(candidate: str, reference: str):
+  """Calculates character-level ROUGE-1 score for non-space-separated text.
+
+  Args:
+    candidate: The candidate text (already normalized).
+    reference: The reference text (already normalized).
+
+  Returns:
+    A Score namedtuple with precision, recall, and fmeasure.
+  """
+  from collections import Counter, namedtuple
+
+  if not reference or not candidate:
+    Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+    return Score(precision=0.0, recall=0.0, fmeasure=0.0)
+
+  # Count character occurrences
+  ref_chars = Counter(reference)
+  cand_chars = Counter(candidate)
+
+  # Calculate overlapping characters
+  overlap = sum((ref_chars & cand_chars).values())
+
+  # Calculate precision and recall
+  precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
+  recall = overlap / len(reference) if len(reference) > 0 else 0.0
+
+  # Calculate F-measure
+  if precision + recall > 0:
+    fmeasure = 2 * (precision * recall) / (precision + recall)
+  else:
+    fmeasure = 0.0
+
+  Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+  return Score(precision=precision, recall=recall, fmeasure=fmeasure)
\ No newline at end of file
diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py
new file mode 100644
index 0000000000..c4ff75c675
--- /dev/null
+++ b/src/google/adk/evaluation/text_utils.py
@@ -0,0 +1,34 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text utilities for evaluation."""
+
+from __future__ import annotations
+
+import unicodedata
+
+
+def normalize_text(text: str) -> str:
+  """Normalize text using NFC normalization and strip whitespace.
+
+  This ensures consistent text comparison across different Unicode
+  representations, which is particularly important for non-English text.
+
+  Args:
+    text: The text to normalize.
+
+  Returns:
+    The normalized text.
+  """
+  return unicodedata.normalize("NFC", text).strip()
\ No newline at end of file
diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py
new file mode 100644
index 0000000000..23a8cd02ae
--- /dev/null
+++ b/tests/unittests/evaluation/test_non_english_eval.py
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for final_response_match_v1."""
+
+from __future__ import annotations
+
+
+def test_debug_normalization():
+  """Debug test to see if normalization is being applied."""
+  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
+  from google.adk.evaluation.text_utils import normalize_text
+
+  reference = "สวัสดี"
+  candidate = "สวัสดี"
+
+  # Check normalization directly
+  norm_ref = normalize_text(reference)
+  norm_cand = normalize_text(candidate)
+
+  print(f"Reference: {repr(reference)}")
+  print(f"Candidate: {repr(candidate)}")
+  print(f"Normalized reference: {repr(norm_ref)}")
+  print(f"Normalized candidate: {repr(norm_cand)}")
+  print(f"Are they equal after normalization? {norm_ref == norm_cand}")
+
+  # Now test the actual function
+  score = _calculate_rouge_1_scores(candidate, reference)
+  print(f"ROUGE score: {score}")
\ No newline at end of file
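Note (illustrative, not part of any commit): the NFC step in normalize_text()
matters because the same visible string can be encoded with either composed or
decomposed code points. A minimal standalone demonstration with Latin 'é' (the
same principle applies to other scripts):

    import unicodedata

    composed = "\u00e9"     # 'é' as a single code point
    decomposed = "e\u0301"  # 'e' followed by a combining acute accent

    assert composed != decomposed                                # raw strings differ
    assert unicodedata.normalize("NFC", decomposed) == composed  # NFC unifies them

Without this normalization, two visually identical responses could compare
unequal and score below a perfect match.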
From 249e4994b28a51c75121f6ab5154d4270a9b6885 Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Tue, 13 Jan 2026 12:55:33 +0100
Subject: [PATCH 2/3] Formatted final_response_match_v1 code and updated
 normalization tests to use pytest assertions instead of prints

---
 .../adk/evaluation/final_response_match_v1.py |  5 +--
 .../evaluation/test_non_english_eval.py       | 38 +++++++++----------
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 3fd136fe39..64b908c9ce 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -28,6 +28,7 @@
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
+from collections import Counter, namedtuple
 
 
 class RougeEvaluator(Evaluator):
@@ -92,6 +93,7 @@ def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
+Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
@@ -138,10 +140,8 @@ def _calculate_character_level_rouge(candidate: str, reference: str):
   Returns:
     A Score namedtuple with precision, recall, and fmeasure.
""" - from collections import Counter, namedtuple if not reference or not candidate: - Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) return Score(precision=0.0, recall=0.0, fmeasure=0.0) # Count character occurrences @@ -161,5 +161,4 @@ def _calculate_character_level_rouge(candidate: str, reference: str): else: fmeasure = 0.0 - Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) return Score(precision=precision, recall=recall, fmeasure=fmeasure) \ No newline at end of file diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py index 23a8cd02ae..3a89ddf784 100644 --- a/tests/unittests/evaluation/test_non_english_eval.py +++ b/tests/unittests/evaluation/test_non_english_eval.py @@ -15,26 +15,22 @@ """Tests for final_response_match_v1.""" from __future__ import annotations +import pytest +def test_normalization_applied_in_rouge(): + """Normalization should make identical Thai strings match.""" + from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores + from google.adk.evaluation.text_utils import normalize_text -def test_debug_normalization(): - """Debug test to see if normalization is being applied.""" - from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores - from google.adk.evaluation.text_utils import normalize_text - - reference = "สวัสดี" - candidate = "สวัสดี" - - # Check normalization directly - norm_ref = normalize_text(reference) - norm_cand = normalize_text(candidate) - - print(f"Reference: {repr(reference)}") - print(f"Candidate: {repr(candidate)}") - print(f"Normalized reference: {repr(norm_ref)}") - print(f"Normalized candidate: {repr(norm_cand)}") - print(f"Are they equal after normalization? 
From edfaa914cd9e09380006436b28b92d61589d4619 Mon Sep 17 00:00:00 2001
From: robinpats666
Date: Wed, 14 Jan 2026 22:00:42 +0100
Subject: [PATCH 3/3] Fixed import sorting and code formatting

---
 .../adk/evaluation/final_response_match_v1.py | 43 +++++++++++--------
 src/google/adk/evaluation/text_utils.py       |  8 ++--
 .../evaluation/test_non_english_eval.py       | 26 +++++------
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 64b908c9ce..83cb0e446c 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,8 @@
 
 from __future__ import annotations
 
+from collections import Counter
+from collections import namedtuple
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -27,8 +29,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
-from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
-from collections import Counter, namedtuple
+from .text_utils import normalize_text
 
 
 class RougeEvaluator(Evaluator):
@@ -48,7 +49,7 @@ def evaluate_invocations(
       conversation_scenario: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if expected_invocations is None:
-      raise ValueError("expected_invocations is required for this metric.")
+      raise ValueError('expected_invocations is required for this metric.')
     del conversation_scenario  # not used by this metric.
 
     total_score = 0.0
@@ -85,16 +86,18 @@ def evaluate_invocations(
 
 def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
   if content and content.parts:
-    return "\n".join([part.text for part in content.parts if part.text])
+    return '\n'.join([part.text for part in content.parts if part.text])
 
-  return ""
+  return ''
 
 
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
+
 Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
 
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -111,7 +114,7 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
     reference: The ground-truth text to compare against.
 
   Returns:
-    A dictionary containing the ROUGE-1 precision, recall, and f-measure.
+    A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
""" # Normalize both texts before scoring to handle Unicode variations normalized_candidate = normalize_text(candidate) @@ -119,46 +122,48 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): # Check if the text contains spaces (word-separated languages) has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate - + if has_spaces: # Use standard word-level ROUGE for space-separated languages - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) scores = scorer.score(normalized_reference, normalized_candidate) - return scores["rouge1"] + return scores['rouge1'] else: # For non-space-separated languages, use character-level comparison - return _calculate_character_level_rouge(normalized_candidate, normalized_reference) + return _calculate_character_level_rouge( + normalized_candidate, normalized_reference + ) def _calculate_character_level_rouge(candidate: str, reference: str): """Calculates character-level ROUGE-1 score for non-space-separated text. - + Args: candidate: The candidate text (already normalized). reference: The reference text (already normalized). - + Returns: A Score namedtuple with precision, recall, and fmeasure. """ - + if not reference or not candidate: return Score(precision=0.0, recall=0.0, fmeasure=0.0) - + # Count character occurrences ref_chars = Counter(reference) cand_chars = Counter(candidate) - + # Calculate overlapping characters overlap = sum((ref_chars & cand_chars).values()) - + # Calculate precision and recall precision = overlap / len(candidate) if len(candidate) > 0 else 0.0 recall = overlap / len(reference) if len(reference) > 0 else 0.0 - + # Calculate F-measure if precision + recall > 0: fmeasure = 2 * (precision * recall) / (precision + recall) else: fmeasure = 0.0 - - return Score(precision=precision, recall=recall, fmeasure=fmeasure) \ No newline at end of file + + return Score(precision=precision, recall=recall, fmeasure=fmeasure) diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py index c4ff75c675..46ef562256 100644 --- a/src/google/adk/evaluation/text_utils.py +++ b/src/google/adk/evaluation/text_utils.py @@ -21,14 +21,14 @@ def normalize_text(text: str) -> str: """Normalize text using NFC normalization and strip whitespace. - + This ensures consistent text comparison across different Unicode representations, which is particularly important for non-English text. - + Args: text: The text to normalize. - + Returns: The normalized text. 
""" - return unicodedata.normalize("NFC", text).strip() \ No newline at end of file + return unicodedata.normalize("NFC", text).strip() diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py index 3a89ddf784..9cdd0efa45 100644 --- a/tests/unittests/evaluation/test_non_english_eval.py +++ b/tests/unittests/evaluation/test_non_english_eval.py @@ -15,22 +15,24 @@ """Tests for final_response_match_v1.""" from __future__ import annotations + import pytest + def test_normalization_applied_in_rouge(): - """Normalization should make identical Thai strings match.""" - from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores - from google.adk.evaluation.text_utils import normalize_text + """Normalization should make identical Thai strings match.""" + from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores + from google.adk.evaluation.text_utils import normalize_text - reference = "สวัสดี" - candidate = "สวัสดี" + reference = "สวัสดี" + candidate = "สวัสดี" - # Verify normalization directly - assert normalize_text(reference) == normalize_text(candidate) + # Verify normalization directly + assert normalize_text(reference) == normalize_text(candidate) - # Verify ROUGE score reflects a perfect match - score = _calculate_rouge_1_scores(candidate, reference) + # Verify ROUGE score reflects a perfect match + score = _calculate_rouge_1_scores(candidate, reference) - assert score.precision == pytest.approx(1.0) - assert score.recall == pytest.approx(1.0) - assert score.fmeasure == pytest.approx(1.0) \ No newline at end of file + assert score.precision == pytest.approx(1.0) + assert score.recall == pytest.approx(1.0) + assert score.fmeasure == pytest.approx(1.0)