google-deepmind · Karan24Soni · Jun 12, 2026
diff --git a/proeval/utils/metrics.py b/proeval/utils/metrics.py
@@ -75,7 +75,12 @@ def topic_entropy(topics: List[str], normalize: bool = True) -> float:
     counts = Counter(topics)
     n = len(topics)
     probs = np.array([c / n for c in counts.values()])
-    entropy = float(-np.sum(probs * np.log2(probs + 1e-12)))
+
+    # The 1e-12 smoothing term avoids log2(0) but makes a single-class
+    # distribution (true entropy == 0) compute as a tiny negative number
+    # (e.g. -1.44e-12). Clamp to 0 so entropy is never negative.
+    entropy = max(0.0, float(-np.sum(probs * np.log2(probs + 1e-12))))
+
     if normalize:
         max_ent = np.log2(len(counts)) if len(counts) > 1 else 1.0
         return (entropy / max_ent) * 100.0 if max_ent > 0 else 0.0
@@ -425,4 +430,4 @@ def print_results_table(
         print(
             "AUROC: Higher is better. Measures discrimination ability "
             "(0.5=random, 1.0=perfect)."
-        )
+        )
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -0,0 +1,130 @@
+# Copyright 2026 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for proeval.utils.metrics."""
+import numpy as np
+import pytest
+from proeval.utils.metrics import (
+    compute_samples_to_threshold,
+    embedding_coverage,
+    failure_rate,
+    overall_diversity,
+    topic_entropy,
+)
+def test_topic_entropy_empty():
+    assert topic_entropy([]) == 0.0
+
+@pytest.mark.parametrize("topics", [["a"], ["a", "a", "a"], ["x", "x"]])
+def test_topic_entropy_single_topic(topics):
+    assert topic_entropy(topics, normalize=True) == 0.0
+    assert topic_entropy(topics, normalize=False) == 0.0
+
+def test_topic_entropy_uniform():
+    result = topic_entropy(["a", "b", "a", "b"])
+    assert result == pytest.approx(100.0, abs=1e-6)
+
+@pytest.mark.parametrize("topics", [[], ["a"], ["a", "a"], ["a", "b", "c"], ["a", "a", "b"]])
+def test_topic_entropy_non_negative(topics):  # one pytest case per input
+    assert topic_entropy(topics, normalize=False) >= 0.0
+
+def test_topic_entropy_skewed():
+    result = topic_entropy(["a", "a", "a", "b"])
+    assert 0.0 < result < 100.0
+
+def test_embedding_coverage_too_few_rows():
+    assert embedding_coverage(np.array([[1.0, 2.0]])) == 0.0
+    assert embedding_coverage(np.zeros((0, 2))) == 0.0
+
+def test_embedding_coverage_non_2d():
+    assert embedding_coverage(np.array([1.0, 2.0, 3.0])) == 0.0
+
+def test_embedding_coverage_diverse_vs_collinear():
+    # normalize_to_01=True can yield out-of-bounds values, so compare the raw
+    # (normalize_to_01=False) scores and only assert that diverse > collinear.
+    rng = np.random.default_rng(0)
+    diverse = rng.standard_normal((20, 8))
+    collinear = np.ones((20, 8))
+    diverse_score = embedding_coverage(diverse, normalize_to_01=False)
+    collinear_score = embedding_coverage(collinear, normalize_to_01=False)
+    assert diverse_score > collinear_score
+
+def test_embedding_coverage_finite_raw_logdet():
+    rng = np.random.default_rng(1)
+    emb = rng.standard_normal((10, 4))
+    assert np.isfinite(embedding_coverage(emb, normalize_to_01=False))
+
+
+@pytest.mark.parametrize(
+    "entropy,coverage,w_entropy,w_coverage,expected",
+    [
+        (100.0, 1.0, 0.5, 0.5, 100.0),
+        (0.0, 0.0, 0.5, 0.5, 0.0),
+        (50.0, 0.5, 0.5, 0.5, 50.0),
+        (100.0, 0.0, 1.0, 0.0, 100.0),
+        (0.0, 1.0, 0.0, 1.0, 100.0),
+    ],
+)
+def test_overall_diversity_weights(entropy, coverage, w_entropy, w_coverage, expected):
+    result = overall_diversity(entropy, coverage, w_entropy, w_coverage)
+    assert result == pytest.approx(expected)
+
+# tests for failure_rate
+def test_failure_rate_empty():
+    assert failure_rate([]) == 0.0
+
+@pytest.mark.parametrize(
+    "scores,expected",
+    [
+        ([1.0, 0.0, 1.0, 0.0, 0.0], 40.0),
+        ([0.0, 0.0, 0.0], 0.0),
+        ([1.0, 1.0], 100.0),
+        ([1.0, 0.0, 0.0, 0.0], 25.0),
+    ],
+)
+def test_failure_rate_calc(scores, expected):
+    assert failure_rate(scores) == pytest.approx(expected)
+
+
+def test_compute_samples_skips_empty_estimates():
+    """Methods with no estimates are skipped; methods with estimates are kept."""
+    results = {"empty_method": {"estimates": []}, "good": {"estimates": [[1.0, 0.0]]}}
+    means, _ = compute_samples_to_threshold(
+        results, true_mean=0.0, thresholds=[0.5]
+    )
+    assert "empty_method" not in means
+    assert "good" in means
+
+def test_compute_samples_threshold_crossing():
+    results = {"method": {"estimates": [[0.5, 0.3, 0.1, 0.02, 0.005]]}}
+    means, stds = compute_samples_to_threshold(results, true_mean=0.0, thresholds=[0.05])
+    assert means["method"][0.05] == pytest.approx(4.0)
+    assert stds["method"][0.05] == 0.0
+
+def test_compute_samples_never_reached():
+    results = {"method": {"estimates": [[0.5, 0.4, 0.3]]}}
+    means, _ = compute_samples_to_threshold(results, true_mean=0.0, thresholds=[1e-9])
+    assert means["method"][1e-9] == pytest.approx(3.0)
+
+def test_compute_samples_multiple_runs():
+    results = {
+        "method": {
+            "estimates": [
+                [0.5, 0.04, 0.01, 0.001],
+                [0.5, 0.4, 0.3, 0.02],
+            ]
+        }
+    }
+    means, stds = compute_samples_to_threshold(results, true_mean=0.0, thresholds=[0.05])
+    assert means["method"][0.05] == pytest.approx(3.0)
+    assert stds["method"][0.05] > 0.0