3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ htmlcov
.env
.env.test
metrics.json
predictions.csv
predictions.csv
backend/app/evaluation/datasets/*.csv
49 changes: 31 additions & 18 deletions backend/README.md
@@ -97,48 +97,61 @@ If you use GitHub Actions the tests will run automatically.

We can benchmark validators like PII Remover and Lexical Slur Detection on curated datasets.

Download the dataset from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store it in `backend/app/evaluation/datasets/` folder. Once the datasets have been stored, we can run the evaluation script for each validator.
Download the dataset from [Google Drive](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator, and each folder contains that validator's testing dataset in CSV format. Download these CSV files and store them in `backend/app/evaluation/datasets/`.

For lexical slur match, ban list and gender assumption bias, testing doesn't make much sense cause these are deterministic. However, we curated a dataset for lexical slur match for use in toxicity detection validator later on.
Important: each `run.py` expects a specific filename, so dataset files must be named exactly as below:
- `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv`
- `app/evaluation/pii/run.py` expects `pii_detection_testing_dataset.csv`
- `app/evaluation/gender_assumption_bias/run.py` expects `gender_bias_assumption_dataset.csv`

Once these files are in place with the exact names above, run the evaluation scripts.

Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. However, curated datasets exist for lexical slur match and gender assumption bias to benchmark accuracy and latency. The lexical slur dataset will also be used in future toxicity detection workflows.

Each validator produces:
- predictions.csv – row-level outputs for debugging and analysis
- metrics.json – aggregated accuracy + performance metrics

Standardized output structure:
```
```text
app/evaluation/outputs/
lexical_slur/
predictions.csv
metrics.json
gender_assumption_bias/
predictions.csv
metrics.json
pii_remover/
predictions.csv
metrics.json
```
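
For orientation, the payload each runner passes to `write_json` (and therefore the shape of `metrics.json`) looks roughly like the sketch below. The keys mirror the runners in this PR; the numbers are purely illustrative, and the PII runner reports `entity_metrics` instead of the binary `metrics` block.

```python
# Illustrative only: keys mirror the write_json payloads in this PR,
# values are made up. The PII runner uses "entity_metrics" instead of "metrics".
example_metrics = {
    "guardrail": "lexical_slur",
    "num_samples": 500,
    "metrics": {
        "tp": 240, "tn": 245, "fp": 5, "fn": 10,
        "precision": 0.98, "recall": 0.96, "f1": 0.97,
    },
    "performance": {
        "latency_ms": {"mean": 1.42, "p95": 2.10, "max": 5.33},
        "memory_mb": 12.5,
    },
}
```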

- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`

Expected outputs:
```
app/evaluation/outputs/lexical_slur/
├── predictions.csv
└── metrics.json
- To run all evaluation scripts together, use:
```bash
bash scripts/run_all_evaluations.sh
```
This script runs the evaluators in sequence:
- `app/evaluation/lexical_slur/run.py`
- `app/evaluation/pii/run.py`
- `app/evaluation/gender_assumption_bias/run.py`
Comment on lines +129 to +136

⚠️ Potential issue | 🟡 Minor

Fenced code block inside a list item must be indented to maintain list context.

On GitHub (CommonMark), an unindented ``` at column 0 terminates the enclosing list item. The "This script runs the evaluators in sequence:" paragraph and the sub-list (lines 133–136) are then rendered as a separate top-level block rather than as a continuation of the bullet.

📝 Proposed fix — indent the code block and continuation text
-
-- To run all evaluation scripts together, use:
-```bash
-bash scripts/run_all_evaluations.sh
-```
-This script runs the evaluators in sequence:
-- `app/evaluation/lexical_slur/run.py`
-- `app/evaluation/pii/run.py`
-- `app/evaluation/gender_assumption_bias/run.py`
+
+To run all evaluation scripts together:
+
+```bash
+bash scripts/run_all_evaluations.sh
+```
+
+This script runs the evaluators in sequence:
+- `app/evaluation/lexical_slur/run.py`
+- `app/evaluation/pii/run.py`
+- `app/evaluation/gender_assumption_bias/run.py`
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/README.md` around lines 129 - 136, The fenced code block in
backend/README.md breaks the enclosing list item; fix by making the code block
and the following paragraph part of the same list item—either indent the fenced
block and continuation lines by four spaces (so the ```bash block and the "This
script runs the evaluators in sequence:" paragraph remain inside the bullet) or
replace the list entry using the proposed reflow: move the "To run all
evaluation scripts together:" line before the fenced block, wrap the block with
```bash ... ```, and then list the three evaluator paths
(`app/evaluation/lexical_slur/run.py`, `app/evaluation/pii/run.py`,
`app/evaluation/gender_assumption_bias/run.py`) as sub-list items; ensure the
script name scripts/run_all_evaluations.sh is included exactly in the fenced
block.


predictions.csv contains row-level inputs, predictions, and labels.

metrics.json contains binary classification metrics and performance stats (latency + peak memory).

- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`

- To evaluate PII Validator, run the PII evaluation script: `python app/evaluation/pii/run.py`

Expected outputs:
```
app/evaluation/outputs/pii_remover/
├── predictions.csv
└── metrics.json
```
predictions.csv contains original text, anonymized output, ground-truth masked text
`predictions.csv` contains original text, anonymized output, ground-truth masked text

`metrics.json` contains entity-level precision, recall, and F1 per PII type.
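
For reference, a hypothetical slice of that entity-level report, following the keys produced by `finalize_entity_metrics` in this PR (the entity type names and numbers below are assumptions, not taken from the dataset):

```python
# Hypothetical example: keys follow finalize_entity_metrics in this PR;
# entity type names and counts are made up for illustration.
example_entity_report = {
    "EMAIL_ADDRESS": {"tp": 118, "fp": 3, "fn": 2,
                      "precision": 0.98, "recall": 0.98, "f1": 0.98},
    "PHONE_NUMBER": {"tp": 95, "fp": 7, "fn": 9,
                     "precision": 0.93, "recall": 0.91, "f1": 0.92},
}
```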

- To evaluate Gender Assumption Bias Validator, run: `python app/evaluation/gender_assumption_bias/run.py`

`predictions.csv` contains biased and neutral samples with predicted outcomes for each.

metrics.json contains entity-level precision, recall, and F1 per PII type.
`metrics.json` contains binary classification metrics and performance stats (latency + peak memory).
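
To eyeball all results in one place after the runs finish, a minimal sketch (assuming execution from `backend/` and the default output layout shown above):

```python
import json
from pathlib import Path

# Minimal sketch: print each validator's metrics.json after the evaluations
# have been run. Assumes the default output layout under app/evaluation/outputs.
outputs = Path("app/evaluation/outputs")
for metrics_file in sorted(outputs.glob("*/metrics.json")):
    print(metrics_file.parent.name, json.loads(metrics_file.read_text()))
```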

## Validator configuration guide

6 changes: 3 additions & 3 deletions backend/app/evaluation/common/helper.py
@@ -31,9 +31,9 @@ def compute_binary_metrics(y_true, y_pred):
"tn": tn,
"fp": fp,
"fn": fn,
"precision": precision,
"recall": recall,
"f1": f1,
"precision": round(precision, 2),
"recall": round(recall, 2),
"f1": round(f1, 2),
}


69 changes: 69 additions & 0 deletions backend/app/evaluation/gender_assumption_bias/run.py
@@ -0,0 +1,69 @@
from pathlib import Path
import pandas as pd
from guardrails.validators import FailResult

from app.core.validators.gender_assumption_bias import GenderAssumptionBias
from app.evaluation.common.helper import (
    compute_binary_metrics,
    Profiler,
    write_csv,
    write_json,
)

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "gender_assumption_bias"

df = pd.read_csv(BASE_DIR / "datasets" / "gender_bias_assumption_dataset.csv")

validator = GenderAssumptionBias()

with Profiler() as p:
df["biased_result"] = (
df["biased input"]
.astype(str)
.apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x))
)

df["neutral_result"] = (
df["neutral output"]
.astype(str)
.apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x))
)

# For biased input → should FAIL (1)
df["biased_pred"] = df["biased_result"].apply(lambda r: int(isinstance(r, FailResult)))

# For neutral output → should PASS (0)
df["neutral_pred"] = df["neutral_result"].apply(
    lambda r: int(isinstance(r, FailResult))
)

df["biased_true"] = 1
df["neutral_true"] = 0

y_true = list(df["biased_true"]) + list(df["neutral_true"])
y_pred = list(df["biased_pred"]) + list(df["neutral_pred"])

metrics = compute_binary_metrics(y_true, y_pred)

write_csv(
    df.drop(columns=["biased_result", "neutral_result"]),
    OUT_DIR / "predictions.csv",
)

write_json(
    {
        "guardrail": "gender_assumption_bias",
        "num_samples": len(df) * 2,  # because evaluating both sides
        "metrics": metrics,
        "performance": {
            "latency_ms": {
                "mean": round(sum(p.latencies) / len(p.latencies), 2),
                "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
                "max": round(max(p.latencies), 2),
            },
            "memory_mb": round(p.peak_memory_mb, 2),
        },
    },
    OUT_DIR / "metrics.json",
)
8 changes: 4 additions & 4 deletions backend/app/evaluation/lexical_slur/run.py
@@ -39,11 +39,11 @@
"metrics": metrics,
"performance": {
"latency_ms": {
"mean": sum(p.latencies) / len(p.latencies),
"p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)],
"max": max(p.latencies),
"mean": round(sum(p.latencies) / len(p.latencies), 2),
"p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
"max": round(max(p.latencies), 2),
},
"memory_mb": p.peak_memory_mb,
"memory_mb": round(p.peak_memory_mb, 2),
},
},
OUT_DIR / "metrics.json",
6 changes: 3 additions & 3 deletions backend/app/evaluation/pii/entity_metrics.py
@@ -75,9 +75,9 @@ def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]:
"tp": tp,
"fp": fp,
"fn": fn,
"precision": precision,
"recall": recall,
"f1": f1,
"precision": round(precision, 2),
"recall": round(recall, 2),
"f1": round(f1, 2),
}

return report
15 changes: 13 additions & 2 deletions backend/app/evaluation/pii/run.py
@@ -4,7 +4,7 @@

from app.core.validators.pii_remover import PIIRemover
from app.evaluation.pii.entity_metrics import compute_entity_metrics
from app.evaluation.common.helper import write_csv, write_json
from app.evaluation.common.helper import Profiler, write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "pii_remover"
@@ -21,7 +21,10 @@ def run_pii(text: str) -> str:
    return text


df["anonymized"] = df["source_text"].astype(str).apply(run_pii)
with Profiler() as p:
df["anonymized"] = (
df["source_text"].astype(str).apply(lambda x: p.record(run_pii, x))
)

entity_report = compute_entity_metrics(
df["target_text"],
@@ -36,6 +39,14 @@ def run_pii(text: str) -> str:
"guardrail": "pii_remover",
"num_samples": len(df),
"entity_metrics": entity_report,
"performance": {
"latency_ms": {
"mean": round(sum(p.latencies) / len(p.latencies), 2),
"p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
"max": round(max(p.latencies), 2),
},
"memory_mb": round(p.peak_memory_mb, 2),
},
Comment on lines +42 to +49

⚠️ Potential issue | 🟡 Minor

Guard against empty p.latencies to avoid ZeroDivisionError / IndexError.

This performance block is newly added. If the input CSV is empty (or all rows are somehow filtered), p.latencies will be an empty list, and Line 44 (sum(...) / len(...)) will raise ZeroDivisionError, Line 45 will raise IndexError, and Line 46 will raise ValueError. The same issue exists in lexical_slur/run.py.

The cleanest fix is the Profiler.summary() helper suggested in my comment on lexical_slur/run.py, which centralizes the guard and eliminates duplication. If you'd rather keep things inline, a simple if p.latencies: guard suffices.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/evaluation/pii/run.py` around lines 42 - 49, The performance
block currently assumes p.latencies is non-empty which can raise
ZeroDivisionError/IndexError/ValueError; either implement and call a centralized
Profiler.summary() helper that returns safe values (e.g., mean, p95, max as
numbers or None) for an empty latencies list and use that result here, or wrap
the existing computation in an if p.latencies: guard and otherwise set "mean",
"p95", "max" to None (or 0 per project convention); apply the same change to
lexical_slur/run.py so both places reference the safe summary implementation or
the same guard logic.
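
To make the suggestion concrete, one possible shape for the centralized guard is sketched below. This is not code from the PR: the helper name, its placement, and the `None` fallbacks are assumptions. It relies only on the `latencies` list and `peak_memory_mb` value that the runners in this diff already read off the profiler, and it mirrors their existing p95 formula.

```python
# Hypothetical helper, not part of this PR: centralizes the empty-latencies
# guard. It could live on Profiler (e.g. as a summary() method) as suggested;
# shown here as a standalone function for clarity.
def performance_summary(latencies, peak_memory_mb):
    """Build the 'performance' block, tolerating an empty latencies list."""
    if not latencies:
        return {
            "latency_ms": {"mean": None, "p95": None, "max": None},
            "memory_mb": round(peak_memory_mb, 2),
        }
    ordered = sorted(latencies)
    p95_index = min(int(len(ordered) * 0.95), len(ordered) - 1)
    return {
        "latency_ms": {
            "mean": round(sum(ordered) / len(ordered), 2),
            "p95": round(ordered[p95_index], 2),
            "max": round(ordered[-1], 2),
        },
        "memory_mb": round(peak_memory_mb, 2),
    }
```

Each runner could then set `"performance": performance_summary(p.latencies, p.peak_memory_mb)` in its payload instead of computing the block inline.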

    },
    OUT_DIR / "metrics.json",
)
25 changes: 25 additions & 0 deletions backend/scripts/run_all_evaluations.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash

set -euo pipefail

BACKEND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
EVAL_DIR="$BACKEND_DIR/app/evaluation"

RUNNERS=(
"$EVAL_DIR/lexical_slur/run.py"
"$EVAL_DIR/pii/run.py"
"$EVAL_DIR/gender_assumption_bias/run.py"
)

echo "Running validator evaluations..."
echo "Backend dir: $BACKEND_DIR"

for runner in "${RUNNERS[@]}"; do
name="$(basename "$(dirname "$runner")")"
echo ""
echo "==> [$name] $runner"
uv run python "$runner"
done

echo ""
echo "All validator evaluations completed."