From 130d60976ec2b09a4759b7740f316bc9916c148a Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 19 Feb 2026 17:51:36 +0530 Subject: [PATCH 1/5] added gender assumption bias evaluation --- .gitignore | 3 +- .../evaluation/gender_assumption_bias/run.py | 69 +++++++++++++++++++ backend/scripts/run_all_evaluations.sh | 25 +++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 backend/app/evaluation/gender_assumption_bias/run.py create mode 100755 backend/scripts/run_all_evaluations.sh diff --git a/.gitignore b/.gitignore index 47c77ce..7d78803 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ htmlcov .env .env.test metrics.json -predictions.csv \ No newline at end of file +predictions.csv +backend/app/evaluation/datasets/*.csv \ No newline at end of file diff --git a/backend/app/evaluation/gender_assumption_bias/run.py b/backend/app/evaluation/gender_assumption_bias/run.py new file mode 100644 index 0000000..41912c7 --- /dev/null +++ b/backend/app/evaluation/gender_assumption_bias/run.py @@ -0,0 +1,69 @@ +from pathlib import Path +import pandas as pd +from guardrails.validators import FailResult + +from app.core.validators.gender_assumption_bias import GenderAssumptionBias +from app.evaluation.common.helper import ( + compute_binary_metrics, + Profiler, + write_csv, + write_json, +) + +BASE_DIR = Path(__file__).resolve().parent.parent +OUT_DIR = BASE_DIR / "outputs" / "gender_assumption_bias" + +df = pd.read_csv(BASE_DIR / "datasets" / "gender_bias_assumption_dataset.csv") +print(BASE_DIR, OUT_DIR) +validator = GenderAssumptionBias() + +with Profiler() as p: + df["biased_result"] = ( + df["biased input"] + .astype(str) + .apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)) + ) + + df["neutral_result"] = ( + df["neutral output"] + .astype(str) + .apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x)) + ) + +# For biased input → should FAIL (1) +df["biased_pred"] = df["biased_result"].apply(lambda r: int(isinstance(r, FailResult))) + +# For neutral output → should PASS (0) +df["neutral_pred"] = df["neutral_result"].apply( + lambda r: int(isinstance(r, FailResult)) +) + +df["biased_true"] = 1 +df["neutral_true"] = 0 + +y_true = list(df["biased_true"]) + list(df["neutral_true"]) +y_pred = list(df["biased_pred"]) + list(df["neutral_pred"]) + +metrics = compute_binary_metrics(y_true, y_pred) + +write_csv( + df.drop(columns=["biased_result", "neutral_result"]), + OUT_DIR / "predictions.csv", +) + +write_json( + { + "guardrail": "gender_assumption_bias", + "num_samples": len(df) * 2, # because evaluating both sides + "metrics": metrics, + "performance": { + "latency_ms": { + "mean": sum(p.latencies) / len(p.latencies), + "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)], + "max": max(p.latencies), + }, + "memory_mb": p.peak_memory_mb, + }, + }, + OUT_DIR / "metrics.json", +) diff --git a/backend/scripts/run_all_evaluations.sh b/backend/scripts/run_all_evaluations.sh new file mode 100755 index 0000000..65de917 --- /dev/null +++ b/backend/scripts/run_all_evaluations.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +set -euo pipefail + +BACKEND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +EVAL_DIR="$BACKEND_DIR/app/evaluation" + +RUNNERS=( + "$EVAL_DIR/lexical_slur/run.py" + "$EVAL_DIR/pii/run.py" + "$EVAL_DIR/gender_assumption_bias/run.py" +) + +echo "Running validator evaluations..." 
+echo "Backend dir: $BACKEND_DIR" + +for runner in "${RUNNERS[@]}"; do + name="$(basename "$(dirname "$runner")")" + echo "" + echo "==> [$name] $runner" + uv run python "$runner" +done + +echo "" +echo "All validator evaluations completed." From ab492c49fe50b01a30debdb6f0b37afaa86c6371 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 19 Feb 2026 18:21:49 +0530 Subject: [PATCH 2/5] removed print --- backend/app/evaluation/gender_assumption_bias/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/evaluation/gender_assumption_bias/run.py b/backend/app/evaluation/gender_assumption_bias/run.py index 41912c7..f83ac7e 100644 --- a/backend/app/evaluation/gender_assumption_bias/run.py +++ b/backend/app/evaluation/gender_assumption_bias/run.py @@ -14,7 +14,7 @@ OUT_DIR = BASE_DIR / "outputs" / "gender_assumption_bias" df = pd.read_csv(BASE_DIR / "datasets" / "gender_bias_assumption_dataset.csv") -print(BASE_DIR, OUT_DIR) + validator = GenderAssumptionBias() with Profiler() as p: From 660e2de17d90f968d9cc3fd55b23a83407ab08a5 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Fri, 20 Feb 2026 13:55:54 +0530 Subject: [PATCH 3/5] resolved comments --- backend/README.md | 26 +++++++++++++++++-- backend/app/evaluation/common/helper.py | 6 ++--- .../evaluation/gender_assumption_bias/run.py | 8 +++--- backend/app/evaluation/lexical_slur/run.py | 8 +++--- backend/app/evaluation/pii/entity_metrics.py | 6 ++--- backend/app/evaluation/pii/run.py | 15 +++++++++-- 6 files changed, 51 insertions(+), 18 deletions(-) diff --git a/backend/README.md b/backend/README.md index 1bb528b..212cf96 100644 --- a/backend/README.md +++ b/backend/README.md @@ -97,9 +97,16 @@ If you use GitHub Actions the tests will run automatically. We can benchmark validators like PII Remover and Lexical Slur Detection on curated datasets. -Download the dataset from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store it in `backend/app/evaluation/datasets/` folder. Once the datasets have been stored, we can run the evaluation script for each validator. +Download the dataset from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store them in `backend/app/evaluation/datasets/`. -For lexical slur match, ban list and gender assumption bias, testing doesn't make much sense cause these are deterministic. However, we curated a dataset for lexical slur match for use in toxicity detection validator later on. +Important: each `run.py` expects a specific filename, so dataset files must be named exactly as below: +- `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv` +- `app/evaluation/pii/run.py` expects `pii_detection_testing_dataset.csv` +- `app/evaluation/gender_assumption_bias/run.py` expects `gender_bias_assumption_dataset.csv` + +Once these files are in place with the exact names above, run the evaluation scripts. + +For lexical slur match, ban list and gender assumption bias, testing doesn't make much sense cause these are deterministic. However, we curated datasets for lexical slur match and gender assumption bias for benchmarking. 
The curated dataset for lexical slur match will be later in toxicity detection workflows. Each validator produces: - predictions.csv – row-level outputs for debugging and analysis @@ -111,6 +118,9 @@ app/evaluation/outputs/ lexical_slur/ predictions.csv metrics.json + gender_assumption_bias/ + predictions.csv + metrics.json pii_remover/ predictions.csv metrics.json @@ -140,6 +150,18 @@ predictions.csv contains original text, anonymized output, ground-truth masked t metrics.json contains entity-level precision, recall, and F1 per PII type. +- To evaluate Gender Assumption Bias Validator, run: `python app/evaluation/gender_assumption_bias/run.py` + +Expected outputs: +``` +app/evaluation/outputs/gender_assumption_bias/ +├── predictions.csv +└── metrics.json +``` +`predictions.csv` contains biased and neutral samples with predicted outcomes for each. + +`metrics.json` contains binary classification metrics and performance stats (latency + peak memory). + ### Test running stack If your stack is already up and you just want to run the tests, you can use: diff --git a/backend/app/evaluation/common/helper.py b/backend/app/evaluation/common/helper.py index a51dcc2..019b0a5 100644 --- a/backend/app/evaluation/common/helper.py +++ b/backend/app/evaluation/common/helper.py @@ -31,9 +31,9 @@ def compute_binary_metrics(y_true, y_pred): "tn": tn, "fp": fp, "fn": fn, - "precision": precision, - "recall": recall, - "f1": f1, + "precision": round(precision, 2), + "recall": round(recall, 2), + "f1": round(f1, 2), } diff --git a/backend/app/evaluation/gender_assumption_bias/run.py b/backend/app/evaluation/gender_assumption_bias/run.py index f83ac7e..9fee7ab 100644 --- a/backend/app/evaluation/gender_assumption_bias/run.py +++ b/backend/app/evaluation/gender_assumption_bias/run.py @@ -58,11 +58,11 @@ "metrics": metrics, "performance": { "latency_ms": { - "mean": sum(p.latencies) / len(p.latencies), - "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)], - "max": max(p.latencies), + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), }, - "memory_mb": p.peak_memory_mb, + "memory_mb": round(p.peak_memory_mb, 2), }, }, OUT_DIR / "metrics.json", diff --git a/backend/app/evaluation/lexical_slur/run.py b/backend/app/evaluation/lexical_slur/run.py index 6927747..c187aa7 100644 --- a/backend/app/evaluation/lexical_slur/run.py +++ b/backend/app/evaluation/lexical_slur/run.py @@ -39,11 +39,11 @@ "metrics": metrics, "performance": { "latency_ms": { - "mean": sum(p.latencies) / len(p.latencies), - "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)], - "max": max(p.latencies), + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), }, - "memory_mb": p.peak_memory_mb, + "memory_mb": round(p.peak_memory_mb, 2), }, }, OUT_DIR / "metrics.json", diff --git a/backend/app/evaluation/pii/entity_metrics.py b/backend/app/evaluation/pii/entity_metrics.py index 8664529..73b1bae 100644 --- a/backend/app/evaluation/pii/entity_metrics.py +++ b/backend/app/evaluation/pii/entity_metrics.py @@ -75,9 +75,9 @@ def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]: "tp": tp, "fp": fp, "fn": fn, - "precision": precision, - "recall": recall, - "f1": f1, + "precision": round(precision, 2), + "recall": round(recall, 2), + "f1": round(f1, 2), } return report diff --git a/backend/app/evaluation/pii/run.py 
b/backend/app/evaluation/pii/run.py index 5dfd819..aec2af8 100644 --- a/backend/app/evaluation/pii/run.py +++ b/backend/app/evaluation/pii/run.py @@ -4,7 +4,7 @@ from app.core.validators.pii_remover import PIIRemover from app.evaluation.pii.entity_metrics import compute_entity_metrics -from app.evaluation.common.helper import write_csv, write_json +from app.evaluation.common.helper import Profiler, write_csv, write_json BASE_DIR = Path(__file__).resolve().parent.parent OUT_DIR = BASE_DIR / "outputs" / "pii_remover" @@ -21,7 +21,10 @@ def run_pii(text: str) -> str: return text -df["anonymized"] = df["source_text"].astype(str).apply(run_pii) +with Profiler() as p: + df["anonymized"] = ( + df["source_text"].astype(str).apply(lambda x: p.record(run_pii, x)) + ) entity_report = compute_entity_metrics( df["target_text"], @@ -36,6 +39,14 @@ def run_pii(text: str) -> str: "guardrail": "pii_remover", "num_samples": len(df), "entity_metrics": entity_report, + "performance": { + "latency_ms": { + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), + }, + "memory_mb": round(p.peak_memory_mb, 2), + }, }, OUT_DIR / "metrics.json", ) From 7182e768af88c1ae0fb38b18a10343e394e37570 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Fri, 20 Feb 2026 14:03:29 +0530 Subject: [PATCH 4/5] resolved comments --- backend/README.md | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/backend/README.md b/backend/README.md index 212cf96..43471a4 100644 --- a/backend/README.md +++ b/backend/README.md @@ -97,7 +97,7 @@ If you use GitHub Actions the tests will run automatically. We can benchmark validators like PII Remover and Lexical Slur Detection on curated datasets. -Download the dataset from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store them in `backend/app/evaluation/datasets/`. +Download the dataset from [Google Drive](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89).This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store them in `backend/app/evaluation/datasets/`. Important: each `run.py` expects a specific filename, so dataset files must be named exactly as below: - `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv` @@ -106,14 +106,14 @@ Important: each `run.py` expects a specific filename, so dataset files must be n Once these files are in place with the exact names above, run the evaluation scripts. -For lexical slur match, ban list and gender assumption bias, testing doesn't make much sense cause these are deterministic. However, we curated datasets for lexical slur match and gender assumption bias for benchmarking. The curated dataset for lexical slur match will be later in toxicity detection workflows. +Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. However, curated datasets exist for lexical slur match and gender assumption bias to benchmark accuracy and latency. The lexical slur dataset will also be used in future toxicity detection workflows. 
Each validator produces: - predictions.csv – row-level outputs for debugging and analysis - metrics.json – aggregated accuracy + performance metrics Standardized output structure: -``` +```text app/evaluation/outputs/ lexical_slur/ predictions.csv @@ -128,8 +128,28 @@ app/evaluation/outputs/ - To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py` -Expected outputs: +- To run all evaluation scripts together, use: +```bash +bash scripts/run_all_evaluations.sh +``` +This script runs the evaluators in sequence: +- `app/evaluation/lexical_slur/run.py` +- `app/evaluation/pii/run.py` +- `app/evaluation/gender_assumption_bias/run.py` + +Expected aggregate outputs: +```text +app/evaluation/outputs/ + lexical_slur/predictions.csv + lexical_slur/metrics.json + pii_remover/predictions.csv + pii_remover/metrics.json + gender_assumption_bias/predictions.csv + gender_assumption_bias/metrics.json ``` + +Expected outputs: +```text app/evaluation/outputs/lexical_slur/ ├── predictions.csv └── metrics.json @@ -141,7 +161,7 @@ metrics.json contains binary classification metrics and performance stats (laten - To evaluate PII Validator, run the PII evaluation script: `python app/evaluation/pii/run.py` Expected outputs: -``` +```text app/evaluation/outputs/pii_remover/ ├── predictions.csv └── metrics.json @@ -153,7 +173,7 @@ metrics.json contains entity-level precision, recall, and F1 per PII type. - To evaluate Gender Assumption Bias Validator, run: `python app/evaluation/gender_assumption_bias/run.py` Expected outputs: -``` +```text app/evaluation/outputs/gender_assumption_bias/ ├── predictions.csv └── metrics.json From 77a48f10be7258e70ba7fbd7863937b28f37c206 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Sun, 22 Feb 2026 16:50:13 +0530 Subject: [PATCH 5/5] resolved comments --- backend/README.md | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) diff --git a/backend/README.md b/backend/README.md index 101662a..cec452a 100644 --- a/backend/README.md +++ b/backend/README.md @@ -126,8 +126,6 @@ app/evaluation/outputs/ metrics.json ``` -- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py` - - To run all evaluation scripts together, use: ```bash bash scripts/run_all_evaluations.sh @@ -137,47 +135,20 @@ This script runs the evaluators in sequence: - `app/evaluation/pii/run.py` - `app/evaluation/gender_assumption_bias/run.py` -Expected aggregate outputs: -```text -app/evaluation/outputs/ - lexical_slur/predictions.csv - lexical_slur/metrics.json - pii_remover/predictions.csv - pii_remover/metrics.json - gender_assumption_bias/predictions.csv - gender_assumption_bias/metrics.json -``` - -Expected outputs: -```text -app/evaluation/outputs/lexical_slur/ -├── predictions.csv -└── metrics.json -``` predictions.csv contains row-level inputs, predictions, and labels. metrics.json contains binary classification metrics and performance stats (latency + peak memory). 
+- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py` + - To evaluate PII Validator, run the PII evaluation script: `python app/evaluation/pii/run.py` -Expected outputs: -```text -app/evaluation/outputs/pii_remover/ -├── predictions.csv -└── metrics.json -``` -predictions.csv contains original text, anonymized output, ground-truth masked text +`predictions.csv` contains original text, anonymized output, ground-truth masked text -metrics.json contains entity-level precision, recall, and F1 per PII type. +`metrics.json` contains entity-level precision, recall, and F1 per PII type. - To evaluate Gender Assumption Bias Validator, run: `python app/evaluation/gender_assumption_bias/run.py` -Expected outputs: -```text -app/evaluation/outputs/gender_assumption_bias/ -├── predictions.csv -└── metrics.json -``` `predictions.csv` contains biased and neutral samples with predicted outcomes for each. `metrics.json` contains binary classification metrics and performance stats (latency + peak memory).