diff --git a/.gitignore b/.gitignore
index 47c77ce..7d78803 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ htmlcov
 .env
 .env.test
 metrics.json
-predictions.csv
\ No newline at end of file
+predictions.csv
+backend/app/evaluation/datasets/*.csv
\ No newline at end of file
diff --git a/backend/README.md b/backend/README.md
index bf1f38a..cec452a 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -97,48 +97,61 @@ If you use GitHub Actions the tests will run automatically.
 
 We can benchmark validators like PII Remover and Lexical Slur Detection on curated datasets.
 
-Download the dataset from [here](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). This contains multiple folders, one for each validator. Each folder contains a testing dataset in csv format for the validator. Download these csv files and store it in `backend/app/evaluation/datasets/` folder. Once the datasets have been stored, we can run the evaluation script for each validator.
+Download the dataset from [Google Drive](https://drive.google.com/drive/u/0/folders/1Rd1LH-oEwCkU0pBDRrYYedExorwmXA89). It contains one folder per validator, and each folder holds that validator's testing dataset in CSV format. Download these CSV files and store them in `backend/app/evaluation/datasets/`.
 
-For lexical slur match, ban list and gender assumption bias, testing doesn't make much sense cause these are deterministic. However, we curated a dataset for lexical slur match for use in toxicity detection validator later on.
+Important: each `run.py` expects a specific filename, so dataset files must be named exactly as below:
+- `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv`
+- `app/evaluation/pii/run.py` expects `pii_detection_testing_dataset.csv`
+- `app/evaluation/gender_assumption_bias/run.py` expects `gender_bias_assumption_dataset.csv`
+
+Once these files are in place with the exact names above, run the evaluation scripts.
+
+Unit tests for the lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. However, curated datasets exist for lexical slur match and gender assumption bias to benchmark accuracy and latency. The lexical slur dataset will also be used in future toxicity detection workflows.
 
 Each validator produces:
 - predictions.csv – row-level outputs for debugging and analysis
 - metrics.json – aggregated accuracy + performance metrics
 
 Standardized output structure:
-```
+```text
 app/evaluation/outputs/
   lexical_slur/
     predictions.csv
     metrics.json
+  gender_assumption_bias/
+    predictions.csv
+    metrics.json
   pii_remover/
     predictions.csv
     metrics.json
 ```
 
-- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`
-
-Expected outputs:
-```
-app/evaluation/outputs/lexical_slur/
-├── predictions.csv
-└── metrics.json
+- To run all evaluation scripts together, use:
+```bash
+bash scripts/run_all_evaluations.sh
 ```
+This script runs the evaluators in sequence:
+- `app/evaluation/lexical_slur/run.py`
+- `app/evaluation/pii/run.py`
+- `app/evaluation/gender_assumption_bias/run.py`
+
 predictions.csv contains row-level inputs, predictions, and labels.
 
 metrics.json contains binary classification metrics and performance stats (latency + peak memory).
+- To evaluate Lexical Slur Validator, run the offline evaluation script: `python app/evaluation/lexical_slur/run.py`
+
 - To evaluate PII Validator, run the PII evaluation script: `python app/evaluation/pii/run.py`
 
-Expected outputs:
-```
-app/evaluation/outputs/pii_remover/
-├── predictions.csv
-└── metrics.json
-```
-predictions.csv contains original text, anonymized output, ground-truth masked text
+`predictions.csv` contains the original text, anonymized output, and ground-truth masked text.
+
+`metrics.json` contains entity-level precision, recall, and F1 per PII type.
+
+- To evaluate Gender Assumption Bias Validator, run: `python app/evaluation/gender_assumption_bias/run.py`
+
+`predictions.csv` contains biased and neutral samples with predicted outcomes for each.
-metrics.json contains entity-level precision, recall, and F1 per PII type.
+`metrics.json` contains binary classification metrics and performance stats (latency + peak memory).
 
 ## Validator configuration guide
diff --git a/backend/app/evaluation/common/helper.py b/backend/app/evaluation/common/helper.py
index a51dcc2..019b0a5 100644
--- a/backend/app/evaluation/common/helper.py
+++ b/backend/app/evaluation/common/helper.py
@@ -31,9 +31,9 @@ def compute_binary_metrics(y_true, y_pred):
         "tn": tn,
         "fp": fp,
         "fn": fn,
-        "precision": precision,
-        "recall": recall,
-        "f1": f1,
+        "precision": round(precision, 2),
+        "recall": round(recall, 2),
+        "f1": round(f1, 2),
     }
diff --git a/backend/app/evaluation/gender_assumption_bias/run.py b/backend/app/evaluation/gender_assumption_bias/run.py
new file mode 100644
index 0000000..9fee7ab
--- /dev/null
+++ b/backend/app/evaluation/gender_assumption_bias/run.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.validators import FailResult
+
+from app.core.validators.gender_assumption_bias import GenderAssumptionBias
+from app.evaluation.common.helper import (
+    compute_binary_metrics,
+    Profiler,
+    write_csv,
+    write_json,
+)
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "gender_assumption_bias"
+
+df = pd.read_csv(BASE_DIR / "datasets" / "gender_bias_assumption_dataset.csv")
+
+validator = GenderAssumptionBias()
+
+with Profiler() as p:
+    df["biased_result"] = (
+        df["biased input"]
+        .astype(str)
+        .apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x))
+    )
+
+    df["neutral_result"] = (
+        df["neutral output"]
+        .astype(str)
+        .apply(lambda x: p.record(lambda t: validator.validate(t, metadata=None), x))
+    )
+
+# For biased input → should FAIL (1)
+df["biased_pred"] = df["biased_result"].apply(lambda r: int(isinstance(r, FailResult)))
+
+# For neutral output → should PASS (0)
+df["neutral_pred"] = df["neutral_result"].apply(
+    lambda r: int(isinstance(r, FailResult))
+)
+
+df["biased_true"] = 1
+df["neutral_true"] = 0
+
+y_true = list(df["biased_true"]) + list(df["neutral_true"])
+y_pred = list(df["biased_pred"]) + list(df["neutral_pred"])
+
+metrics = compute_binary_metrics(y_true, y_pred)
+
+write_csv(
+    df.drop(columns=["biased_result", "neutral_result"]),
+    OUT_DIR / "predictions.csv",
+)
+
+write_json(
+    {
+        "guardrail": "gender_assumption_bias",
+        "num_samples": len(df) * 2,  # because evaluating both sides
+        "metrics": metrics,
+        "performance": {
+            "latency_ms": {
+                "mean": round(sum(p.latencies) / len(p.latencies), 2),
+                "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
+                "max": round(max(p.latencies), 2),
+            },
+            "memory_mb": round(p.peak_memory_mb, 2),
+        },
+    },
+    OUT_DIR / "metrics.json",
+)
"metrics.json", +) diff --git a/backend/app/evaluation/lexical_slur/run.py b/backend/app/evaluation/lexical_slur/run.py index 6927747..c187aa7 100644 --- a/backend/app/evaluation/lexical_slur/run.py +++ b/backend/app/evaluation/lexical_slur/run.py @@ -39,11 +39,11 @@ "metrics": metrics, "performance": { "latency_ms": { - "mean": sum(p.latencies) / len(p.latencies), - "p95": sorted(p.latencies)[int(len(p.latencies) * 0.95)], - "max": max(p.latencies), + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), }, - "memory_mb": p.peak_memory_mb, + "memory_mb": round(p.peak_memory_mb, 2), }, }, OUT_DIR / "metrics.json", diff --git a/backend/app/evaluation/pii/entity_metrics.py b/backend/app/evaluation/pii/entity_metrics.py index 8664529..73b1bae 100644 --- a/backend/app/evaluation/pii/entity_metrics.py +++ b/backend/app/evaluation/pii/entity_metrics.py @@ -75,9 +75,9 @@ def finalize_entity_metrics(stats: Dict[str, dict]) -> Dict[str, dict]: "tp": tp, "fp": fp, "fn": fn, - "precision": precision, - "recall": recall, - "f1": f1, + "precision": round(precision, 2), + "recall": round(recall, 2), + "f1": round(f1, 2), } return report diff --git a/backend/app/evaluation/pii/run.py b/backend/app/evaluation/pii/run.py index 5dfd819..aec2af8 100644 --- a/backend/app/evaluation/pii/run.py +++ b/backend/app/evaluation/pii/run.py @@ -4,7 +4,7 @@ from app.core.validators.pii_remover import PIIRemover from app.evaluation.pii.entity_metrics import compute_entity_metrics -from app.evaluation.common.helper import write_csv, write_json +from app.evaluation.common.helper import Profiler, write_csv, write_json BASE_DIR = Path(__file__).resolve().parent.parent OUT_DIR = BASE_DIR / "outputs" / "pii_remover" @@ -21,7 +21,10 @@ def run_pii(text: str) -> str: return text -df["anonymized"] = df["source_text"].astype(str).apply(run_pii) +with Profiler() as p: + df["anonymized"] = ( + df["source_text"].astype(str).apply(lambda x: p.record(run_pii, x)) + ) entity_report = compute_entity_metrics( df["target_text"], @@ -36,6 +39,14 @@ def run_pii(text: str) -> str: "guardrail": "pii_remover", "num_samples": len(df), "entity_metrics": entity_report, + "performance": { + "latency_ms": { + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), + }, + "memory_mb": round(p.peak_memory_mb, 2), + }, }, OUT_DIR / "metrics.json", ) diff --git a/backend/scripts/run_all_evaluations.sh b/backend/scripts/run_all_evaluations.sh new file mode 100755 index 0000000..65de917 --- /dev/null +++ b/backend/scripts/run_all_evaluations.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +set -euo pipefail + +BACKEND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +EVAL_DIR="$BACKEND_DIR/app/evaluation" + +RUNNERS=( + "$EVAL_DIR/lexical_slur/run.py" + "$EVAL_DIR/pii/run.py" + "$EVAL_DIR/gender_assumption_bias/run.py" +) + +echo "Running validator evaluations..." +echo "Backend dir: $BACKEND_DIR" + +for runner in "${RUNNERS[@]}"; do + name="$(basename "$(dirname "$runner")")" + echo "" + echo "==> [$name] $runner" + uv run python "$runner" +done + +echo "" +echo "All validator evaluations completed."