diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 8e9c9f581..81c597d68 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.10.70"
+version = "2.10.72"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/samples/classifier_demo/README.md b/packages/uipath/samples/classifier_demo/README.md
new file mode 100644
index 000000000..638765ab1
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/README.md
@@ -0,0 +1,139 @@
+# Classifier evaluator end-to-end demo
+
+A minimal intent-classification agent that exercises the new
+`ClassifierEvaluator` end-to-end. Use this as the test fixture for both
+SDK-only validation (Path A below) and Studio Web full-stack validation
+(Path B).
+
+## What's here
+
+```
+classifier_demo/
+├── main.py                       # 3-class keyword classifier
+├── uipath.json
+├── pyproject.toml
+├── bindings.json
+└── evaluations/
+    ├── eval-sets/
+    │   └── main.json             # 9 datapoints, 3 per class, some intentionally wrong
+    └── evaluators/
+        ├── intent_match.json     # per-datapoint ExactMatch on agent_output.intent
+        └── intent_classifier.json # the new uipath-classifier (pure metadata)
+```
+
+The eval set is wired so that for every datapoint both evaluators run:
+- `intent_match` produces a 1.0/0.0 score with `{"expected": "...", "actual": "..."}` justification.
+- `intent_classifier` produces a sentinel 0.0 score with `{"classes": [...], "source_evaluator": "intent_match"}` justification.
+
+Downstream (the C# layer in Studio Web) reads both to compute precision /
+recall / F-score across the dataset.
+
+> Heads-up — every datapoint must have an entry for the classifier in
+> `evaluationCriterias` (even an empty `{}`). The runtime currently skips
+> evaluators that aren't keyed in `evaluationCriterias` for a datapoint, so
+> omitting them silently drops the classifier results.
+
+## Path A — SDK only (real run, ~30 seconds)
+
+```bash
+cd packages/uipath
+uv sync --all-extras
+
+cd samples/classifier_demo
+uv run --project ../.. uipath eval main main.json --no-report --output-file /tmp/out.json
+```
+
+Expected: a results table with two columns (`intent_classifier`, `intent_match`).
+`intent_match` averages to ≈ 0.667 (6/9 correct). `intent_classifier` shows 0.0 per
+row by design — its real work is to ship the classes list to the backend.
+
+To see the metadata payload that lands in the backend's
+`CodedEvaluatorScore.Justification`:
+
+```bash
+python3 -c "
+import json
+with open('/tmp/out.json') as f: d = json.load(f)
+for r in d['evaluationSetResults'][0]['evaluationRunResults']:
+    print(r['evaluatorName'], r['result'].get('details'))
+"
+```
+
+You should see something like:
+
+```
+intent_classifier  {'expected': '', 'actual': '', 'classes': ['book', 'cancel', 'reschedule'], 'source_evaluator': 'intent_match'}
+intent_match       {'expected': 'book', 'actual': 'book'}
+```
+
+## Path B — Full Studio Web stack (real UI, click Run, see panel)
+
+Not yet validated end-to-end: this path needs a local environment that was not
+available when this sample was written. The pieces:
+
+### Prereqs (per `Agents/LOCAL_DEVELOPMENT.md`)
+- Docker installed and running
+- `make` available
+- Azure CLI authenticated session (`az login`)
+- Azure DevOps PAT exported as `AZURE_DEVOPS_PAT`
+- GitHub NPM registry token exported as `GH_NPM_REGISTRY_TOKEN`
+- Azure access token exported as `AZURE_ACCESS_TOKEN` (for the python worker build)
+- `cloud-provider-kind` binary (used for the local KinD cluster)
+
+### Steps
+
+1. **Point python-eval-worker at the local SDK branch.** The published
+   `uipath` package on PyPI doesn't yet have `ClassifierEvaluator`. Edit
+   `Agents/python-eval-worker/pyproject.toml`:
+
+   ```toml
+   [tool.uv.sources]
+   uipath = { path = "../../uipath-python/packages/uipath", editable = true }
+   ```
+
+   Then `cd python-eval-worker && uv lock && uv sync`.
+
+2. **Bring up the local KinD cluster** (from `Agents/`):
+   ```bash
+   make create-kind-cluster
+   kubectl get nodes
+   sudo ./bin/cloud-provider-kind &      # in a separate shell or background
+   make up
+   make deploy
+   ```
+
+3. **Build the backend with the classifier changes:**
+   ```bash
+   git checkout feat/eval-classifier-backend       # in Agents repo
+   # Re-trigger the helm/skaffold deploy for the backend
+   make deploy
+   ```
+
+4. **Build the frontend with the UI changes:**
+   ```bash
+   git checkout feat/eval-dataset-evaluators-ui    # in Agents repo
+   make deploy                                     # same deploy command rebuilds the frontend image
+   ```
+
+5. **Open Studio Web** (URL surfaced by the deploy output), create an agent
+   project, upload the eval-set + evaluator JSONs from this directory (or
+   author them in the UI — the picker now shows a "Classifier" entry under
+   the AGGREGATION section), and click Run.
+
+6. **Verify** the Aggregations panel renders between the run header and the
+   datapoint table, with the confusion matrix matching what Path A's Python
+   shim computes (macro F1 ≈ 0.667 on this fixture).
+
+### Open questions for the team owning local dev
+
+- Does the existing PAT / token set get refreshed automatically by the dev tooling, or do contributors need to rotate them periodically?
+- Is there a simpler "local-only" path that bypasses the KinD cluster (e.g. docker-compose) for changes that don't touch K8s manifests?
+- What's the standard pattern for pointing the python worker at a non-PyPI uipath build? The `[tool.uv.sources]` override above is the standard uv path — confirm there's no Helm/skaffold complication.
+
+## Companion PRs
+
+| Repo | Branch | PR | What |
+|---|---|---|---|
+| uipath-python | `feat/eval-classifier-evaluator` | [#1674](https://github.com/UiPath/uipath-python/pull/1674) | SDK `ClassifierEvaluator` |
+| Agents | `feat/eval-classifier-backend` | [#5313](https://github.com/UiPath/Agents/pull/5313) | C# math + activity + envelope storage |
+| Agents | `feat/eval-dataset-evaluators-ui` | [#5306](https://github.com/UiPath/Agents/pull/5306) | Frontend picker + Aggregations panel |
diff --git a/packages/uipath/samples/classifier_demo/bindings.json b/packages/uipath/samples/classifier_demo/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json
new file mode 100644
index 000000000..117e9e240
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json
@@ -0,0 +1,173 @@
+{
+  "version": "1.0",
+  "id": "classifier-demo-eval-set",
+  "name": "Classifier demo eval set",
+  "evaluatorRefs": [
+    "intent_match",
+    "intent_classifier"
+  ],
+  "evaluations": [
+    {
+      "id": "book-1",
+      "name": "book \u2014 straightforward",
+      "inputs": {
+        "utterance": "I want to book a table for two"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "book-2",
+      "name": "book \u2014 schedule keyword",
+      "inputs": {
+        "utterance": "Please schedule an appointment"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "book-3",
+      "name": "book \u2014 agent misclassifies (utterance triggers cancel keyword)",
+      "inputs": {
+        "utterance": "I had to cancel my last attempt but I want to reserve a slot now"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-1",
+      "name": "cancel \u2014 straightforward",
+      "inputs": {
+        "utterance": "Please cancel my reservation"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-2",
+      "name": "cancel \u2014 void synonym",
+      "inputs": {
+        "utterance": "I want to void the order"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-3",
+      "name": "cancel \u2014 agent misclassifies (utterance has 'move' which triggers reschedule)",
+      "inputs": {
+        "utterance": "I need to move past this and cancel everything"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-1",
+      "name": "reschedule \u2014 straightforward",
+      "inputs": {
+        "utterance": "I want to reschedule the meeting"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-2",
+      "name": "reschedule \u2014 move synonym",
+      "inputs": {
+        "utterance": "Can we move the slot to tomorrow"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-3",
+      "name": "reschedule \u2014 agent misclassifies (falls through to default 'book')",
+      "inputs": {
+        "utterance": "Different timing please"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json
new file mode 100644
index 000000000..ace8cb712
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json
@@ -0,0 +1,11 @@
+{
+  "version": "1.0",
+  "id": "intent_classifier",
+  "description": "Classification aggregator. Pure metadata — carries the classes list + source evaluator name to downstream consumers (the C# backend computes precision/recall/F-score over the dataset). Per-datapoint result is a no-op carrying the metadata.",
+  "evaluatorTypeId": "uipath-classifier",
+  "evaluatorConfig": {
+    "name": "intent_classifier",
+    "classes": ["book", "cancel", "reschedule"],
+    "sourceEvaluator": "intent_match"
+  }
+}
diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json
new file mode 100644
index 000000000..552c7220f
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json
@@ -0,0 +1,15 @@
+{
+  "version": "1.0",
+  "id": "intent_match",
+  "description": "Per-datapoint ExactMatch on the agent's `intent` output. Produces expected/actual justification that the ClassifierEvaluator pipeline reads.",
+  "evaluatorTypeId": "uipath-exact-match",
+  "evaluatorConfig": {
+    "name": "intent_match",
+    "targetOutputKey": "intent",
+    "caseSensitive": false,
+    "negated": false,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": "book"
+    }
+  }
+}
diff --git a/packages/uipath/samples/classifier_demo/main.py b/packages/uipath/samples/classifier_demo/main.py
new file mode 100644
index 000000000..b6e1eea48
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/main.py
@@ -0,0 +1,42 @@
+"""Tiny intent-classification agent for the ClassifierEvaluator demo.
+
+Given an utterance, returns the intent label. Whole-word keyword match:
+  - book        (a word equal to "book" / "reserve" / "schedule"; also the fallback)
+  - cancel      (a word equal to "cancel" / "void"; checked before book)
+  - reschedule  (a word equal to "reschedule" / "move"; checked first)
+
+A few datapoints are deliberately misclassified so the run-level
+classification metrics (precision/recall/F-score) come out non-trivially.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class IntentInput:
+    utterance: str
+
+
+@dataclass
+class IntentOutput:
+    intent: str
+
+
+BOOK_KEYWORDS = {"book", "reserve", "schedule"}
+CANCEL_KEYWORDS = {"cancel", "void"}
+RESCHEDULE_KEYWORDS = {"reschedule", "move"}
+
+
+async def main(input: IntentInput) -> IntentOutput:
+    """Map an utterance to one of the book / cancel / reschedule intents."""
+    words = set(input.utterance.lower().split())
+
+    # The first keyword set sharing a word with the utterance wins, in this order.
+    ranked = (
+        ("reschedule", RESCHEDULE_KEYWORDS),
+        ("cancel", CANCEL_KEYWORDS),
+        ("book", BOOK_KEYWORDS),
+    )
+    hit = next((label for label, keys in ranked if words & keys), "book")
+    # Fallback is "book" — deliberately wrong-ish so the matrix is interesting.
+    return IntentOutput(intent=hit)
diff --git a/packages/uipath/samples/classifier_demo/pyproject.toml b/packages/uipath/samples/classifier_demo/pyproject.toml
new file mode 100644
index 000000000..307e3778c
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "classifier-demo"
+version = "0.0.1"
+description = "Tiny intent-classification agent that exercises the new ClassifierEvaluator end-to-end via `uipath eval`."
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/classifier_demo/uipath.json b/packages/uipath/samples/classifier_demo/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/src/uipath/eval/evaluators/__init__.py b/packages/uipath/src/uipath/eval/evaluators/__init__.py
index 03a4bf63b..c68271b11 100644
--- a/packages/uipath/src/uipath/eval/evaluators/__init__.py
+++ b/packages/uipath/src/uipath/eval/evaluators/__init__.py
@@ -9,6 +9,7 @@
     BaseEvaluatorConfig,
     BaseEvaluatorJustification,
 )
+from ._aggregators import AggregatorSpec, ClassificationAggregatorSpec
 from .base_legacy_evaluator import BaseLegacyEvaluator
 from .binary_classification_evaluator import BinaryClassificationEvaluator
 
@@ -71,6 +72,9 @@
     "BinaryClassificationEvaluator",
     "MulticlassClassificationEvaluator",
     "ContainsEvaluator",
+    # Aggregator specs (config metadata attached to per-datapoint evaluators)
+    "AggregatorSpec",
+    "ClassificationAggregatorSpec",
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
     "BaseLLMOutputEvaluator",
diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregators.py b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py
new file mode 100644
index 000000000..968974546
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py
@@ -0,0 +1,35 @@
+"""Aggregator specs attached to per-datapoint evaluator configs.
+
+An aggregator is run-level — it consumes the per-datapoint results of an
+evaluator after the eval set finishes. The aggregator itself does not run in
+the Python runtime; this module just defines the config shape so the downstream
+consumer (the C# backend) can pick it up via the evaluator's stored config.
+
+Today the only aggregator is `classification`, which compares each datapoint's
+expected vs. predicted class to build a confusion matrix and precision/recall/
+F-score metrics.
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+from pydantic.alias_generators import to_camel
+
+
+class ClassificationAggregatorSpec(BaseModel):
+    """Configuration for a classification aggregator.
+
+    Attached to a per-datapoint evaluator (e.g. ExactMatch) to mark that the
+    evaluator's results should be aggregated into classification metrics. The
+    classes list defines the exhaustive label space; the C# layer scans each
+    datapoint's expected output for the first class that matches.
+    """
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    name: Literal["classification"] = "classification"
+    classes: list[str]
+
+
+# Supported aggregator specs — a single variant today; widen to a Union when adding more.
+AggregatorSpec = ClassificationAggregatorSpec
diff --git a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
index 0f1b3e8e8..398afc408 100644
--- a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
@@ -1,11 +1,14 @@
 """Exact match evaluator for agent outputs."""
 
+from typing import Optional
+
 from ..models import (
     AgentExecution,
     EvaluationResult,
     EvaluatorType,
     NumericEvaluationResult,
 )
+from ._aggregators import AggregatorSpec
 from .base_evaluator import BaseEvaluatorJustification
 from .output_evaluator import (
     OutputEvaluationCriteria,
@@ -15,16 +18,38 @@
 
 
 class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
-    """Configuration for the exact match evaluator."""
+    """Configuration for the exact match evaluator.
+
+    The optional `aggregators` field attaches run-level aggregators (e.g. a
+    classification aggregator with a fixed class set) that the downstream
+    backend will compute after the eval set finishes. The Python runtime
+    itself only forwards `aggregators` into the per-datapoint justification
+    so the C# layer can pick it up; no per-datapoint math happens here.
+    """
 
     name: str = "ExactMatchEvaluator"
     case_sensitive: bool = False
     negated: bool = False
+    aggregators: Optional[list[AggregatorSpec]] = None
+
+
+class ExactMatchJustification(BaseEvaluatorJustification):
+    """ExactMatch's per-datapoint justification.
+
+    Carries the standard `expected` / `actual` plus the run-level
+    `aggregators` config inlined per datapoint. The aggregators value is
+    identical across datapoints — it's repeated only so the downstream
+    consumer (the C# post-pass) can discover aggregator configuration from
+    per-datapoint records without needing access to evaluator snapshots.
+    Omitted entirely when no aggregators are configured.
+    """
+
+    aggregators: Optional[list[AggregatorSpec]] = None
 
 
 class ExactMatchEvaluator(
     OutputEvaluator[
-        OutputEvaluationCriteria, ExactMatchEvaluatorConfig, BaseEvaluatorJustification
+        OutputEvaluationCriteria, ExactMatchEvaluatorConfig, ExactMatchJustification
     ]
 ):
     """Evaluator that performs exact structural matching between expected and actual outputs.
@@ -46,15 +71,11 @@ async def evaluate(
     ) -> EvaluationResult:
         """Evaluate whether actual output exactly matches expected output.
 
-        Args:
-            agent_execution: The execution details containing:
-                - agent_input: The input received by the agent
-                - agent_output: The actual output from the agent
-                - agent_trace: The execution spans to use for the evaluation
-            evaluation_criteria: The criteria to evaluate
-
         Returns:
-            EvaluationResult: Boolean result indicating exact match (True/False)
+            EvaluationResult: Boolean result indicating exact match (True/False).
+            The justification embeds the configured `aggregators` list so the
+            downstream C# post-pass can discover aggregator configuration
+            per datapoint.
         """
         actual_output = self._get_actual_output(agent_execution)
         expected_output = self._get_expected_output(evaluation_criteria)
@@ -72,12 +93,19 @@ async def evaluate(
         if self.evaluator_config.negated:
             is_exact_match = not is_exact_match
 
-        validated_justification = self.validate_justification(
-            {
-                "expected": str(expected_output),
-                "actual": str(actual_output),
-            }
-        )
+        justification_payload: dict[str, object] = {
+            "expected": str(expected_output),
+            "actual": str(actual_output),
+        }
+        if self.evaluator_config.aggregators:
+            # Pydantic models serialize via their parent BaseModel; embed as dicts
+            # so the wire shape is JSON-friendly and readable from C#.
+            justification_payload["aggregators"] = [
+                spec.model_dump(by_alias=True) if hasattr(spec, "model_dump") else spec
+                for spec in self.evaluator_config.aggregators
+            ]
+
+        validated_justification = self.validate_justification(justification_payload)
         return NumericEvaluationResult(
             score=float(is_exact_match),
             details=validated_justification,
diff --git a/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py
index 42ffae047..9a5ef0c93 100644
--- a/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py
@@ -1,9 +1,15 @@
 """Exact match evaluator for binary pass/fail evaluation of agent outputs."""
 
+import json
+from typing import Optional
+
+from pydantic import Field
+
 from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
 
 from .._helpers.output_path import resolve_output_path
 from ..models.models import AgentExecution
+from ._aggregators import AggregatorSpec
 from .base_legacy_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig
 from .legacy_deterministic_evaluator_base import BaseLegacyDeterministicEvaluator
 
@@ -24,6 +30,13 @@ class LegacyExactMatchEvaluator(
     to floats for consistent comparison.
     """
 
+    # Optional run-level aggregator config (e.g. a classification aggregator with a
+    # fixed class set). The evaluator does no per-datapoint aggregation; it only
+    # forwards this config into the per-datapoint justification so the downstream
+    # C# post-pass can build a confusion matrix + P/R/F1 across the dataset.
+    # Deserialized from the legacy evaluator JSON's top-level `aggregators` key.
+    aggregators: Optional[list[AggregatorSpec]] = Field(default=None, alias="aggregators")
+
     async def evaluate(
         self,
         agent_execution: AgentExecution,
@@ -39,7 +52,10 @@ async def evaluate(
             evaluation_criteria: The criteria to evaluate
 
         Returns:
-            EvaluationResult: Boolean result indicating exact match (True/False)
+            EvaluationResult: Boolean result. When `aggregators` is configured, the
+            result's `details` carries a JSON string of {expected, actual, aggregators}
+            so the C# post-pass can discover aggregator config and the expected label
+            per datapoint.
         """
         actual_output = agent_execution.agent_output
         expected_output = evaluation_criteria.expected_output
@@ -66,7 +82,23 @@ async def evaluate(
                 if not actual_resolved or not expected_resolved:
                     actual_output = expected_output = {}
 
-        return BooleanEvaluationResult(
-            score=self._canonical_json(actual_output)
-            == self._canonical_json(expected_output)
+        is_match = self._canonical_json(actual_output) == self._canonical_json(
+            expected_output
         )
+
+        # Legacy evaluators use a `str` justification (generic J = str). Emit a JSON
+        # string directly — _serialize_justification passes strings through unchanged,
+        # so this lands verbatim in EvalScore.Justification on the C# side.
+        details: Optional[str] = None
+        if self.aggregators:
+            details = json.dumps(
+                {
+                    "expected": str(expected_output),
+                    "actual": str(actual_output),
+                    "aggregators": [
+                        spec.model_dump(by_alias=True) for spec in self.aggregators
+                    ],
+                }
+            )
+
+        return BooleanEvaluationResult(score=is_match, details=details)
diff --git a/packages/uipath/tests/evaluators/test_classifier_evaluator.py b/packages/uipath/tests/evaluators/test_classifier_evaluator.py
new file mode 100644
index 000000000..182a2dac5
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_classifier_evaluator.py
@@ -0,0 +1,179 @@
+"""Tests for the pure-metadata ClassifierEvaluator.
+
+This evaluator carries a `classes` list to downstream consumers (the C# layer
+in Studio Web). Its per-datapoint evaluate is a no-op that emits the classes
+list as a justification payload. The tests below pin that contract.
+"""
+
+import json
+
+import pytest
+
+from uipath.eval.evaluators import (
+    ClassifierEvaluator,
+    ClassifierJustification,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluationCriteria
+from uipath.eval.evaluators.evaluator_factory import EvaluatorFactory
+from uipath.eval.models import AgentExecution, EvaluatorType, NumericEvaluationResult
+from uipath.eval.models.models import UiPathEvaluationError
+
+
+def _build_evaluator(
+    classes: list[str] | None = None, source_evaluator: str = "intent_match"
+) -> ClassifierEvaluator:
+    # Construct via the factory to match how real eval-set runs build evaluators.
+    data = {
+        "version": "1.0",
+        "id": "intent_classifier",
+        "name": "intent_classifier",
+        "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+        "evaluatorConfig": {
+            "name": "intent_classifier",
+            "classes": classes
+            if classes is not None
+            else ["book", "cancel", "reschedule"],
+            "sourceEvaluator": source_evaluator,
+        },
+    }
+    evaluator = EvaluatorFactory.create_evaluator(data)
+    assert isinstance(evaluator, ClassifierEvaluator)
+    return evaluator
+
+
+def _agent_execution(output: dict[str, str] | str | None = None) -> AgentExecution:
+    return AgentExecution(
+        agent_input={"text": "hello"},
+        agent_output=output if output is not None else {"intent": "book"},
+        agent_trace=[],
+    )
+
+
+class TestClassifierEvaluator:
+    async def test_evaluate_returns_zero_score_with_classifier_justification(
+        self,
+    ) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.0
+        assert isinstance(result.details, ClassifierJustification)
+        assert result.details.classes == ["book", "cancel", "reschedule"]
+        assert result.details.source_evaluator == "intent_match"
+        # expected / actual are not meaningful for this evaluator
+        assert result.details.expected == ""
+        assert result.details.actual == ""
+
+    async def test_classes_list_is_independent_copy(self) -> None:
+        # If a caller mutates the result's classes list, it shouldn't leak into the config.
+        evaluator = _build_evaluator(classes=["a", "b"])
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+        result.details.classes.append("c")
+        assert evaluator.evaluator_config.classes == ["a", "b"]
+
+    async def test_score_is_zero_regardless_of_agent_output(self) -> None:
+        evaluator = _build_evaluator()
+        for output in (
+            None,  # NOTE: _agent_execution replaces None with {"intent": "book"}
+            {},
+            {"intent": "book"},
+            {"intent": "totally-unrelated"},
+            "free text output mentioning cancel",
+        ):
+            result = await evaluator.evaluate(
+                _agent_execution(output), BaseEvaluationCriteria()
+            )
+            assert result.score == 0.0
+
+    async def test_evaluate_does_not_error_on_missing_criteria(self) -> None:
+        # The runtime's validate_and_evaluate_criteria falls back to
+        # default_evaluation_criteria when None is passed. Confirm the config's
+        # default_evaluation_criteria covers that case.
+        evaluator = _build_evaluator()
+        result = await evaluator.validate_and_evaluate_criteria(
+            _agent_execution(), None
+        )
+        assert result.score == 0.0
+        assert isinstance(result.details, ClassifierJustification)
+        assert result.details.classes == ["book", "cancel", "reschedule"]
+
+
+class TestClassifierJustificationWireShape:
+    """Pin the JSON shape that flows from CLI → C# via _serialize_justification."""
+
+    async def test_model_dump_carries_all_config_metadata(self) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+
+        dumped = result.details.model_dump()
+        # The CLI ships this via json.dumps(model_dump()) — the resulting string
+        # is what lands in CodedEvaluatorScore.Justification in the backend.
+        wire = json.loads(json.dumps(dumped))
+        assert wire["classes"] == ["book", "cancel", "reschedule"]
+        assert wire["source_evaluator"] == "intent_match"
+        assert wire["expected"] == ""
+        assert wire["actual"] == ""
+
+    async def test_wire_payload_can_be_round_tripped_back_to_model(self) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+
+        wire_string = json.dumps(result.details.model_dump())
+        parsed = ClassifierJustification.model_validate_json(wire_string)
+        assert parsed.classes == ["book", "cancel", "reschedule"]
+        assert parsed.source_evaluator == "intent_match"
+
+
+class TestFactoryIntegration:
+    def test_factory_builds_classifier_from_v1_config(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "classes": ["book", "cancel", "reschedule"],
+                "sourceEvaluator": "intent_match",
+            },
+        }
+        evaluator = EvaluatorFactory.create_evaluator(data)
+        assert isinstance(evaluator, ClassifierEvaluator)
+        assert evaluator.evaluator_config.classes == ["book", "cancel", "reschedule"]
+        assert evaluator.evaluator_config.source_evaluator == "intent_match"
+        assert evaluator.id == "intent_classifier"
+
+    def test_factory_accepts_snake_case_aliases(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "classes": ["yes", "no"],
+                "source_evaluator": "yes_no_match",
+            },
+        }
+        evaluator = EvaluatorFactory.create_evaluator(data)
+        assert isinstance(evaluator, ClassifierEvaluator)
+        assert evaluator.evaluator_config.source_evaluator == "yes_no_match"
+
+    def test_factory_rejects_config_missing_classes(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "sourceEvaluator": "intent_match",
+                # classes missing
+            },
+        }
+        with pytest.raises(UiPathEvaluationError):
+            EvaluatorFactory.create_evaluator(data)
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 72e9fde23..4dfa4bf8d 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.10.70"
+version = "2.10.72"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },