diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 8e9c9f581..81c597d68 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.10.70" +version = "2.10.72" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/samples/classifier_demo/README.md b/packages/uipath/samples/classifier_demo/README.md new file mode 100644 index 000000000..638765ab1 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/README.md @@ -0,0 +1,139 @@ +# Classifier evaluator end-to-end demo + +A minimal intent-classification agent that exercises the new +`ClassifierEvaluator` end-to-end. Use this as the test fixture for both +SDK-only validation (Path A below) and Studio Web full-stack validation +(Path B). + +## What's here + +``` +classifier_demo/ +├── main.py # 3-class keyword classifier +├── uipath.json +├── pyproject.toml +├── bindings.json +└── evaluations/ + ├── eval-sets/ + │ └── main.json # 9 datapoints, 3 per class, some intentionally wrong + └── evaluators/ + ├── intent_match.json # per-datapoint ExactMatch on agent_output.intent + └── intent_classifier.json # the new uipath-classifier (pure metadata) +``` + +The eval set is wired so that for every datapoint both evaluators run: +- `intent_match` produces a 1.0/0.0 score with `{"expected": "...", "actual": "..."}` justification. +- `intent_classifier` produces a sentinel 0.0 score with `{"classes": [...], "source_evaluator": "intent_match"}` justification. + +Downstream (the C# layer in Studio Web) reads both to compute precision / +recall / F-score across the dataset. + +> Heads-up — every datapoint must have an entry for the classifier in +> `evaluationCriterias` (even an empty `{}`). 
The runtime currently skips +> evaluators that aren't keyed in `evaluationCriterias` for a datapoint, so +> omitting them silently drops the classifier results. + +## Path A — SDK only (real run, ~30 seconds) + +```bash +cd packages/uipath +uv sync --all-extras + +cd samples/classifier_demo +uv run --project ../.. uipath eval main main.json --no-report --output-file /tmp/out.json +``` + +Expected: a results table with two columns (`intent_classifier`, `intent_match`). +`intent_match` averages to ≈0.667 (6/9 correct). `intent_classifier` shows 0.0 per +row by design — its real work is to ship the classes list to the backend. + +To see the metadata payload that lands in the backend's +`CodedEvaluatorScore.Justification`: + +```bash +python3 -c " +import json +with open('/tmp/out.json') as f: d = json.load(f) +for r in d['evaluationSetResults'][0]['evaluationRunResults']: + print(r['evaluatorName'], r['result'].get('details')) +" +``` + +You should see something like: + +``` +intent_classifier {'expected': '', 'actual': '', 'classes': ['book', 'cancel', 'reschedule'], 'source_evaluator': 'intent_match'} +intent_match {'expected': 'book', 'actual': 'book'} +``` + +## Path B — Full Studio Web stack (real UI, click Run, see panel) + +Not yet validated end-to-end: this path needs a full local environment that was +not available when this demo was written. The pieces: + +### Prereqs (per `Agents/LOCAL_DEVELOPMENT.md`) +- Docker installed and running +- `make` available +- Azure CLI authenticated session (`az login`) +- Azure DevOps PAT exported as `AZURE_DEVOPS_PAT` +- GitHub NPM registry token exported as `GH_NPM_REGISTRY_TOKEN` +- Azure access token exported as `AZURE_ACCESS_TOKEN` (for the python worker build) +- `cloud-provider-kind` binary (used for the local KinD cluster) + +### Steps + +1. **Point python-eval-worker at the local SDK branch.** The published + `uipath` package on PyPI doesn't yet have `ClassifierEvaluator`. 
Edit + `Agents/python-eval-worker/pyproject.toml`: + + ```toml + [tool.uv.sources] + uipath = { path = "../../uipath-python/packages/uipath", editable = true } + ``` + + Then `cd python-eval-worker && uv lock && uv sync`. + +2. **Bring up the local KinD cluster** (from `Agents/`): + ```bash + make create-kind-cluster + kubectl get nodes + sudo ./bin/cloud-provider-kind & # in a separate shell or background + make up + make deploy + ``` + +3. **Build the backend with the classifier changes:** + ```bash + git checkout feat/eval-classifier-backend # in Agents repo + # Re-trigger the helm/skaffold deploy for the backend + make deploy + ``` + +4. **Build the frontend with the UI changes:** + ```bash + git checkout feat/eval-dataset-evaluators-ui # in Agents repo + # Same deploy command rebuilds frontend image + ``` + +5. **Open Studio Web** (URL surfaced by the deploy output), create an agent + project, upload the eval-set + evaluator JSONs from this directory (or + author them in the UI — the picker now shows a "Classifier" entry under + the AGGREGATION section), and click Run. + +6. **Verify** the Aggregations panel renders between the run header and the + datapoint table, with the confusion matrix matching what Path A's Python + shim computes (macro F1 ≈ 0.667 on this fixture). + +### Open questions for the team owning local dev + +- Does the existing PAT / token set get refreshed automatically by the dev tooling, or do contributors need to rotate them periodically? +- Is there a simpler "local-only" path that bypasses the KinD cluster (e.g. docker-compose) for changes that don't touch K8s manifests? +- What's the standard pattern for pointing the python worker at a non-PyPI uipath build? The `[tool.uv.sources]` override above is the standard uv path — confirm there's no Helm/skaffold complication. 
+ +## Companion PRs + +| Repo | Branch | PR | What | +|---|---|---|---| +| uipath-python | `feat/eval-classifier-evaluator` | [#1674](https://github.com/UiPath/uipath-python/pull/1674) | SDK `ClassifierEvaluator` | +| Agents | `feat/eval-classifier-backend` | [#5313](https://github.com/UiPath/Agents/pull/5313) | C# math + activity + envelope storage | +| Agents | `feat/eval-dataset-evaluators-ui` | [#5306](https://github.com/UiPath/Agents/pull/5306) | Frontend picker + Aggregations panel | diff --git a/packages/uipath/samples/classifier_demo/bindings.json b/packages/uipath/samples/classifier_demo/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json new file mode 100644 index 000000000..117e9e240 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json @@ -0,0 +1,173 @@ +{ + "version": "1.0", + "id": "classifier-demo-eval-set", + "name": "Classifier demo eval set", + "evaluatorRefs": [ + "intent_match", + "intent_classifier" + ], + "evaluations": [ + { + "id": "book-1", + "name": "book \u2014 straightforward", + "inputs": { + "utterance": "I want to book a table for two" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "book-2", + "name": "book \u2014 schedule keyword", + "inputs": { + "utterance": "Please schedule an appointment" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "book-3", + "name": "book \u2014 agent misclassifies (utterance 
triggers cancel keyword)", + "inputs": { + "utterance": "I had to cancel my last attempt but I want to reserve a slot now" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-1", + "name": "cancel \u2014 straightforward", + "inputs": { + "utterance": "Please cancel my reservation" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-2", + "name": "cancel \u2014 void synonym", + "inputs": { + "utterance": "I want to void the order" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-3", + "name": "cancel \u2014 agent misclassifies (utterance has 'move' which triggers reschedule)", + "inputs": { + "utterance": "I need to move past this and cancel everything" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "reschedule-1", + "name": "reschedule \u2014 straightforward", + "inputs": { + "utterance": "I want to reschedule the meeting" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + }, + { + "id": "reschedule-2", + "name": "reschedule \u2014 move synonym", + "inputs": { + "utterance": "Can we move the slot to tomorrow" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + }, + { + "id": 
"reschedule-3", + "name": "reschedule \u2014 agent misclassifies (falls through to default 'book')", + "inputs": { + "utterance": "Different timing please" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + } + ] +} \ No newline at end of file diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json new file mode 100644 index 000000000..ace8cb712 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json @@ -0,0 +1,11 @@ +{ + "version": "1.0", + "id": "intent_classifier", + "description": "Classification aggregator. Pure metadata — carries the classes list + source evaluator name to downstream consumers (the C# backend computes precision/recall/F-score over the dataset). Per-datapoint result is a no-op carrying the metadata.", + "evaluatorTypeId": "uipath-classifier", + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["book", "cancel", "reschedule"], + "sourceEvaluator": "intent_match" + } +} diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json new file mode 100644 index 000000000..552c7220f --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "intent_match", + "description": "Per-datapoint ExactMatch on the agent's `intent` output. 
Produces expected/actual justification that the ClassifierEvaluator pipeline reads.", + "evaluatorTypeId": "uipath-exact-match", + "evaluatorConfig": { + "name": "intent_match", + "targetOutputKey": "intent", + "caseSensitive": false, + "negated": false, + "defaultEvaluationCriteria": { + "expectedOutput": "book" + } + } +} diff --git a/packages/uipath/samples/classifier_demo/main.py b/packages/uipath/samples/classifier_demo/main.py new file mode 100644 index 000000000..b6e1eea48 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/main.py @@ -0,0 +1,42 @@ +"""Tiny intent-classification agent for the ClassifierEvaluator demo. + +Given an utterance, returns the intent label. Three intents: + - book (a whole token "book" / "reserve" / "schedule"; checked last, also the fallback) + - cancel (a whole token "cancel" / "void"; checked second) + - reschedule (a whole token "reschedule" / "move"; checked first) + +A few datapoints are deliberately misclassified so the run-level +classification metrics (precision/recall/F-score) come out non-trivially. +""" + +from dataclasses import dataclass + + +@dataclass +class IntentInput: + utterance: str + + +@dataclass +class IntentOutput: + intent: str + + +BOOK_KEYWORDS = {"book", "reserve", "schedule"} +CANCEL_KEYWORDS = {"cancel", "void"} +RESCHEDULE_KEYWORDS = {"reschedule", "move"} + + +async def main(input: IntentInput) -> IntentOutput: + """Classify the utterance into book / cancel / reschedule.""" + text = input.utterance.lower() + tokens = set(text.split()) + + if tokens & RESCHEDULE_KEYWORDS: + return IntentOutput(intent="reschedule") + if tokens & CANCEL_KEYWORDS: + return IntentOutput(intent="cancel") + if tokens & BOOK_KEYWORDS: + return IntentOutput(intent="book") + # Fallback to "book" — deliberately wrong-ish so the matrix is interesting. 
+ return IntentOutput(intent="book") diff --git a/packages/uipath/samples/classifier_demo/pyproject.toml b/packages/uipath/samples/classifier_demo/pyproject.toml new file mode 100644 index 000000000..307e3778c --- /dev/null +++ b/packages/uipath/samples/classifier_demo/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "classifier-demo" +version = "0.0.1" +description = "Tiny intent-classification agent that exercises the new ClassifierEvaluator end-to-end via `uipath eval`." +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/classifier_demo/uipath.json b/packages/uipath/samples/classifier_demo/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git a/packages/uipath/src/uipath/eval/evaluators/__init__.py b/packages/uipath/src/uipath/eval/evaluators/__init__.py index 03a4bf63b..c68271b11 100644 --- a/packages/uipath/src/uipath/eval/evaluators/__init__.py +++ b/packages/uipath/src/uipath/eval/evaluators/__init__.py @@ -9,6 +9,7 @@ BaseEvaluatorConfig, BaseEvaluatorJustification, ) +from ._aggregators import AggregatorSpec, ClassificationAggregatorSpec from .base_legacy_evaluator import BaseLegacyEvaluator from .binary_classification_evaluator import BinaryClassificationEvaluator @@ -71,6 +72,9 @@ "BinaryClassificationEvaluator", "MulticlassClassificationEvaluator", "ContainsEvaluator", + # Aggregator specs (config metadata attached to per-datapoint evaluators) + "AggregatorSpec", + "ClassificationAggregatorSpec", "ExactMatchEvaluator", "JsonSimilarityEvaluator", "BaseLLMOutputEvaluator", diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregators.py b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py new file mode 100644 index 000000000..968974546 --- /dev/null +++ 
b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py @@ -0,0 +1,35 @@ +"""Aggregator specs attached to per-datapoint evaluator configs. + +An aggregator is run-level — it consumes the per-datapoint results of an +evaluator after the eval set finishes. The aggregator itself does not run in +the Python runtime; this module just defines the config shape so the downstream +consumer (the C# backend) can pick it up via the evaluator's stored config. + +Today the only aggregator is `classification`, which compares each datapoint's +expected vs. predicted class to build a confusion matrix and precision/recall/ +F-score metrics. +""" + +from typing import Literal + +from pydantic import BaseModel, ConfigDict +from pydantic.alias_generators import to_camel + + +class ClassificationAggregatorSpec(BaseModel): + """Configuration for a classification aggregator. + + Attached to a per-datapoint evaluator (e.g. ExactMatch) to mark that the + evaluator's results should be aggregated into classification metrics. The + classes list defines the exhaustive label space; the C# layer scans each + datapoint's expected output for the first class that matches. + """ + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + name: Literal["classification"] = "classification" + classes: list[str] + + +# Union of all supported aggregator specs. Add new variants here. 
+AggregatorSpec = ClassificationAggregatorSpec diff --git a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py index 0f1b3e8e8..398afc408 100644 --- a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -1,11 +1,14 @@ """Exact match evaluator for agent outputs.""" +from typing import Optional + from ..models import ( AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult, ) +from ._aggregators import AggregatorSpec from .base_evaluator import BaseEvaluatorJustification from .output_evaluator import ( OutputEvaluationCriteria, @@ -15,16 +18,38 @@ class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]): - """Configuration for the exact match evaluator.""" + """Configuration for the exact match evaluator. + + The optional `aggregators` field attaches run-level aggregators (e.g. a + classification aggregator with a fixed class set) that the downstream + backend will compute after the eval set finishes. The Python runtime + itself only forwards `aggregators` into the per-datapoint justification + so the C# layer can pick it up; no per-datapoint math happens here. + """ name: str = "ExactMatchEvaluator" case_sensitive: bool = False negated: bool = False + aggregators: Optional[list[AggregatorSpec]] = None + + +class ExactMatchJustification(BaseEvaluatorJustification): + """ExactMatch's per-datapoint justification. + + Carries the standard `expected` / `actual` plus the run-level + `aggregators` config inlined per datapoint. The aggregators value is + identical across datapoints — it's repeated only so the downstream + consumer (the C# post-pass) can discover aggregator configuration from + per-datapoint records without needing access to evaluator snapshots. + Omitted entirely when no aggregators are configured. 
+ """ + + aggregators: Optional[list[AggregatorSpec]] = None class ExactMatchEvaluator( OutputEvaluator[ - OutputEvaluationCriteria, ExactMatchEvaluatorConfig, BaseEvaluatorJustification + OutputEvaluationCriteria, ExactMatchEvaluatorConfig, ExactMatchJustification ] ): """Evaluator that performs exact structural matching between expected and actual outputs. @@ -46,15 +71,11 @@ async def evaluate( ) -> EvaluationResult: """Evaluate whether actual output exactly matches expected output. - Args: - agent_execution: The execution details containing: - - agent_input: The input received by the agent - - agent_output: The actual output from the agent - - agent_trace: The execution spans to use for the evaluation - evaluation_criteria: The criteria to evaluate - Returns: - EvaluationResult: Boolean result indicating exact match (True/False) + EvaluationResult: Boolean result indicating exact match (True/False). + The justification embeds the configured `aggregators` list so the + downstream C# post-pass can discover aggregator configuration + per datapoint. """ actual_output = self._get_actual_output(agent_execution) expected_output = self._get_expected_output(evaluation_criteria) @@ -72,12 +93,19 @@ async def evaluate( if self.evaluator_config.negated: is_exact_match = not is_exact_match - validated_justification = self.validate_justification( - { - "expected": str(expected_output), - "actual": str(actual_output), - } - ) + justification_payload: dict[str, object] = { + "expected": str(expected_output), + "actual": str(actual_output), + } + if self.evaluator_config.aggregators: + # Pydantic models serialize via their parent BaseModel; embed as dicts + # so the wire shape is JSON-friendly and readable from C#. 
+ justification_payload["aggregators"] = [ + spec.model_dump(by_alias=True) if hasattr(spec, "model_dump") else spec + for spec in self.evaluator_config.aggregators + ] + + validated_justification = self.validate_justification(justification_payload) return NumericEvaluationResult( score=float(is_exact_match), details=validated_justification, diff --git a/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py index 42ffae047..9a5ef0c93 100644 --- a/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py @@ -1,9 +1,15 @@ """Exact match evaluator for binary pass/fail evaluation of agent outputs.""" +import json +from typing import Optional + +from pydantic import Field + from uipath.eval.models import BooleanEvaluationResult, EvaluationResult from .._helpers.output_path import resolve_output_path from ..models.models import AgentExecution +from ._aggregators import AggregatorSpec from .base_legacy_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig from .legacy_deterministic_evaluator_base import BaseLegacyDeterministicEvaluator @@ -24,6 +30,13 @@ class LegacyExactMatchEvaluator( to floats for consistent comparison. """ + # Optional run-level aggregator config (e.g. a classification aggregator with a + # fixed class set). The evaluator does no per-datapoint aggregation; it only + # forwards this config into the per-datapoint justification so the downstream + # C# post-pass can build a confusion matrix + P/R/F1 across the dataset. + # Deserialized from the legacy evaluator JSON's top-level `aggregators` key. 
+ aggregators: Optional[list[AggregatorSpec]] = Field(default=None, alias="aggregators") + async def evaluate( self, agent_execution: AgentExecution, @@ -39,7 +52,10 @@ async def evaluate( evaluation_criteria: The criteria to evaluate Returns: - EvaluationResult: Boolean result indicating exact match (True/False) + EvaluationResult: Boolean result. When `aggregators` is configured, the + result's `details` carries a JSON string of {expected, actual, aggregators} + so the C# post-pass can discover aggregator config and the expected label + per datapoint. """ actual_output = agent_execution.agent_output expected_output = evaluation_criteria.expected_output @@ -66,7 +82,23 @@ async def evaluate( if not actual_resolved or not expected_resolved: actual_output = expected_output = {} - return BooleanEvaluationResult( - score=self._canonical_json(actual_output) - == self._canonical_json(expected_output) + is_match = self._canonical_json(actual_output) == self._canonical_json( + expected_output ) + + # Legacy evaluators use a `str` justification (generic J = str). Emit a JSON + # string directly — _serialize_justification passes strings through unchanged, + # so this lands verbatim in EvalScore.Justification on the C# side. + details: Optional[str] = None + if self.aggregators: + details = json.dumps( + { + "expected": str(expected_output), + "actual": str(actual_output), + "aggregators": [ + spec.model_dump(by_alias=True) for spec in self.aggregators + ], + } + ) + + return BooleanEvaluationResult(score=is_match, details=details) diff --git a/packages/uipath/tests/evaluators/test_classifier_evaluator.py b/packages/uipath/tests/evaluators/test_classifier_evaluator.py new file mode 100644 index 000000000..182a2dac5 --- /dev/null +++ b/packages/uipath/tests/evaluators/test_classifier_evaluator.py @@ -0,0 +1,179 @@ +"""Tests for the pure-metadata ClassifierEvaluator. + +This evaluator carries a `classes` list to downstream consumers (the C# layer +in Studio Web). 
Its per-datapoint evaluate is a no-op that emits the classes +list as a justification payload. The tests below pin that contract. +""" + +import json + +import pytest + +from uipath.eval.evaluators import ( + ClassifierEvaluator, + ClassifierJustification, +) +from uipath.eval.evaluators.base_evaluator import BaseEvaluationCriteria +from uipath.eval.evaluators.evaluator_factory import EvaluatorFactory +from uipath.eval.models import AgentExecution, EvaluatorType, NumericEvaluationResult +from uipath.eval.models.models import UiPathEvaluationError + + +def _build_evaluator( + classes: list[str] | None = None, source_evaluator: str = "intent_match" +) -> ClassifierEvaluator: + # Construct via the factory to match how real eval-set runs build evaluators. + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": classes + if classes is not None + else ["book", "cancel", "reschedule"], + "sourceEvaluator": source_evaluator, + }, + } + evaluator = EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + return evaluator + + +def _agent_execution(output: dict[str, str] | str | None = None) -> AgentExecution: + return AgentExecution( + agent_input={"text": "hello"}, + agent_output=output if output is not None else {"intent": "book"}, + agent_trace=[], + ) + + +class TestClassifierEvaluator: + async def test_evaluate_returns_zero_score_with_classifier_justification( + self, + ) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + assert isinstance(result.details, ClassifierJustification) + assert result.details.classes == ["book", "cancel", "reschedule"] + assert result.details.source_evaluator == "intent_match" + # expected / actual 
are not meaningful for this evaluator + assert result.details.expected == "" + assert result.details.actual == "" + + async def test_classes_list_is_independent_copy(self) -> None: + # If a caller mutates the result's classes list, it shouldn't leak into the config. + evaluator = _build_evaluator(classes=["a", "b"]) + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + result.details.classes.append("c") + assert evaluator.evaluator_config.classes == ["a", "b"] + + async def test_score_is_zero_regardless_of_agent_output(self) -> None: + evaluator = _build_evaluator() + for output in ( + None, + {}, + {"intent": "book"}, + {"intent": "totally-unrelated"}, + "free text output mentioning cancel", + ): + result = await evaluator.evaluate( + _agent_execution(output), BaseEvaluationCriteria() + ) + assert result.score == 0.0 + + async def test_evaluate_does_not_error_on_missing_criteria(self) -> None: + # The runtime's validate_and_evaluate_criteria falls back to + # default_evaluation_criteria when None is passed. Confirm the config's + # default_evaluation_criteria covers that case. 
+ evaluator = _build_evaluator() + result = await evaluator.validate_and_evaluate_criteria( + _agent_execution(), None + ) + assert result.score == 0.0 + assert isinstance(result.details, ClassifierJustification) + assert result.details.classes == ["book", "cancel", "reschedule"] + + +class TestClassifierJustificationWireShape: + """Pin the JSON shape that flows from CLI → C# via _serialize_justification.""" + + async def test_model_dump_carries_all_config_metadata(self) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + + dumped = result.details.model_dump() + # The CLI ships this via json.dumps(model_dump()) — the resulting string + # is what lands in CodedEvaluatorScore.Justification in the backend. + wire = json.loads(json.dumps(dumped)) + assert wire["classes"] == ["book", "cancel", "reschedule"] + assert wire["source_evaluator"] == "intent_match" + assert wire["expected"] == "" + assert wire["actual"] == "" + + async def test_wire_payload_can_be_round_tripped_back_to_model(self) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + + wire_string = json.dumps(result.details.model_dump()) + parsed = ClassifierJustification.model_validate_json(wire_string) + assert parsed.classes == ["book", "cancel", "reschedule"] + assert parsed.source_evaluator == "intent_match" + + +class TestFactoryIntegration: + def test_factory_builds_classifier_from_v1_config(self) -> None: + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["book", "cancel", "reschedule"], + "sourceEvaluator": "intent_match", + }, + } + evaluator = 
EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + assert evaluator.evaluator_config.classes == ["book", "cancel", "reschedule"] + assert evaluator.evaluator_config.source_evaluator == "intent_match" + assert evaluator.id == "intent_classifier" + + def test_factory_accepts_snake_case_aliases(self) -> None: + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["yes", "no"], + "source_evaluator": "yes_no_match", + }, + } + evaluator = EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + assert evaluator.evaluator_config.source_evaluator == "yes_no_match" + + def test_factory_rejects_config_missing_classes(self) -> None: + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "sourceEvaluator": "intent_match", + # classes missing + }, + } + with pytest.raises(UiPathEvaluationError): + EvaluatorFactory.create_evaluator(data) diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 72e9fde23..4dfa4bf8d 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.70" +version = "2.10.72" source = { editable = "." } dependencies = [ { name = "applicationinsights" },