Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 107 additions & 5 deletions packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from uipath.eval.models.evaluation_set import EvaluationItem
from uipath.eval.runtime.events import (
EvalRunCreatedEvent,
EvalRunStatusUpdateEvent,
EvalRunUpdatedEvent,
EvalSetRunCreatedEvent,
EvalSetRunUpdatedEvent,
Expand All @@ -47,6 +48,8 @@ class EvaluationStatus(IntEnum):
IN_PROGRESS = 1
COMPLETED = 2
FAILED = 3
WORKLOAD_EXECUTING = 4
WORKLOAD_FAILED = 5


class StudioWebProgressItem(BaseModel):
Expand Down Expand Up @@ -473,6 +476,7 @@ async def update_eval_run(
evaluators: dict[str, BaseEvaluator[Any, Any, Any]],
is_coded: bool = False,
spans: list[Any] | None = None,
workload_failed: bool = False,
):
"""Update an evaluation run with results."""
coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {}
Expand Down Expand Up @@ -512,6 +516,7 @@ async def update_eval_run(
actual_output=sw_progress_item.agent_output,
success=sw_progress_item.success,
is_coded=is_coded,
workload_failed=workload_failed,
)
else:
spec = self._update_eval_run_spec(
Expand All @@ -522,6 +527,7 @@ async def update_eval_run(
actual_output=sw_progress_item.agent_output,
success=sw_progress_item.success,
is_coded=is_coded,
workload_failed=workload_failed,
)

await self._client.request_async(
Expand Down Expand Up @@ -554,6 +560,24 @@ async def update_eval_set_run(
scoped="org" if self._is_localhost() else "tenant",
)

@gracefully_handle_errors
async def move_eval_run_status(
    self,
    eval_run_id: str,
    status: EvaluationStatus,
    is_coded: bool = False,
):
    """Transition an evaluation run to a new status, leaving its results untouched.

    Args:
        eval_run_id: Backend identifier of the eval run to update.
        status: Target ``EvaluationStatus`` to move the run into.
        is_coded: Whether the run belongs to a coded evaluation (selects the
            ``coded/`` endpoint variant in the request spec).
    """
    # Build the status-only request spec, then fire it at the right scope.
    request = self._update_eval_run_status_spec(eval_run_id, status, is_coded)
    scope = "org" if self._is_localhost() else "tenant"
    await self._client.request_async(
        method=request.method,
        url=request.endpoint,
        params=request.params,
        json=request.json,
        headers=request.headers,
        scoped=scope,
    )

async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
try:
self.evaluators = {eval.id: eval for eval in payload.evaluators}
Expand Down Expand Up @@ -782,6 +806,7 @@ async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None:
self.evaluators,
is_coded=is_coded,
spans=payload.spans,
workload_failed=payload.workload_failed,
)

logger.info(
Expand Down Expand Up @@ -820,6 +845,39 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
except Exception as e:
self._format_error_message(e, "StudioWeb update eval set run error")

async def handle_move_eval_run_to_running(
    self, payload: EvalRunStatusUpdateEvent
) -> None:
    """Handle the event to move an eval run from WorkloadExecuting to Running.

    Resolves the backend eval-run id from the payload's execution id and,
    when found, pushes an IN_PROGRESS status update. Errors are reported
    via ``_format_error_message`` rather than raised.
    """
    try:
        # Without a mapped backend id there is nothing to update.
        if not (eval_run_id := self.eval_run_ids.get(payload.execution_id)):
            logger.warning(
                f"Cannot move eval run to running: eval_run_id not found for "
                f"execution_id={payload.execution_id}"
            )
            return

        is_coded = self.is_coded_eval.get(self.eval_set_execution_id or "", False)

        logger.info(
            f"Moving eval run to Running status: eval_run_id={eval_run_id}, "
            f"is_coded={is_coded}"
        )
        await self.move_eval_run_status(
            eval_run_id=eval_run_id,
            status=EvaluationStatus.IN_PROGRESS,
            is_coded=is_coded,
        )
        logger.info(
            f"Successfully moved eval_run_id={eval_run_id} to Running status"
        )
    except Exception as e:
        self._format_error_message(e, "StudioWeb move eval run to running error")

async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
event_bus.subscribe(
EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run
Expand All @@ -833,6 +891,10 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
event_bus.subscribe(
EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run
)
event_bus.subscribe(
EvaluationEvents.MOVE_EVAL_RUN_TO_RUNNING,
self.handle_move_eval_run_to_running,
)

logger.debug("StudioWeb progress reporter subscribed to evaluation events")

Expand Down Expand Up @@ -1098,12 +1160,18 @@ def _update_eval_run_spec(
execution_time: float,
success: bool,
is_coded: bool = False,
workload_failed: bool = False,
) -> RequestSpec:
# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Determine status based on success
status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
# Determine status based on success and workload failure
if success:
status = EvaluationStatus.COMPLETED
elif workload_failed:
status = EvaluationStatus.WORKLOAD_FAILED
else:
status = EvaluationStatus.FAILED

inner_payload: dict[str, Any] = {
"evalRunId": eval_run_id,
Expand Down Expand Up @@ -1148,13 +1216,19 @@ def _update_coded_eval_run_spec(
execution_time: float,
success: bool,
is_coded: bool = False,
workload_failed: bool = False,
) -> RequestSpec:
"""Create update spec for coded evaluators."""
# For coded evaluations, endpoint has /coded
endpoint_suffix = "coded/" if is_coded else ""

# Determine status based on success
status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
# Determine status based on success and workload failure
if success:
status = EvaluationStatus.COMPLETED
elif workload_failed:
status = EvaluationStatus.WORKLOAD_FAILED
else:
status = EvaluationStatus.FAILED

payload: dict[str, Any] = {
"evalRunId": eval_run_id,
Expand Down Expand Up @@ -1187,6 +1261,33 @@ def _update_coded_eval_run_spec(
headers=self._tenant_header(),
)

def _update_eval_run_status_spec(
    self,
    eval_run_id: str,
    status: EvaluationStatus,
    is_coded: bool = False,
) -> RequestSpec:
    """Create spec for a status-only update of an eval run.

    Args:
        eval_run_id: Backend identifier of the eval run.
        status: Status to write; serialized as its integer value.
        is_coded: Selects the ``coded/`` endpoint variant for coded evals.

    Returns:
        A ``RequestSpec`` describing the PUT request.
    """
    logger.debug(
        f"Updating eval run status: evalRunId={eval_run_id}, status={status.name}"
    )

    # Coded evaluations are addressed under the "coded/" sub-path.
    endpoint_suffix = "coded/" if is_coded else ""
    return RequestSpec(
        method="PUT",
        endpoint=Endpoint(
            f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalRun"
        ),
        json={
            "evalRunId": eval_run_id,
            # Backend expects the integer enum value, not the name.
            "status": status.value,
        },
        headers=self._tenant_header(),
    )

def _create_eval_run_spec(
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> RequestSpec:
Expand Down Expand Up @@ -1234,7 +1335,8 @@ def _create_eval_run_spec(
"evalSetRunId": eval_set_run_id,
"evalSnapshot": eval_snapshot,
# Backend expects integer status
"status": EvaluationStatus.IN_PROGRESS.value,
# Use WORKLOAD_EXECUTING to indicate agent execution phase
"status": EvaluationStatus.WORKLOAD_EXECUTING.value,
}

# Legacy backend expects payload wrapped in "request" field
Expand Down
9 changes: 9 additions & 0 deletions packages/uipath/src/uipath/eval/runtime/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class EvaluationEvents(str, Enum):
CREATE_EVAL_RUN = "create_eval_run"
UPDATE_EVAL_SET_RUN = "update_eval_set_run"
UPDATE_EVAL_RUN = "update_eval_run"
MOVE_EVAL_RUN_TO_RUNNING = "move_eval_run_to_running"


class EvalSetRunCreatedEvent(BaseModel):
Expand Down Expand Up @@ -49,6 +50,12 @@ class EvalItemExceptionDetails(BaseModel):
exception: Exception


class EvalRunStatusUpdateEvent(BaseModel):
    """Event emitted to update an eval run's status (e.g., move to Running).

    Published on the MOVE_EVAL_RUN_TO_RUNNING event-bus topic; the consumer
    resolves the backend eval-run id from ``execution_id`` before issuing the
    status update.
    """

    # Runtime execution id used by the handler to look up the backend eval run id.
    execution_id: str


class EvalRunUpdatedEvent(BaseModel):
"""Event emitted when an individual evaluation run is updated with results."""

Expand All @@ -63,6 +70,7 @@ class EvalRunUpdatedEvent(BaseModel):
spans: list[ReadableSpan]
logs: list[logging.LogRecord]
exception_details: EvalItemExceptionDetails | None = None
workload_failed: bool = False

@model_validator(mode="after")
def validate_exception_details(self):
Expand All @@ -83,6 +91,7 @@ class EvalSetRunUpdatedEvent(BaseModel):
ProgressEvent = Union[
EvalSetRunCreatedEvent,
EvalRunCreatedEvent,
EvalRunStatusUpdateEvent,
EvalRunUpdatedEvent,
EvalSetRunUpdatedEvent,
]
14 changes: 14 additions & 0 deletions packages/uipath/src/uipath/eval/runtime/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
from .events import (
EvalItemExceptionDetails,
EvalRunCreatedEvent,
EvalRunStatusUpdateEvent,
EvalRunUpdatedEvent,
EvalSetRunCreatedEvent,
EvalSetRunUpdatedEvent,
Expand Down Expand Up @@ -504,6 +505,7 @@ async def _execute_eval(
evaluation_run_results = UiPathEvalRunResult(
evaluation_name=eval_item.name, evaluation_run_results=[]
)
agent_execution_succeeded = False

try:
try:
Expand Down Expand Up @@ -626,6 +628,17 @@ async def _execute_eval(
# The evaluation will be completed when resumed
return evaluation_run_results

# Agent execution succeeded - mark flag and move to Running
# (evaluation/scoring phase is about to begin)
agent_execution_succeeded = True
await self.event_bus.publish(
EvaluationEvents.MOVE_EVAL_RUN_TO_RUNNING,
EvalRunStatusUpdateEvent(
execution_id=execution_id,
),
wait_for_completion=False,
)

if self.context.verbose:
evaluation_run_results.agent_execution_output = (
convert_eval_execution_output_to_serializable(
Expand Down Expand Up @@ -739,6 +752,7 @@ async def _execute_eval(
exception_details=exception_details,
spans=[],
logs=[],
workload_failed=not agent_execution_succeeded,
)
if isinstance(e, EvaluationRuntimeException):
eval_run_updated_event.spans = e.spans
Expand Down
Loading