diff --git a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
index 7c5114516..5380bc952 100644
--- a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
+++ b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
@@ -31,6 +31,7 @@
 from uipath.eval.models.evaluation_set import EvaluationItem
 from uipath.eval.runtime.events import (
     EvalRunCreatedEvent,
+    EvalRunStatusUpdateEvent,
     EvalRunUpdatedEvent,
     EvalSetRunCreatedEvent,
     EvalSetRunUpdatedEvent,
@@ -47,6 +48,8 @@ class EvaluationStatus(IntEnum):
     IN_PROGRESS = 1
     COMPLETED = 2
     FAILED = 3
+    WORKLOAD_EXECUTING = 4
+    WORKLOAD_FAILED = 5
 
 
 class StudioWebProgressItem(BaseModel):
@@ -473,6 +476,7 @@ async def update_eval_run(
         evaluators: dict[str, BaseEvaluator[Any, Any, Any]],
         is_coded: bool = False,
         spans: list[Any] | None = None,
+        workload_failed: bool = False,
     ):
         """Update an evaluation run with results."""
         coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {}
@@ -512,6 +516,7 @@ async def update_eval_run(
                 actual_output=sw_progress_item.agent_output,
                 success=sw_progress_item.success,
                 is_coded=is_coded,
+                workload_failed=workload_failed,
             )
         else:
             spec = self._update_eval_run_spec(
@@ -522,6 +527,7 @@ async def update_eval_run(
                 actual_output=sw_progress_item.agent_output,
                 success=sw_progress_item.success,
                 is_coded=is_coded,
+                workload_failed=workload_failed,
             )
 
         await self._client.request_async(
@@ -554,6 +560,24 @@ async def update_eval_set_run(
             scoped="org" if self._is_localhost() else "tenant",
         )
 
+    @gracefully_handle_errors
+    async def move_eval_run_status(
+        self,
+        eval_run_id: str,
+        status: EvaluationStatus,
+        is_coded: bool = False,
+    ):
+        """Update an evaluation run's status without changing results."""
+        spec = self._update_eval_run_status_spec(eval_run_id, status, is_coded)
+        await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
+        )
+
     async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
         try:
             self.evaluators = {eval.id: eval for eval in payload.evaluators}
@@ -782,6 +806,7 @@ async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None:
                 self.evaluators,
                 is_coded=is_coded,
                 spans=payload.spans,
+                workload_failed=payload.workload_failed,
             )
 
             logger.info(
@@ -820,6 +845,39 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
         except Exception as e:
             self._format_error_message(e, "StudioWeb update eval set run error")
 
+    async def handle_move_eval_run_to_running(
+        self, payload: EvalRunStatusUpdateEvent
+    ) -> None:
+        """Handle the event to move an eval run from WorkloadExecuting to Running."""
+        try:
+            eval_run_id = self.eval_run_ids.get(payload.execution_id)
+            if not eval_run_id:
+                logger.warning(
+                    f"Cannot move eval run to running: eval_run_id not found for "
+                    f"execution_id={payload.execution_id}"
+                )
+                return
+
+            is_coded = self.is_coded_eval.get(self.eval_set_execution_id or "", False)
+
+            logger.info(
+                f"Moving eval run to Running status: eval_run_id={eval_run_id}, "
+                f"is_coded={is_coded}"
+            )
+
+            await self.move_eval_run_status(
+                eval_run_id=eval_run_id,
+                status=EvaluationStatus.IN_PROGRESS,
+                is_coded=is_coded,
+            )
+
+            logger.info(
+                f"Successfully moved eval_run_id={eval_run_id} to Running status"
+            )
+
+        except Exception as e:
+            self._format_error_message(e, "StudioWeb move eval run to running error")
+
     async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
         event_bus.subscribe(
             EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run
@@ -833,6 +891,10 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
         event_bus.subscribe(
             EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run
         )
+        event_bus.subscribe(
+            EvaluationEvents.MOVE_EVAL_RUN_TO_RUNNING,
+            self.handle_move_eval_run_to_running,
+        )
 
         logger.debug("StudioWeb progress reporter subscribed to evaluation events")
 
@@ -1098,12 +1160,18 @@ def _update_eval_run_spec(
         execution_time: float,
         success: bool,
         is_coded: bool = False,
+        workload_failed: bool = False,
     ) -> RequestSpec:
         # For legacy evaluations, endpoint is without /coded
         endpoint_suffix = "coded/" if is_coded else ""
 
-        # Determine status based on success
-        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+        # Determine status based on success and workload failure
+        if success:
+            status = EvaluationStatus.COMPLETED
+        elif workload_failed:
+            status = EvaluationStatus.WORKLOAD_FAILED
+        else:
+            status = EvaluationStatus.FAILED
 
         inner_payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
@@ -1148,13 +1216,19 @@ def _update_coded_eval_run_spec(
         execution_time: float,
         success: bool,
         is_coded: bool = False,
+        workload_failed: bool = False,
     ) -> RequestSpec:
         """Create update spec for coded evaluators."""
         # For coded evaluations, endpoint has /coded
         endpoint_suffix = "coded/" if is_coded else ""
 
-        # Determine status based on success
-        status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED
+        # Determine status based on success and workload failure
+        if success:
+            status = EvaluationStatus.COMPLETED
+        elif workload_failed:
+            status = EvaluationStatus.WORKLOAD_FAILED
+        else:
+            status = EvaluationStatus.FAILED
 
         payload: dict[str, Any] = {
             "evalRunId": eval_run_id,
@@ -1187,6 +1261,33 @@ def _update_coded_eval_run_spec(
             headers=self._tenant_header(),
         )
 
+    def _update_eval_run_status_spec(
+        self,
+        eval_run_id: str,
+        status: EvaluationStatus,
+        is_coded: bool = False,
+    ) -> RequestSpec:
+        """Create spec for a status-only update of an eval run."""
+        endpoint_suffix = "coded/" if is_coded else ""
+
+        payload: dict[str, Any] = {
+            "evalRunId": eval_run_id,
+            "status": status.value,
+        }
+
+        logger.debug(
+            f"Updating eval run status: evalRunId={eval_run_id}, status={status.name}"
+        )
+
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalRun"
+            ),
+            json=payload,
+            headers=self._tenant_header(),
+        )
+
     def _create_eval_run_spec(
         self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
     ) -> RequestSpec:
@@ -1234,7 +1335,8 @@ def _create_eval_run_spec(
             "evalSetRunId": eval_set_run_id,
             "evalSnapshot": eval_snapshot,
             # Backend expects integer status
-            "status": EvaluationStatus.IN_PROGRESS.value,
+            # Use WORKLOAD_EXECUTING to indicate agent execution phase
+            "status": EvaluationStatus.WORKLOAD_EXECUTING.value,
         }
 
         # Legacy backend expects payload wrapped in "request" field
diff --git a/packages/uipath/src/uipath/eval/runtime/events.py b/packages/uipath/src/uipath/eval/runtime/events.py
index 589f82ba7..2e3dba502 100644
--- a/packages/uipath/src/uipath/eval/runtime/events.py
+++ b/packages/uipath/src/uipath/eval/runtime/events.py
@@ -19,6 +19,7 @@ class EvaluationEvents(str, Enum):
     CREATE_EVAL_RUN = "create_eval_run"
     UPDATE_EVAL_SET_RUN = "update_eval_set_run"
     UPDATE_EVAL_RUN = "update_eval_run"
+    MOVE_EVAL_RUN_TO_RUNNING = "move_eval_run_to_running"
 
 
 class EvalSetRunCreatedEvent(BaseModel):
@@ -49,6 +50,12 @@ class EvalItemExceptionDetails(BaseModel):
     exception: Exception
 
 
+class EvalRunStatusUpdateEvent(BaseModel):
+    """Event emitted to update an eval run's status (e.g., move to Running)."""
+
+    execution_id: str
+
+
 class EvalRunUpdatedEvent(BaseModel):
     """Event emitted when an individual evaluation run is updated with results."""
 
@@ -63,6 +70,7 @@ class EvalRunUpdatedEvent(BaseModel):
     spans: list[ReadableSpan]
     logs: list[logging.LogRecord]
     exception_details: EvalItemExceptionDetails | None = None
+    workload_failed: bool = False
 
     @model_validator(mode="after")
     def validate_exception_details(self):
@@ -83,6 +91,7 @@ class EvalSetRunUpdatedEvent(BaseModel):
 ProgressEvent = Union[
     EvalSetRunCreatedEvent,
     EvalRunCreatedEvent,
+    EvalRunStatusUpdateEvent,
     EvalRunUpdatedEvent,
     EvalSetRunUpdatedEvent,
 ]
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 17fb91c32..2c978527d 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -86,6 +86,7 @@
 from .events import (
     EvalItemExceptionDetails,
     EvalRunCreatedEvent,
+    EvalRunStatusUpdateEvent,
     EvalRunUpdatedEvent,
     EvalSetRunCreatedEvent,
     EvalSetRunUpdatedEvent,
@@ -504,6 +505,7 @@ async def _execute_eval(
         evaluation_run_results = UiPathEvalRunResult(
             evaluation_name=eval_item.name, evaluation_run_results=[]
         )
+        agent_execution_succeeded = False
 
         try:
             try:
@@ -626,6 +628,17 @@ async def _execute_eval(
 
                 # The evaluation will be completed when resumed
                 return evaluation_run_results
+
+            # Agent execution succeeded - mark flag and move to Running
+            # (evaluation/scoring phase is about to begin)
+            agent_execution_succeeded = True
+            await self.event_bus.publish(
+                EvaluationEvents.MOVE_EVAL_RUN_TO_RUNNING,
+                EvalRunStatusUpdateEvent(
+                    execution_id=execution_id,
+                ),
+                wait_for_completion=False,
+            )
 
             if self.context.verbose:
                 evaluation_run_results.agent_execution_output = (
@@ -739,6 +752,7 @@ async def _execute_eval(
                 exception_details=exception_details,
                 spans=[],
                 logs=[],
+                workload_failed=not agent_execution_succeeded,
            )
             if isinstance(e, EvaluationRuntimeException):
                 eval_run_updated_event.spans = e.spans