diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 26db778e7c..530de73e89 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -92,9 +92,67 @@ 'failed firmware update tasks')), cfg.IntOpt('firmware_update_wait_unresponsive_bmc', min=0, - default=300, + default=0, help=_('Number of seconds to wait before proceeding with the ' 'reboot to finish the BMC firmware update step')), + cfg.IntOpt('firmware_update_required_successes', + min=0, + default=3, + help=_('Number of consecutive successful responses required ' + 'to consider post-upgrade BMC validation a success. ' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), + cfg.IntOpt('firmware_update_validation_interval', + min=0, + default=30, + help=_('Interval (in seconds) to wait between validation ' + 'attempts. Set to 0 to retry immediately with ' + 'no delay.')), + cfg.IntOpt('firmware_update_resource_validation_timeout', + min=0, + default=480, + help=_('Timeout (in seconds) to wait for BMC resources ' + '(System, Manager, NetworkAdapters) to become stable ' + 'and consistently available after firmware update. ' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), + cfg.IntOpt('firmware_update_bmc_timeout', + min=0, + default=300, + help=_('Timeout (in seconds) for BMC firmware updates. ' + 'BMC firmware updates may need extended time to handle ' + 'BMC transitional states during the firmware update ' + 'process.')), + cfg.IntOpt('firmware_update_reboot_delay', + min=0, + default=300, + help=_('Default wait time (in seconds) for component-specific ' + 'firmware update operations. Used for: BIOS firmware ' + 'update wait before reboot, BMC firmware version check ' + 'timeout, and NIC firmware task completion timeout.')), + cfg.IntOpt('firmware_update_bmc_version_check_interval', + min=0, + default=30, + help=_('Interval (in seconds) for checking BMC firmware ' + 'version after BMC firmware update. Used to verify ' + 'if BMC firmware has been successfully applied.')), + cfg.IntOpt('firmware_update_nic_starting_wait', + min=0, + default=30, + help=_('Time (in seconds) to wait for a NIC firmware update ' + 'task to progress beyond the STARTING state before ' + 'triggering a reboot. Some NICs need a reboot to ' + 'start applying firmware, while others can begin ' + 'immediately. This timeout helps determine which ' + 'behavior the hardware exhibits.')), + cfg.IntOpt('firmware_update_overall_timeout', + min=0, + default=7200, + help=_('Maximum time (in seconds) allowed for the entire ' + 'firmware update operation to complete. This provides ' + 'a safety net for firmware updates that get stuck. ' + 'Set to 0 to disable this timeout (not recommended). ' + 'Default is 7200 seconds (2 hours).')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' @@ -155,6 +213,21 @@ '$default_inspection_hooks. Hooks can be added before ' 'or after the defaults like this: ' '"prehook,$default_hooks,posthook".')), + cfg.IntOpt('post_boot_retry_attempts', + min=1, + default=6, + help=_('Maximum number of retry attempts when BMC rejects ' + 'boot device changes during POST (Power-On Self-Test). ' + 'Some BMCs (e.g. HPE iLO) reject boot device ' + 'modifications while the system is in POST after a ' + 'firmware update or reboot.')), + cfg.IntOpt('post_boot_retry_delay', + min=1, + default=5, + help=_('Minimum delay in seconds between retry attempts ' + 'for POST-related boot device errors. 
Exponential ' + 'backoff is applied, starting from this value up to ' + '6x this value.')), ] diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 12c23a509b..dd49c79475 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -18,6 +18,7 @@ from oslo_utils import timeutils import sushy +from ironic.common import async_steps from ironic.common import exception from ironic.common.i18n import _ from ironic.common import metrics_utils @@ -35,6 +36,16 @@ METRICS = metrics_utils.get_metrics_logger(__name__) +# Temporary field names stored in node.driver_internal_info +BMC_FW_VERSION_BEFORE_UPDATE = 'bmc_fw_version_before_update' +FIRMWARE_REBOOT_REQUESTED = 'firmware_reboot_requested' + +# Temporary field names stored in fw_upd/current_update settings dict +NIC_NEEDS_POST_COMPLETION_REBOOT = 'nic_needs_post_completion_reboot' +NIC_STARTING_TIMESTAMP = 'nic_starting_timestamp' +NIC_REBOOT_TRIGGERED = 'nic_reboot_triggered' +BIOS_REBOOT_TRIGGERED = 'bios_reboot_triggered' + class RedfishFirmware(base.FirmwareInterface): @@ -110,12 +121,25 @@ def cache_firmware_components(self, task): LOG.warning('No manager available to retrieve Firmware ' 'from the bmc of node %s', task.node.uuid) - nic_components = self.retrieve_nic_components(task, system) + nic_components = None + try: + nic_components = self.retrieve_nic_components(task, system) + except (exception.RedfishError, + sushy.exceptions.BadRequestError) as e: + # NOTE(janders) if an exception is raised, log a warning + # with exception details. This is important for HP hardware, + # which at the time of writing is known to return 400 + # responses to GET NetworkAdapters while the OS isn't fully + # booted + LOG.warning('Unable to access NetworkAdapters on node ' + '%(node_uuid)s, Error: %(error)s', + {'node_uuid': task.node.uuid, 'error': e}) + # NOTE(janders) if no exception is raised but no NICs are returned, + # state that clearly but in a lower severity message if nic_components == []: LOG.debug('Could not retrieve Firmware Package Version from ' 'NetworkAdapters on node %(node_uuid)s', {'node_uuid': task.node.uuid}) - else: + elif nic_components: settings.extend(nic_components) if not settings: @@ -159,14 +183,30 @@ def retrieve_nic_components(self, task, system): try: chassis = redfish_utils.get_chassis(task.node, system) except exception.RedfishError: - LOG.warning('No chassis available to retrieve NetworkAdapters ' - 'firmware information on node %(node_uuid)s', - {'node_uuid': task.node.uuid}) + LOG.debug('No chassis available to retrieve NetworkAdapters ' + 'firmware information on node %(node_uuid)s', + {'node_uuid': task.node.uuid}) return nic_list - for net_adp in chassis.network_adapters.get_members(): + try: + network_adapters = chassis.network_adapters + if network_adapters is None: + LOG.debug('NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': task.node.uuid}) + return nic_list + adapters = network_adapters.get_members() + except sushy.exceptions.MissingAttributeError: + LOG.debug('NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': task.node.uuid}) + return nic_list + + for net_adp in adapters: for net_adp_ctrl in net_adp.controllers: fw_pkg_v = net_adp_ctrl.firmware_package_version + if not fw_pkg_v: + continue net_adp_fw = {'component': redfish_utils.NIC_COMPONENT_PREFIX + net_adp.identity, 'current_version': fw_pkg_v} nic_list.append(net_adp_fw) @@ -181,7 
+221,6 @@ def retrieve_nic_components(self, task, system): @base.service_step(priority=0, abortable=False, argsinfo=_FW_SETTINGS_ARGSINFO, requires_ramdisk=False) - @base.cache_firmware_components def update(self, task, settings): """Update the Firmware on the node using the settings for components. @@ -206,17 +245,320 @@ def update(self, task, settings): {'node_uuid': node.uuid, 'settings': settings}) self._execute_firmware_update(node, update_service, settings) - fw_upd = settings[0] + # Store updated settings and start time for overall timeout tracking + node.set_driver_internal_info('redfish_fw_updates', settings) + node.set_driver_internal_info( + 'redfish_fw_update_start_time', + timeutils.utcnow().isoformat()) + node.save() + + # Return wait state to keep the step active and let polling handle + # the monitoring and eventual completion/reboot + return async_steps.get_return_state(node) + + def _clean_temp_fields(self, node): + """Clean up temporary fields used during firmware update monitoring. + + This ensures no stale data interferes with new firmware updates. + + :param node: the Ironic node object + """ + # BMC-related temp fields + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) + # General firmware temp fields + node.del_driver_internal_info(FIRMWARE_REBOOT_REQUESTED) + + def _setup_bmc_update_monitoring(self, node, fw_upd): + """Set up monitoring for BMC firmware update. + + BMC updates do not reboot immediately. Instead, we check the BMC + version periodically. If the version changed, we continue without + reboot. If the timeout expires without a version change, we trigger + a reboot. + + :param node: the Ironic node object + :param fw_upd: firmware update settings dict + """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + + # Record current BMC version before update + try: + system = redfish_utils.get_system(node) + manager = redfish_utils.get_manager(node, system) + current_bmc_version = manager.firmware_version + node.set_driver_internal_info( + BMC_FW_VERSION_BEFORE_UPDATE, current_bmc_version) + LOG.debug('BMC version before update for node %(node)s: ' + '%(version)s', + {'node': node.uuid, 'version': current_bmc_version}) + except Exception as e: + LOG.warning('Could not read BMC version before update for ' + 'node %(node)s: %(error)s', + {'node': node.uuid, 'error': e}) + + LOG.info('BMC firmware update for node %(node)s. ' + 'Monitoring BMC version instead of immediate reboot.', + {'node': node.uuid}) + + # Use wait_interval or default reboot delay wait_interval = fw_upd.get('wait') + if wait_interval is None: + wait_interval = CONF.redfish.firmware_update_reboot_delay + fw_upd['wait'] = wait_interval + # Set wait_start_time for polling interval and bmc_check_start_time + # for total timeout tracking (wait_start_time gets updated each poll) + start_time = str(timeutils.utcnow().isoformat()) + fw_upd['wait_start_time'] = start_time + fw_upd['bmc_check_start_time'] = start_time + # Mark this as a BMC update so we can handle timeouts properly + fw_upd['component_type'] = redfish_utils.BMC + + # BMC: Set async flags without immediate reboot + deploy_utils.set_async_step_flags( + node, + reboot=False, + polling=True + ) + + def _setup_nic_update_monitoring(self, node): + """Set up monitoring for NIC firmware update. + + NIC firmware behavior varies by hardware: some NICs update + immediately, while others need a reboot to start. The periodic + handler waits CONF.redfish.firmware_update_nic_starting_wait + seconds (30 by default) and then decides whether to reboot; see + the sketch below. 
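A condensed sketch of that decision, using plain values instead of the real task objects (the helper name and arguments here are illustrative only, not part of this patch; the sushy task-state constants are the ones this module already uses):

import sushy

def nic_needs_reboot_to_start(task_state, seconds_in_starting,
                              starting_wait=30):
    # A NIC task that moves to Running applies on its own; the reboot
    # then happens after completion, not now.
    if task_state == sushy.TASK_STATE_RUNNING:
        return False
    # A task stuck in Starting beyond the configured wait needs a
    # reboot before the hardware begins applying firmware.
    if task_state == sushy.TASK_STATE_STARTING:
        return seconds_in_starting >= starting_wait
    return False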
+ + :param node: the Ironic node object + """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + + LOG.info('NIC firmware update for node %(node)s. Will monitor ' + 'task state to determine if reboot is needed.', + {'node': node.uuid}) + + # NIC: Set async flags with reboot enabled + # (reboot will be triggered conditionally if hardware needs it) + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True + ) + + def _setup_bios_update_monitoring(self, node): + """Set up monitoring for BIOS firmware update. + + BIOS updates require a reboot to apply, so we trigger it as soon + as the update task begins rather than waiting for completion. + + :param node: the Ironic node object + """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + + LOG.info('BIOS firmware update for node %(node)s. Will reboot ' + 'when update task starts.', + {'node': node.uuid}) + + # BIOS: Set async flags with reboot enabled deploy_utils.set_async_step_flags( node, reboot=True, - skip_current_step=True, polling=True ) - return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval, - disable_ramdisk=True) + def _setup_default_update_monitoring(self, node, fw_upd): + """Set up monitoring for unknown/default firmware component types. + + Default behavior for unknown component types uses standard reboot + handling with configurable wait interval. + + :param node: the Ironic node object + :param fw_upd: firmware update settings dict + """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + + component = fw_upd.get('component', '') + LOG.warning( + 'Unknown component type %(component)s for node %(node)s. ' + 'Using default firmware update behavior.', + {'component': component, 'node': node.uuid}) + + wait_interval = fw_upd.get('wait') + if wait_interval is None: + wait_interval = ( + node.driver_info.get('firmware_update_unresponsive_bmc_wait') + or CONF.redfish.firmware_update_wait_unresponsive_bmc) + fw_upd['wait'] = wait_interval + + # Default: Set async flags with reboot enabled + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True + ) + + + def _get_current_bmc_version(self, node): + """Get current BMC firmware version. + + Note: BMC may be temporarily unresponsive after firmware update. + Expected exceptions (timeouts, connection refused, HTTP errors) are + caught and logged, returning None to indicate version unavailable. + + :param node: the Ironic node object + :returns: Current BMC firmware version string, or None if BMC + is unresponsive/inaccessible + """ + try: + system = redfish_utils.get_system(node) + manager = redfish_utils.get_manager(node, system) + return manager.firmware_version + except (exception.RedfishError, + exception.RedfishConnectionError, + sushy.exceptions.SushyError) as e: + # BMC unresponsiveness is expected after firmware update + # (timeouts, connection refused, HTTP 4xx/5xx errors) + LOG.debug('BMC temporarily unresponsive for node %(node)s: ' + '%(error)s', {'node': node.uuid, 'error': e}) + return None + + def _handle_bmc_update_completion(self, task, update_service, + settings, current_update): + """Handle BMC firmware update completion with version checking. + + For BMC updates, we don't reboot immediately. Instead, we check + the BMC version periodically. If the version changed, we continue + without reboot. If timeout expires without version change, we trigger + a reboot. 
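The three possible outcomes described above can be read as a small pure function. This is a sketch under assumed names; the real handler below also persists flags and notifies the conductor:

def bmc_update_outcome(version_before, version_now, elapsed, timeout):
    # New version visible: firmware is live, no reboot required.
    if (version_now is not None and version_before is not None
            and version_now != version_before):
        return 'done'
    if elapsed >= timeout:
        # Readable but unchanged version: treat as a no-op update.
        if version_now is not None and version_now == version_before:
            return 'done'
        # Unreadable or ambiguous: reboot to force the update through.
        return 'reboot'
    # Still inside the window: poll again after the check interval.
    return 'keep-checking'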
+ + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + # Try to get current BMC version + # Note: BMC may be unresponsive after firmware update - expected + current_version = self._get_current_bmc_version(node) + version_before = node.driver_internal_info.get( + BMC_FW_VERSION_BEFORE_UPDATE) + + # If we can read the version and it changed, update is complete + if (current_version is not None + and version_before is not None + and current_version != version_before): + LOG.info( + 'BMC firmware version for node %(node)s changed from ' + '%(old)s to %(new)s. Update complete. Continuing without ' + 'reboot.', + {'node': node.uuid, 'old': version_before, + 'new': current_version}) + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) + node.save() + self._continue_updates(task, update_service, settings) + return + + # Check if we've been checking for too long + check_start_time = current_update.get('bmc_check_start_time') + + if check_start_time: + check_start = timeutils.parse_isotime(check_start_time) + elapsed_time = timeutils.utcnow(True) - check_start + timeout = current_update.get( + 'wait', CONF.redfish.firmware_update_reboot_delay) + if elapsed_time.seconds >= timeout: + # Timeout: version didn't change or BMC unresponsive + if (current_version is not None + and version_before is not None + and current_version == version_before): + # Version didn't change - skip reboot + LOG.info( + 'BMC firmware version for node %(node)s did not ' + 'change (still %(version)s). Update appears to be ' + 'a no-op or does not require reboot. Continuing ' + 'without reboot.', + {'node': node.uuid, 'version': current_version}) + else: + # Version changed or we can't tell - reboot to apply + LOG.warning( + 'BMC firmware version check timeout expired for ' + 'node %(node)s after %(elapsed)s seconds. ' + 'Will reboot to complete firmware update.', + {'node': node.uuid, 'elapsed': elapsed_time.seconds}) + # Mark that reboot is needed + node.set_driver_internal_info( + FIRMWARE_REBOOT_REQUESTED, True) + # Enable reboot flag now that we're ready to reboot + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True + ) + + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) + node.save() + self._continue_updates(task, update_service, settings) + return + + # Continue checking - set wait to check again + wait_interval = ( + CONF.redfish.firmware_update_bmc_version_check_interval) + current_update['wait'] = wait_interval + current_update['wait_start_time'] = str( + timeutils.utcnow().isoformat()) + current_update['bmc_version_checking'] = True + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + + LOG.debug('BMC firmware version check continuing for node %(node)s. ' + 'Will check again in %(interval)s seconds.', + {'node': node.uuid, 'interval': wait_interval}) + + def _handle_nic_update_completion(self, task, update_service, settings, + current_update): + """Handle NIC firmware update completion. + + For NIC updates, check if a reboot is needed based on whether the + task went through the Running state (needs reboot after completion) + or if reboot already occurred during the Starting phase. 
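For reference, the NIC bookkeeping keys stored in the current update dict follow two mutually exclusive paths; this sketch shows the consumer side, using the literal value of the module-level constant defined earlier in this file:

# Starting-path: NIC_STARTING_TIMESTAMP -> NIC_REBOOT_TRIGGERED
#   (reboot fired early, so no reboot after completion)
# Running-path:  NIC_NEEDS_POST_COMPLETION_REBOOT
#   (update ran on its own, reboot fires once the task completes)
def nic_reboot_after_completion(current_update):
    return bool(current_update.get('nic_needs_post_completion_reboot'))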
+ + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + # Check if reboot is needed (task went to Running state) + needs_reboot = current_update.get( + NIC_NEEDS_POST_COMPLETION_REBOOT, False) + + if needs_reboot: + LOG.info( + 'NIC firmware update task completed for node ' + '%(node)s. Reboot required to apply update.', + {'node': node.uuid}) + + # Mark that reboot is needed + node.set_driver_internal_info( + FIRMWARE_REBOOT_REQUESTED, True) + + # Clean up flags + current_update.pop(NIC_NEEDS_POST_COMPLETION_REBOOT, None) + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) + else: + LOG.info( + 'NIC firmware update task completed for node ' + '%(node)s. Reboot already occurred during update ' + 'start.', {'node': node.uuid}) + # Clean up all NIC-related flags + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) + + self._continue_updates(task, update_service, settings) def _execute_firmware_update(self, node, update_service, settings): """Executes the next firmware update to the node @@ -229,6 +571,8 @@ def _execute_firmware_update(self, node, update_service, settings): to be executed. """ fw_upd = settings[0] + # Store power timeout to use on reboot operations + fw_upd['power_timeout'] = CONF.redfish.firmware_update_reboot_delay # NOTE(janders) try to get the collection of Systems on the BMC # to determine if there may be more than one System try: @@ -267,26 +611,9 @@ def _execute_firmware_update(self, node, update_service, settings): {'node': node.uuid, 'error': e.message}) raise exception.RedfishError(error=e) - # NOTE(iurygregory): In case we are doing firmware updates we need to - # account for unresponsive BMC, in this case we wait for a set of - # minutes before proceeding to the power actions. - # In case the node has firmware_update_unresponsive_bmc_wait set we - # give priority over the configuration option. - wait_unres_bmc = ( - node.driver_info.get('firmware_update_unresponsive_bmc_wait') - or CONF.redfish.firmware_update_wait_unresponsive_bmc - ) - LOG.debug('BMC firmware update in progress. Waiting %(wait_time)s ' - 'seconds before proceeding to reboot the node %(node_uuid)s ' - 'to complete the step', {'node_uuid': node.uuid, - 'wait_time': wait_unres_bmc}) - # TODO(iurygregory): Improve the logic here to identify if the BMC - # is back, so we don't have to unconditionally wait. - # The wait_unres_bmc will be the maximum time to wait. - time.sleep(wait_unres_bmc) - LOG.debug('Wait completed. 
Proceeding to reboot the node ' '%(node_uuid)s to complete the step.', {'node_uuid': node.uuid}) + # Store task monitor URI for periodic task polling + # NOTE(janders): Component-specific wait/reboot behavior is now + # handled by the update() method and periodic polling, not here fw_upd['task_monitor'] = task_monitor.task_monitor_uri node.set_driver_internal_info('redfish_fw_updates', settings) @@ -299,10 +626,127 @@ def _execute_firmware_update(self, node, update_service, settings): fw_clean.append(cleanup) node.set_driver_internal_info('firmware_cleanup', fw_clean) + component = fw_upd.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if component_type == redfish_utils.BMC: + self._setup_bmc_update_monitoring(node, fw_upd) + elif component_type == redfish_utils.NIC: + self._setup_nic_update_monitoring(node) + elif component_type == redfish_utils.BIOS: + self._setup_bios_update_monitoring(node) + else: + self._setup_default_update_monitoring(node, fw_upd) + + def _validate_resources_stability(self, node): + """Validate that BMC resources are consistently available. + + Requires consecutive successful responses from System, Manager, + and NetworkAdapters resources before considering them stable. + The number of required successes is configured via + CONF.redfish.firmware_update_required_successes. + Timeout is configured via + CONF.redfish.firmware_update_resource_validation_timeout. + + :param node: the Ironic node object + :raises: RedfishError if resources don't stabilize within timeout + """ + timeout = CONF.redfish.firmware_update_resource_validation_timeout + required_successes = CONF.redfish.firmware_update_required_successes + validation_interval = CONF.redfish.firmware_update_validation_interval + + # Skip the check entirely when disabled via configuration + if required_successes == 0 or timeout == 0: + reasons = [] + if required_successes == 0: + reasons.append('required_successes=0') + if timeout == 0: + reasons.append('validation_timeout=0') + + LOG.info('BMC resource validation disabled (%(reasons)s) ' + 'for node %(node)s', + {'reasons': ', '.join(reasons), 'node': node.uuid}) + return + + LOG.debug('Starting resource stability validation for node %(node)s ' + '(timeout: %(timeout)s seconds, ' + 'required_successes: %(required)s, ' + 'validation_interval: %(interval)s seconds)', + {'node': node.uuid, 'timeout': timeout, + 'required': required_successes, + 'interval': validation_interval}) + + start_time = time.time() + end_time = start_time + timeout + consecutive_successes = 0 + last_exc = None + + while time.time() < end_time: + try: + # Test System resource + system = redfish_utils.get_system(node) + + # Test Manager resource + redfish_utils.get_manager(node, system) + + # Test Chassis and NetworkAdapters resource (if available) + # Some systems may not have NetworkAdapters, which is valid + chassis = redfish_utils.get_chassis(node, system) + try: + network_adapters = chassis.network_adapters + if network_adapters is not None: + network_adapters.get_members() + except sushy.exceptions.MissingAttributeError: + # NetworkAdapters not available is acceptable + pass + + # All resources successful + consecutive_successes += 1 + LOG.debug('Resource validation success %(count)d/%(required)d ' + 'for node %(node)s', + {'count': consecutive_successes, + 'required': required_successes, + 'node': node.uuid}) + + if consecutive_successes >= required_successes: + LOG.info('All tested Redfish resources stable and ' + 'available for node %(node)s', + {'node': 
node.uuid}) + return + + except (exception.RedfishError, + exception.RedfishConnectionError, + sushy.exceptions.SushyError) as e: + LOG.debug('BMC resource validation failed for node %(node)s: ' + '%(error)s. This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': node.uuid, 'error': e}) + # Resource not available yet, reset counter + if consecutive_successes > 0: + LOG.debug('Resource validation interrupted for node ' + '%(node)s, resetting success counter ' + '(error: %(error)s)', + {'node': node.uuid, 'error': e}) + consecutive_successes = 0 + last_exc = e + + # Wait before next validation attempt + time.sleep(validation_interval) + # Timeout reached without achieving stability + error_msg = _('BMC resources failed to stabilize within ' + '%(timeout)s seconds for node %(node)s') % { + 'timeout': timeout, 'node': node.uuid} + if last_exc: + error_msg += _(', last error: %(error)s') % {'error': last_exc} + LOG.error(error_msg) + raise exception.RedfishError(error=error_msg) + def _continue_updates(self, task, update_service, settings): """Continues processing the firmware updates Continues to process the firmware updates on the node. + First monitors the current task completion, then validates resource + stability before proceeding to next update or completion. Note that the caller must have an exclusive lock on the node. @@ -312,6 +756,7 @@ def _continue_updates(self, task, update_service, settings): """ node = task.node fw_upd = settings[0] + wait_interval = fw_upd.get('wait') if wait_interval: time_now = str(timeutils.utcnow().isoformat()) @@ -331,10 +776,27 @@ def _continue_updates(self, task, update_service, settings): return if len(settings) == 1: + # Last firmware update - check if reboot is needed + reboot_requested = node.driver_internal_info.get( + FIRMWARE_REBOOT_REQUESTED, False) + self._clear_updates(node) LOG.info('Firmware updates completed for node %(node)s', {'node': node.uuid}) + + # If reboot was requested (e.g., for BMC timeout or NIC + # completion), trigger the reboot before notifying conductor + if reboot_requested: + LOG.info('Rebooting node %(node)s to apply firmware updates', + {'node': node.uuid}) + manager_utils.node_power_action(task, states.REBOOT) + + LOG.debug('Validating BMC responsiveness before resuming ' + 'conductor operations for node %(node)s', + {'node': node.uuid}) + self._validate_resources_stability(node) + if task.node.clean_step: manager_utils.notify_conductor_resume_clean(task) elif task.node.service_step: @@ -343,12 +805,40 @@ def _continue_updates(self, task, update_service, settings): manager_utils.notify_conductor_resume_deploy(task) else: + # Validate BMC resources are stable before continuing next update + LOG.info('Validating BMC responsiveness before continuing ' + 'to next firmware update for node %(node)s', + {'node': node.uuid}) + self._validate_resources_stability(node) + settings.pop(0) self._execute_firmware_update(node, update_service, settings) node.save() - manager_utils.node_power_action(task, states.REBOOT) + + # Only reboot if the component code requested it. + if task.node.clean_step: + reboot_field = async_steps.CLEANING_REBOOT + elif task.node.deploy_step: + reboot_field = async_steps.DEPLOYMENT_REBOOT + elif task.node.service_step: + reboot_field = async_steps.SERVICING_REBOOT + else: + reboot_field = None + + # Default to reboot=True for backwards compatibility. 
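For context, the per-step reboot flag consumed below is written earlier by the component setup methods via deploy_utils.set_async_step_flags(); a minimal sketch of that producer side, assuming `node` is an Ironic node object currently in cleaning:

from ironic.drivers.modules import deploy_utils

def record_no_immediate_reboot(node):
    # The BMC setup path opts out of an immediate reboot: this writes
    # the per-step flag (e.g. async_steps.CLEANING_REBOOT) into
    # driver_internal_info as False, which _continue_updates() reads
    # back to decide whether to power-cycle the node.
    deploy_utils.set_async_step_flags(node, reboot=False, polling=True)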
+ should_reboot = (node.driver_internal_info.get(reboot_field, True) + if reboot_field else True) + + if should_reboot: + power_timeout = settings[0].get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + else: + LOG.debug('Component requested no immediate reboot for node ' + '%(node)s. Continuing with async polling.', + {'node': node.uuid}) def _clear_updates(self, node): """Clears firmware updates artifacts @@ -362,7 +852,10 @@ def _clear_updates(self, node): """ firmware_utils.cleanup(node) node.del_driver_internal_info('redfish_fw_updates') + node.del_driver_internal_info('redfish_fw_update_start_time') node.del_driver_internal_info('firmware_cleanup') + # Clean all temporary fields used during firmware update monitoring + self._clean_temp_fields(node) node.save() @METRICS.timer('RedfishFirmware._query_update_failed') @@ -401,12 +894,390 @@ def _query_update_status(self, task, manager, context): """Periodic job to check firmware update tasks.""" self._check_node_redfish_firmware_update(task) + def _handle_task_completion(self, task, sushy_task, messages, + update_service, settings, current_update): + """Handle firmware update task completion. + + :param task: a TaskManager instance + :param sushy_task: the sushy task object + :param messages: list of task messages + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED + and sushy_task.task_status in + [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): + LOG.debug('Redfish task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s.', + {'node': node.uuid, + 'firmware_image': current_update['url'], + 'messages': ", ".join(messages)}) + + # Component-specific post-update handling + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if component_type == redfish_utils.BMC: + # BMC: Start version checking instead of immediate reboot + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + elif component_type == redfish_utils.NIC: + # NIC: Handle completion with appropriate reboot behavior + self._handle_nic_update_completion( + task, update_service, settings, current_update) + elif component_type == redfish_utils.BIOS: + # BIOS: Check if reboot was actually triggered + # Some BMCs (e.g., HPE iLO) complete the BIOS firmware task + # very quickly (staging the firmware) before Ironic can poll + # and trigger the reboot. In this case, we need to trigger + # the reboot now to actually apply the firmware. + if not current_update.get(BIOS_REBOOT_TRIGGERED): + LOG.info('BIOS firmware update task completed for node ' + '%(node)s but reboot was not triggered yet. ' + 'Triggering reboot now to apply staged firmware.', + {'node': node.uuid}) + current_update[BIOS_REBOOT_TRIGGERED] = True + node.set_driver_internal_info('redfish_fw_updates', + settings) + node.save() + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + return + else: + # Reboot was already triggered when task started, + # just continue with next update + LOG.info('BIOS firmware update task completed for node ' + '%(node)s. System was already rebooted. 
' + 'Proceeding with continuation.', + {'node': node.uuid}) + # Clean up the reboot trigger flag + current_update.pop(BIOS_REBOOT_TRIGGERED, None) + self._continue_updates(task, update_service, settings) + else: + # Default: continue as before + self._continue_updates(task, update_service, settings) + else: + error_msg = (_('Firmware update failed for node %(node)s, ' + 'firmware %(firmware_image)s. ' + 'Error: %(errors)s') % + {'node': node.uuid, + 'firmware_image': current_update['url'], + 'errors': ", ".join(messages)}) + + self._clear_updates(node) + if task.node.clean_step: + manager_utils.cleaning_error_handler(task, error_msg) + elif task.node.deploy_step: + manager_utils.deploying_error_handler(task, error_msg) + elif task.node.service_step: + manager_utils.servicing_error_handler(task, error_msg) + + def _handle_nic_task_starting(self, task, task_monitor, settings, + current_update): + """Handle NIC firmware update task when it starts. + + NIC firmware behavior varies by hardware: + - Some NICs need reboot to START applying (task stays at Starting) + - Some NICs can start immediately but need reboot to APPLY (goes to + Running, then needs reboot after completion) + + This method waits for the configured time + (CONF.redfish.firmware_update_nic_starting_wait) to determine which + type: + - If still Starting after wait time → trigger reboot to start + - If moves to Running → let it finish, reboot will happen after + completion + + :param task: a TaskManager instance + :param task_monitor: the sushy task monitor + :param settings: firmware update settings + :param current_update: the current firmware update being processed + :returns: True if should stop polling, False to continue + """ + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + node = task.node + + try: + sushy_task = task_monitor.get_task() + task_state = sushy_task.task_state + + LOG.debug('NIC update task state for node %(node)s: %(state)s', + {'node': node.uuid, 'state': task_state}) + + # If task is Running, mark that reboot will be needed after + # completion and let it continue + if task_state == sushy.TASK_STATE_RUNNING: + LOG.debug('NIC update task for node %(node)s is running. ' + 'Will wait for completion then reboot.', + {'node': node.uuid}) + # Clear flags since we're past the starting phase + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) + # Mark that reboot will be needed after completion + current_update[NIC_NEEDS_POST_COMPLETION_REBOOT] = True + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + return False # Continue polling until completion + + # If task is in STARTING, check if we need to wait or reboot + if task_state == sushy.TASK_STATE_STARTING: + # Check if we already triggered a reboot + if current_update.get(NIC_REBOOT_TRIGGERED): + LOG.debug('NIC firmware update for node %(node)s: ' + 'reboot already triggered, waiting for task ' + 'to progress.', {'node': node.uuid}) + return False # Continue polling + + starting_time = current_update.get(NIC_STARTING_TIMESTAMP) + + if not starting_time: + # First time seeing STARTING - record timestamp + current_update[NIC_STARTING_TIMESTAMP] = str( + timeutils.utcnow().isoformat()) + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + LOG.debug('NIC firmware update task for node %(node)s ' + 'is in STARTING state. 
Waiting to determine if ' + 'reboot is needed to start update.', + {'node': node.uuid}) + return False # Keep polling + + # Check if configured wait time has elapsed + start_time = timeutils.parse_isotime(starting_time) + elapsed = timeutils.utcnow(True) - start_time + nic_starting_wait = ( + CONF.redfish.firmware_update_nic_starting_wait) + + if elapsed.seconds < nic_starting_wait: + # Still within wait window, keep waiting + LOG.debug('NIC update for node %(node)s still in ' + 'STARTING after %(elapsed)s seconds. ' + 'Waiting...', + {'node': node.uuid, + 'elapsed': elapsed.seconds}) + return False # Keep polling + + # Wait time elapsed and still STARTING - need reboot to start + LOG.info('NIC firmware update task for node %(node)s ' + 'remained in STARTING state for %(wait)s+ seconds. ' + 'Hardware requires reboot to start update. ' + 'Triggering reboot.', + {'node': node.uuid, 'wait': nic_starting_wait}) + + # Mark that we triggered a reboot to prevent repeat reboots + current_update[NIC_REBOOT_TRIGGERED] = True + # Clean up timestamp + current_update.pop(NIC_STARTING_TIMESTAMP, None) + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + + # Trigger the reboot to start update + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + + LOG.info('Reboot initiated for node %(node)s to start ' + 'NIC firmware update', {'node': node.uuid}) + return True # Stop polling, reboot triggered + + except Exception as e: + LOG.warning('Unable to check NIC task state for node ' + '%(node)s: %(error)s. Will retry.', + {'node': node.uuid, 'error': e}) + + return False # Continue polling on error + + def _handle_bios_task_starting(self, task, task_monitor, settings, + current_update): + """Handle BIOS firmware update task when it starts. + + BIOS updates require a reboot to apply the firmware, so we trigger + the reboot as soon as the update task reaches STARTING state rather + than waiting for task completion. + + :param task: a TaskManager instance + :param task_monitor: the sushy task monitor + :param settings: firmware update settings + :param current_update: the current firmware update being processed + :returns: True if reboot was triggered, False otherwise + """ + if current_update.get(BIOS_REBOOT_TRIGGERED): + # Already triggered, just keep polling + return False + + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + node = task.node + + + try: + sushy_task = task_monitor.get_task() + LOG.debug('BIOS update task state for node %(node)s: ' + '%(state)s', + {'node': node.uuid, + 'state': sushy_task.task_state}) + + # Check if task has started (STARTING state or beyond) + # TaskState can be: New, Starting, Running, Suspended, + # Interrupted, Pending, Stopping, Completed, Killed, + # Exception, Service, Cancelling, Cancelled + if sushy_task.task_state in [sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_PENDING]: + LOG.info('BIOS firmware update task has started for ' + 'node %(node)s (state: %(state)s). 
' 'Triggering reboot to apply update.', {'node': node.uuid, 'state': sushy_task.task_state}) + + # Mark reboot as triggered to avoid repeated reboots + current_update[BIOS_REBOOT_TRIGGERED] = True + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + + # Trigger the reboot + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + + LOG.info('Reboot initiated for node %(node)s to apply ' + 'BIOS firmware update', + {'node': node.uuid}) + return True + except Exception as e: + LOG.warning('Unable to check BIOS task state for node ' + '%(node)s: %(error)s. Will retry.', + {'node': node.uuid, 'error': e}) + + return False + + def _handle_wait_completion(self, task, update_service, settings, + current_update): + """Handle firmware update wait completion. + + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + node = task.node + + # Check if this is BMC version checking + if current_update.get('bmc_version_checking'): + current_update.pop('bmc_version_checking', None) + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + # Continue BMC version checking + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + elif current_update.get('component_type') == redfish_utils.BMC: + # BMC update wait expired - check if task is still running + # before transitioning to version checking + task_still_running = False + try: + task_monitor = redfish_utils.get_task_monitor( + node, current_update['task_monitor']) + if task_monitor.is_processing: + task_still_running = True + LOG.debug('BMC firmware update wait expired but task ' + 'still processing for node %(node)s. ' + 'Continuing to monitor task completion.', + {'node': node.uuid}) + except exception.RedfishConnectionError as e: + LOG.debug('Unable to communicate with task monitor for node ' + '%(node)s during wait completion: %(error)s. ' + 'BMC may be resetting, will transition to version ' + 'checking.', {'node': node.uuid, 'error': e}) + except exception.RedfishError as e: + LOG.debug('Task monitor unavailable for node %(node)s: ' + '%(error)s. Task may have completed, transitioning ' + 'to version checking.', + {'node': node.uuid, 'error': e}) + + if task_still_running: + # Task is still running, continue to monitor task completion + # Don't transition to version checking yet. + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + return + + # Task completed, was deleted, or BMC is unavailable: + # transition to version checking + LOG.info('BMC firmware update wait expired for node %(node)s. ' + 'Task completed or unavailable. Transitioning to version ' + 'checking mode.', + {'node': node.uuid}) + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + else: + # Regular wait completion - mark reboot needed if this is the + # last update. Note: BIOS components reboot immediately when + # task starts, so they won't use this path. 
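All of the wait and timeout checks in this patch share one oslo.utils pattern, shown here in isolation (the field name matches the one this patch stores; `node` is assumed to be an Ironic node object):

from oslo_utils import timeutils

def seconds_elapsed(node):
    # parse_isotime() returns a timezone-aware datetime, so the "now"
    # side must be timezone-aware as well, hence utcnow(True).
    start = timeutils.parse_isotime(
        node.driver_internal_info['redfish_fw_update_start_time'])
    return (timeutils.utcnow(True) - start).total_seconds()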
+ if len(settings) == 1: + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + # For default/unknown components, reboot may be needed + if component_type is None: + node.set_driver_internal_info( + FIRMWARE_REBOOT_REQUESTED, True) + node.save() + # Continue with updates + self._continue_updates(task, update_service, settings) + + def _check_overall_timeout(self, task): + """Check if firmware update has exceeded overall timeout. + + :param task: A TaskManager instance + :returns: True if timeout exceeded and error was handled, + False otherwise + """ + node = task.node + overall_timeout = CONF.redfish.firmware_update_overall_timeout + if overall_timeout <= 0: + return False + + start_time_str = node.driver_internal_info.get( + 'redfish_fw_update_start_time') + if not start_time_str: + return False + + start_time = timeutils.parse_isotime(start_time_str) + elapsed = timeutils.utcnow(True) - start_time + if elapsed.total_seconds() < overall_timeout: + return False + + msg = (_('Firmware update on node %(node)s has exceeded ' + 'the overall timeout of %(timeout)s seconds. ' + 'Elapsed time: %(elapsed)s seconds.') + % {'node': node.uuid, + 'timeout': overall_timeout, + 'elapsed': int(elapsed.total_seconds())}) + LOG.error(msg) + task.upgrade_lock() + self._clear_updates(node) + if task.node.clean_step: + manager_utils.cleaning_error_handler(task, msg) + elif task.node.deploy_step: + manager_utils.deploying_error_handler(task, msg) + else: + manager_utils.servicing_error_handler(task, msg, + traceback=False) + return True + @METRICS.timer('RedfishFirmware._check_node_redfish_firmware_update') def _check_node_redfish_firmware_update(self, task): """Check the progress of running firmware update on a node.""" node = task.node + # Check overall timeout for firmware update operation + if self._check_overall_timeout(task): + return + settings = node.driver_internal_info['redfish_fw_updates'] current_update = settings[0] @@ -421,6 +1292,13 @@ def _check_node_redfish_firmware_update(self, task): {'node': node.uuid, 'error': e}) return + # Touch provisioning to indicate progress is being monitored. + # This prevents heartbeat timeout from triggering for steps that + # don't require the ramdisk agent (requires_ramdisk=False). + # Note: Only touch after successful BMC communication to ensure + # the process eventually times out if the BMC is unresponsive. + node.touch_provisioning() + wait_start_time = current_update.get('wait_start_time') if wait_start_time: wait_start = timeutils.parse_isotime(wait_start_time) @@ -436,7 +1314,9 @@ current_update.pop('wait', None) current_update.pop('wait_start_time', None) - self._continue_updates(task, update_service, settings) + # Handle wait completion + self._handle_wait_completion( + task, update_service, settings, current_update) else: LOG.debug('Continuing to wait after firmware update ' '%(firmware_image)s on node %(node)s. ' @@ -450,6 +1330,14 @@ try: task_monitor = redfish_utils.get_task_monitor( node, current_update['task_monitor']) + except exception.RedfishConnectionError as e: + # If the BMC firmware is being updated, the BMC will be + # unavailable for some amount of time. + LOG.warning('Unable to communicate with task monitor service ' + 'on node %(node)s. Will try again on the next poll. 
' + 'Error: %(error)s', + {'node': node.uuid, 'error': e}) + return except exception.RedfishError: # The BMC deleted the Task before we could query it LOG.warning('Firmware update completed for node %(node)s, ' @@ -460,11 +1348,44 @@ def _check_node_redfish_firmware_update(self, task): self._continue_updates(task, update_service, settings) return + # Special handling for BIOS and NIC updates + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if task_monitor.is_processing and component_type == redfish_utils.BIOS: + # For BIOS, check if task has reached STARTING state + # and trigger reboot immediately + if self._handle_bios_task_starting(task, task_monitor, settings, + current_update): + return # Reboot triggered, done + # Task is still processing, keep polling + return + + if task_monitor.is_processing and component_type == redfish_utils.NIC: + # For NIC, wait 30s to see if hardware needs reboot + if self._handle_nic_task_starting(task, task_monitor, settings, + current_update): + return # Reboot triggered, done + # Task is still processing (or waiting), keep polling + return + if not task_monitor.is_processing: # The last response does not necessarily contain a Task, # so get it sushy_task = task_monitor.get_task() + # NOTE(iurygregory): Some BMCs (particularly HPE iLO) may return + # is_processing=False while the task is still in RUNNING, STARTING, + # or PENDING state. Only treat it as completion if the task state + # indicates it's actually finished. + if sushy_task.task_state in [sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_PENDING]: + LOG.debug('Firmware update task for node %(node)s is in ' + '%(state)s state. Continuing to poll.', + {'node': node.uuid, 'state': sushy_task.task_state}) + return + # Only parse the messages if the BMC did not return parsed # messages messages = [] @@ -472,35 +1393,17 @@ def _check_node_redfish_firmware_update(self, task): sushy_task.parse_messages() if sushy_task.messages is not None: - messages = [m.message for m in sushy_task.messages] + for m in sushy_task.messages: + msg = m.message + if not msg or msg.lower() in ['unknown', 'unknown error']: + msg = m.message_id + if msg: + messages.append(msg) task.upgrade_lock() - if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED - and sushy_task.task_status in - [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): - LOG.info('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', - {'node': node.uuid, - 'firmware_image': current_update['url'], - 'messages': ", ".join(messages)}) - - self._continue_updates(task, update_service, settings) - else: - error_msg = (_('Firmware update failed for node %(node)s, ' - 'firmware %(firmware_image)s. 
' - 'Error: %(errors)s') % - {'node': node.uuid, - 'firmware_image': current_update['url'], - 'errors': ", ".join(messages)}) - - self._clear_updates(node) - if task.node.clean_step: - manager_utils.cleaning_error_handler(task, error_msg) - elif task.node.deploy_step: - manager_utils.deploying_error_handler(task, error_msg) - elif task.node.service_step: - manager_utils.servicing_error_handler(task, error_msg) - + self._handle_task_completion(task, sushy_task, messages, + update_service, settings, + current_update) else: LOG.debug('Firmware update in progress for node %(node)s, ' 'firmware %(firmware_image)s.', diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 018c70f6f2..051abd03e2 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -22,6 +22,7 @@ from oslo_log import log from oslo_utils import timeutils import sushy +import tenacity from ironic.common import boot_devices from ironic.common import boot_modes @@ -91,6 +92,12 @@ BOOT_DEVICE_PERSISTENT_MAP_REV = {v: k for k, v in BOOT_DEVICE_PERSISTENT_MAP.items()} +VENDORS_REQUIRING_FULL_BOOT_REQUEST = [ + "american megatrends international", + "ami", + "asrockrack" +] + INDICATOR_MAP = { sushy.INDICATOR_LED_LIT: indicator_states.ON, sushy.INDICATOR_LED_OFF: indicator_states.OFF, @@ -110,6 +117,24 @@ }} +def _is_during_post_error(exc): + """Check if a Sushy exception is an 'UnableToModifyDuringSystemPOST'. + + HPE iLO BMCs reject boot device changes while the system is in POST + (Power-On Self-Test), typically after a firmware update or reboot. + + :param exc: An exception instance + :returns: True if this is a POST-related error, False otherwise + """ + if not isinstance(exc, sushy.exceptions.BadRequestError): + return False + is_post_error = 'UnableToModifyDuringSystemPOST' in str(exc) + if is_post_error: + LOG.debug('Detected UnableToModifyDuringSystemPOST error from BMC, ' + 'will trigger retry logic. Error: %s', exc) + return is_post_error + + def _set_boot_device(task, system, device, persistent=False, http_boot_url=None): """An internal routine to set the boot device. @@ -123,6 +148,10 @@ def _set_boot_device(task, system, device, persistent=False, :param http_boot_url: A string value to be sent to the sushy library, which is sent to the BMC as the url to boot from. :raises: SushyError on an error from the Sushy library + Vendor-specific logic: + - Vendors listed in VENDORS_REQUIRING_FULL_BOOT_REQUEST: + Require setting full boot parameters + (mode, enabled, target) even if unchanged. """ # The BMC handling of the persistent setting is vendor specific. @@ -132,6 +161,13 @@ def _set_boot_device(task, system, device, persistent=False, # persistent setting must be set when setting the boot device # (see https://storyboard.openstack.org/#!/story/2008547). 
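As a rough guide to tuning the two new [redfish] retry options, the tenacity policy used below (wait_exponential with multiplier=1, min=delay, max=6*delay) produces sleeps clamped to the [delay, 6*delay] window. An approximate standalone rendering of that schedule (tenacity's internal formula may vary slightly by version):

def backoff_schedule(attempts=6, delay=5):
    # Approximates tenacity.wait_exponential: multiplier * 2**attempt,
    # clamped between min and max. With the defaults this yields
    # roughly [5, 5, 8, 16, 30] seconds between the six attempts.
    sleeps = []
    for attempt in range(1, attempts):  # no sleep after the last try
        raw = 1 * (2 ** attempt)
        sleeps.append(min(max(raw, delay), delay * 6))
    return sleeps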
vendor = task.node.properties.get('vendor', None) + LOG.debug('Vendor: %(vendor)s for node %(uuid)s', + {'vendor': vendor, 'uuid': task.node.uuid}) + requires_full_boot_request = ( + vendor and any(vendor_id in vendor.lower() + for vendor_id in + VENDORS_REQUIRING_FULL_BOOT_REQUEST) + ) if vendor and vendor.lower() == 'supermicro': enabled = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] LOG.debug('Setting BootSourceOverrideEnable to %(enable)s ' @@ -144,11 +180,59 @@ # NOTE(etingof): this can be racy, esp if BMC is not RESTful enabled = (desired_enabled if desired_enabled != current_enabled else None) - try: + + # Logging callback for retry attempts (closure captures task) + def _log_post_boot_retry(retry_state): + LOG.warning('BMC is in POST, unable to modify boot device for ' + 'node %(node)s. Retrying in %(delay).1f seconds ' + '(attempt %(attempt)d/%(total)d)', + {'node': task.node.uuid, + 'delay': retry_state.next_action.sleep, + 'attempt': retry_state.attempt_number, + 'total': CONF.redfish.post_boot_retry_attempts}) + + @tenacity.retry( + retry=tenacity.retry_if_exception(_is_during_post_error), + stop=tenacity.stop_after_attempt( + CONF.redfish.post_boot_retry_attempts), + wait=tenacity.wait_exponential( + multiplier=1, + min=CONF.redfish.post_boot_retry_delay, + max=CONF.redfish.post_boot_retry_delay * 6), + before_sleep=_log_post_boot_retry, + reraise=True) + def _do_set_boot_options(): # NOTE(TheJulia): In sushy, it is uri, due to the convention used # in the standard. URL is used internally in ironic. - system.set_system_boot_options(device, enabled=enabled, - http_boot_uri=http_boot_url) + if requires_full_boot_request: + # Some vendors require sending all boot parameters every + # time, even when unchanged, so always send the + # persistence-derived enabled value instead of the delta. + desired_mode = system.boot.get('mode') \ + or sushy.BOOT_SOURCE_MODE_UEFI + full_enabled = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] + + LOG.debug('Vendor "%(vendor)s" requires full boot settings. ' + 'Sending: mode=%(mode)s, enabled=%(enabled)s, ' + 'target=%(target)s for node %(node)s', + {'vendor': vendor, 'mode': desired_mode, + 'enabled': full_enabled, + 'target': device, 'node': task.node.uuid}) + system.set_system_boot_options( + device, + mode=desired_mode, + enabled=full_enabled, + http_boot_uri=http_boot_url + ) + else: + LOG.debug('Sending minimal Redfish boot device change for ' + 'node %(node)s', {'node': task.node.uuid}) + system.set_system_boot_options(device, enabled=enabled, + http_boot_uri=http_boot_url) + + try: + _do_set_boot_options() except sushy.exceptions.SushyError as e: if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: # NOTE(dtantsur): continuous boot device settings have been @@ -336,13 +420,54 @@ def set_boot_mode(self, task, mode): """ system = redfish_utils.get_system(task.node) + vendor = task.node.properties.get('vendor', None) + requires_full_boot_request = ( + vendor and any(vendor_id in vendor.lower() + for vendor_id in + VENDORS_REQUIRING_FULL_BOOT_REQUEST) + ) + + LOG.debug('Requested to set boot mode to "%(mode)s" for node ' + '%(node)s (vendor: %(vendor)s)', + {'mode': mode, 'node': task.node.uuid, 'vendor': vendor}) + # NOTE(dtantsur): check the readability of the current mode before # modifying anything. I suspect it can become None transiently after # the update, while we need to know if it is supported *at all*. 
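On the wire, the difference between the minimal and the vendor-specific "full" request is simply how much of the Redfish Boot object gets re-sent (property names per the Redfish ComputerSystem schema; values illustrative):

minimal_patch = {
    'Boot': {'BootSourceOverrideTarget': 'Pxe'}
}
full_patch = {
    'Boot': {
        'BootSourceOverrideMode': 'UEFI',
        'BootSourceOverrideEnabled': 'Once',
        'BootSourceOverrideTarget': 'Pxe',
    }
}
# The AMI-family BMCs in VENDORS_REQUIRING_FULL_BOOT_REQUEST reject or
# ignore the minimal form, so every field is re-sent even if unchanged.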
get_mode_unsupported = (system.boot.get('mode') is None) + desired_mode = BOOT_MODE_MAP_REV[mode] + LOG.debug('Desired Redfish boot mode for node %(node)s ' + 'is: %(mode)s', + {'node': task.node.uuid, + 'mode': desired_mode}) + try: - system.set_system_boot_options(mode=BOOT_MODE_MAP_REV[mode]) + if requires_full_boot_request: + current_enabled = system.boot.get('enabled') \ + or sushy.BOOT_SOURCE_ENABLED_ONCE + current_target = system.boot.get('target') \ + or sushy.BOOT_SOURCE_TARGET_PXE + + LOG.debug('Vendor "%(vendor)s" requires full boot settings. ' + 'Sending: mode=%(mode)s, enabled=%(enabled)s, ' + 'target=%(target)s for node %(node)s', + {'vendor': vendor, 'mode': desired_mode, + 'enabled': current_enabled, 'node': task.node.uuid, + 'target': current_target + }) + + system.set_system_boot_options( + mode=desired_mode, + enabled=current_enabled, + target=current_target + ) + else: + LOG.debug('Sending minimal Redfish boot mode ' + 'change for node %(node)s', + {'node': task.node.uuid}) + system.set_system_boot_options(mode=desired_mode) except sushy.exceptions.SushyError as e: error_msg = (_('Setting boot mode to %(mode)s ' 'failed for node %(node)s. ' diff --git a/ironic/drivers/modules/redfish/utils.py b/ironic/drivers/modules/redfish/utils.py index 1504379a6a..af97cc3c63 100644 --- a/ironic/drivers/modules/redfish/utils.py +++ b/ironic/drivers/modules/redfish/utils.py @@ -94,11 +94,35 @@ NIC_COMPONENT_PREFIX = "nic:" "Prefix for NIC Firmware Components" +NIC = "nic" +"NIC Firmware Component type" FIRMWARE_COMPONENTS = [BIOS, BMC] """Firmware Components available to update""" +def get_component_type(component): + """Determine the type of firmware component. + + Note: This helper exists primarily to handle NIC components which use + a prefix pattern (e.g., 'nic:BCM57414', 'nic:adapter1') rather than + exact string matches. This centralizes the component type detection + logic that is used in multiple places (update initiation, task + monitoring, completion handling) and provides a single source of truth + for component type classification. + + :param component: The component name from settings + :returns: One of 'bios', 'bmc', 'nic', or None + """ + if component == BIOS: + return BIOS + elif component == BMC: + return BMC + elif component.startswith(NIC_COMPONENT_PREFIX): + return NIC + return None + + def parse_driver_info(node): """Parse the information required for Ironic to connect to Redfish. diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 883a4a2ece..edf9b41076 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -25,6 +25,7 @@ from ironic.conductor import task_manager from ironic.conductor import utils as manager_utils from ironic.conf import CONF +from ironic.drivers.modules import deploy_utils from ironic.drivers.modules.redfish import firmware as redfish_fw from ironic.drivers.modules.redfish import firmware_utils from ironic.drivers.modules.redfish import utils as redfish_utils @@ -49,6 +50,41 @@ def setUp(self): self.node = obj_utils.create_test_node( self.context, driver='redfish', driver_info=INFO_DICT) + def _mock_exc_fwup_side_effect(self, firmware_interface, node, + update_service, settings_list): + """Helper to simulate _execute_firmware_update behavior. + + The real _execute_firmware_update: + 1. Adds a task_monitor field to the settings + 2. 
Calls component-specific setup methods + + This helper replicates that behavior for tests that mock + this method to avoid JSON serialization issues. + + :param firmware_interface: The firmware interface instance whose + component setup methods are invoked + :param node: The node being updated + :param update_service: The update service + :param settings_list: The settings list + """ + settings_list[0]['task_monitor'] = '/redfish/v1/TaskService/Tasks/1' + + # Simulate component-specific setup that now happens inside + # _execute_firmware_update + fw_upd = settings_list[0] + component = fw_upd.get('component', '') + + # Call the actual setup method based on component type + # This ensures the driver_internal_info is set correctly + if component == redfish_utils.BMC: + firmware_interface._setup_bmc_update_monitoring(node, fw_upd) + elif component.startswith(redfish_utils.NIC_COMPONENT_PREFIX): + firmware_interface._setup_nic_update_monitoring(node) + elif component == redfish_utils.BIOS: + firmware_interface._setup_bios_update_monitoring(node) + else: + firmware_interface._setup_default_update_monitoring(node, fw_upd) + def test_get_properties(self): with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: @@ -261,12 +297,219 @@ def test_get_chassis_redfish_error(self, sync_fw_cmp_mock, system_mock, shared=True) as task: task.driver.firmware.cache_firmware_components(task) - log_mock.warning.assert_any_call( + log_mock.debug.assert_any_call( 'No chassis available to retrieve NetworkAdapters firmware ' 'information on node %(node_uuid)s', {'node_uuid': self.node.uuid} ) + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_redfish_connection_error( + self, sync_fw_cmp_mock, manager_mock, system_mock, log_mock): + """Test that RedfishConnectionError during NIC retrieval is handled.""" + system_mock.return_value.identity = "System1" + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.identity = "Manager1" + manager_mock.return_value.firmware_version = '1.0.0' + + sync_fw_cmp_mock.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(task.driver.firmware, + 'retrieve_nic_components', + autospec=True) as mock_retrieve: + connection_error = exception.RedfishError( + 'Connection failed') + mock_retrieve.side_effect = connection_error + + task.driver.firmware.cache_firmware_components(task) + + # Verify warning log for exception is called + log_mock.warning.assert_any_call( + 'Unable to access NetworkAdapters on node %(node_uuid)s, ' + 'Error: %(error)s', + {'node_uuid': self.node.uuid, 'error': connection_error} + ) + + # Verify debug log for empty NIC list is NOT called + # (since we caught an exception, not an empty list) + debug_calls = [call for call in log_mock.debug.call_args_list + if 'Could not retrieve Firmware Package Version from ' + 'NetworkAdapters' in str(call)] + self.assertEqual(len(debug_calls), 0) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + 
@mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_sushy_bad_request_error( + self, sync_fw_cmp_mock, manager_mock, system_mock, log_mock): + """Test that sushy BadRequestError during NIC retrieval is handled.""" + system_mock.return_value.identity = "System1" + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.identity = "Manager1" + manager_mock.return_value.firmware_version = '1.0.0' + + sync_fw_cmp_mock.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(task.driver.firmware, + 'retrieve_nic_components', + autospec=True) as mock_retrieve: + bad_request_error = sushy.exceptions.BadRequestError( + method='GET', url='/redfish/v1/Chassis/1/NetworkAdapters', + response=mock.Mock(status_code=400)) + mock_retrieve.side_effect = bad_request_error + + task.driver.firmware.cache_firmware_components(task) + + # Verify warning log for exception is called + log_mock.warning.assert_any_call( + 'Unable to access NetworkAdapters on node %(node_uuid)s, ' + 'Error: %(error)s', + {'node_uuid': self.node.uuid, 'error': bad_request_error} + ) + + # Verify debug log for empty NIC list is NOT called + # (since we caught an exception, not an empty list) + debug_calls = [call for call in log_mock.debug.call_args_list + if 'Could not retrieve Firmware Package Version from ' + 'NetworkAdapters' in str(call)] + self.assertEqual(len(debug_calls), 0) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + @mock.patch.object(objects, 'FirmwareComponent', spec_set=True, + autospec=True) + def test_retrieve_nic_components_invalid_firmware_version( + self, fw_cmp_mock, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that NIC components with missing versions are skipped.""" + for invalid_version in [None, ""]: + fw_cmp_list.reset_mock() + fw_cmp_mock.reset_mock() + log_mock.reset_mock() + + create_list = [{'component': 'bios', 'current_version': 'v1.0.0'}, + {'component': 'bmc', 'current_version': 'v1.0.0'}] + fw_cmp_list.sync_firmware_components.return_value = ( + create_list, [], [] + ) + + bios_component = {'component': 'bios', + 'current_version': 'v1.0.0', + 'node_id': self.node.id} + + bmc_component = {'component': 'bmc', 'current_version': 'v1.0.0', + 'node_id': self.node.id} + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + manager_mock.return_value.firmware_version = "v1.0.0" + system_mock.return_value.bios_version = "v1.0.0" + + netadp_ctrl = mock.MagicMock() + netadp_ctrl.firmware_package_version = invalid_version + netadp = mock.MagicMock() + netadp.identity = 'NIC1' + netadp.controllers = [netadp_ctrl] + net_adapters = mock.MagicMock() + net_adapters.get_members.return_value = [netadp] + chassis_mock.return_value.network_adapters = net_adapters + task.driver.firmware.cache_firmware_components(task) + + fw_cmp_list.sync_firmware_components.assert_called_once_with( + task.context, task.node.id, + [{'component': 'bios', 'current_version': 'v1.0.0'}, + {'component': 'bmc', 'current_version': 
'v1.0.0'}]) + + fw_cmp_calls = [ + mock.call(task.context, **bios_component), + mock.call().create(), + mock.call(task.context, **bmc_component), + mock.call().create() + ] + fw_cmp_mock.assert_has_calls(fw_cmp_calls) + log_mock.warning.assert_not_called() + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_network_adapters_none( + self, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that None network_adapters is handled gracefully.""" + fw_cmp_list.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.firmware_version = '1.0.0' + # network_adapters is None + chassis_mock.return_value.network_adapters = None + + task.driver.firmware.cache_firmware_components(task) + + # Should log at debug level, not warning + log_mock.debug.assert_any_call( + 'NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': self.node.uuid} + ) + log_mock.warning.assert_not_called() + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_missing_attribute_error( + self, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that MissingAttributeError is handled gracefully.""" + fw_cmp_list.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.firmware_version = '1.0.0' + # network_adapters raises MissingAttributeError + type(chassis_mock.return_value).network_adapters = ( + mock.PropertyMock( + side_effect=sushy.exceptions.MissingAttributeError)) + + task.driver.firmware.cache_firmware_components(task) + + # Should log at debug level, not warning + log_mock.debug.assert_any_call( + 'NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': self.node.uuid} + ) + log_mock.warning.assert_not_called() + @mock.patch.object(redfish_utils, 'LOG', autospec=True) @mock.patch.object(redfish_utils, '_get_connection', autospec=True) def test_missing_updateservice(self, conn_mock, log_mock): @@ -535,6 +778,121 @@ def _test__check_node_redfish_firmware_update(self): firmware._check_node_redfish_firmware_update(task) return task, firmware + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_check_calls_touch_provisioning(self, mock_task_monitor, + mock_get_update_service): + """Test _check_node_redfish_firmware_update calls touch_provisioning. 
+ + This prevents heartbeat timeouts for firmware updates that don't + require the ramdisk agent (requires_ramdisk=False). By calling + touch_provisioning on each poll, we keep provision_updated_at fresh. + """ + self._generate_new_driver_internal_info(['bmc']) + + # Mock task still in progress + mock_task_monitor.return_value.is_processing = True + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was called + mock_touch.assert_called_once_with() + + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_check_skips_touch_provisioning_on_conn_error( + self, mock_get_update_service): + """Test touch_provisioning is NOT called when BMC connection fails. + + When the BMC is unresponsive, we should NOT update + provision_updated_at. This ensures the process eventually times + out if the BMC never recovers, rather than being kept alive. + """ + self._generate_new_driver_internal_info(['bmc']) + + # Mock connection error + mock_get_update_service.side_effect = exception.RedfishConnectionError( + 'Connection failed') + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was NOT called on connection error + mock_touch.assert_not_called() + + @mock.patch.object(redfish_fw.manager_utils, 'servicing_error_handler', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_check_overall_timeout_exceeded(self, mock_get_update_service, + mock_error_handler): + """Test firmware update fails when overall timeout is exceeded. + + This ensures firmware updates don't run indefinitely - if the + overall timeout is exceeded, the update should fail with an error. 
+ """ + self._generate_new_driver_internal_info(['bmc']) + + # Set start time to 3 hours ago (exceeds 2 hour default timeout) + past_time = (timeutils.utcnow() + - datetime.timedelta(hours=3)).isoformat() + self.node.set_driver_internal_info('redfish_fw_update_start_time', + past_time) + self.node.save() + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + firmware._check_node_redfish_firmware_update(task) + + # Verify error handler was called with timeout message + mock_error_handler.assert_called_once() + call_args = mock_error_handler.call_args + self.assertIn('exceeded', call_args[0][1].lower()) + self.assertIn('timeout', call_args[0][1].lower()) + + # Verify the firmware update info was cleaned up + task.node.refresh() + self.assertIsNone( + task.node.driver_internal_info.get('redfish_fw_updates')) + self.assertIsNone( + task.node.driver_internal_info.get( + 'redfish_fw_update_start_time')) + + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_check_overall_timeout_not_exceeded(self, mock_task_monitor, + mock_get_update_service): + """Test firmware update continues when timeout not exceeded.""" + self._generate_new_driver_internal_info(['bmc']) + + # Set start time to 1 hour ago (within 2 hour default timeout) + past_time = (timeutils.utcnow() + - datetime.timedelta(hours=1)).isoformat() + self.node.set_driver_internal_info('redfish_fw_update_start_time', + past_time) + self.node.save() + + # Mock task still in progress + mock_task_monitor.return_value.is_processing = True + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was called (update continues) + mock_touch.assert_called_once_with() + @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_check_conn_error(self, get_us_mock, log_mock): @@ -634,6 +992,40 @@ def test__check_update_in_progress(self, tm_mock, get_us_mock, log_mock): interface._continue_updates.assert_not_called() + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test__check_update_task_state(self, tm_mock, get_us_mock, log_mock): + """Test task with is_processing=False but still in active state. + + Some BMCs (particularly HPE iLO) may return is_processing=False + while the task is still in RUNNING, STARTING, or PENDING state. + The update should continue polling and not be treated as complete. + """ + self._generate_new_driver_internal_info(['bmc']) + + # Test each of the three active states + for task_state in [sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_PENDING]: + log_mock.reset_mock() + + tm_mock.return_value.is_processing = False + mock_task = tm_mock.return_value.get_task.return_value + mock_task.task_state = task_state + mock_task.task_status = sushy.HEALTH_OK + + _, interface = self._test__check_node_redfish_firmware_update() + + # Verify the new debug log message + debug_calls = [ + mock.call('Firmware update task for node %(node)s is in ' + '%(state)s state. 
Continuing to poll.', + {'node': self.node.uuid, 'state': task_state})] + + log_mock.debug.assert_has_calls(debug_calls) + interface._continue_updates.assert_not_called() + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) @@ -692,11 +1084,14 @@ def test__check_node_firmware_update_fail_servicing( servicing_error_handler_mock.assert_called_once() interface._continue_updates.assert_not_called() + @mock.patch.object(redfish_fw.RedfishFirmware, + '_handle_bmc_update_completion', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, - log_mock): + log_mock, + bmc_completion_mock): task_mock = mock.Mock() task_mock.task_state = sushy.TASK_STATE_COMPLETED task_mock.task_status = sushy.HEALTH_OK @@ -711,19 +1106,25 @@ def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, task, interface = self._test__check_node_redfish_firmware_update() task.upgrade_lock.assert_called_once_with() - info_calls = [ - mock.call('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', + debug_calls = [ + mock.call('Redfish task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s.', {'node': self.node.uuid, 'firmware_image': 'https://bmc/v1.0.1', 'messages': 'Firmware update done'})] - log_mock.info.assert_has_calls(info_calls) + log_mock.debug.assert_has_calls(debug_calls) + # NOTE(iurygregory): _validate_resources_stability is now called + # in _continue_updates before power operations, not in + # _handle_task_completion - interface._continue_updates.assert_called_once_with( - task, get_us_mock.return_value, + # BMC updates now go through _handle_bmc_update_completion + bmc_completion_mock.assert_called_once_with( + interface, task, get_us_mock.return_value, [{'component': 'bmc', 'url': 'https://bmc/v1.0.1', - 'task_monitor': '/task/1'}] + 'task_monitor': '/task/1'}], + {'component': 'bmc', 'url': 'https://bmc/v1.0.1', + 'task_monitor': '/task/1'} ) @mock.patch.object(firmware_utils, 'download_to_temp', autospec=True) @@ -816,14 +1217,19 @@ def test_continue_update_waitting(self, log_mock): ] log_mock.debug.assert_has_calls(debug_call) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', autospec=True) - def test_continue_updates_last(self, cond_resume_clean_mock, log_mock): + def test_continue_updates_last(self, cond_resume_clean_mock, log_mock, + validate_mock): self._generate_new_driver_internal_info(['bmc']) task = self._test_continue_updates() cond_resume_clean_mock.assert_called_once_with(task) + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() info_call = [ mock.call('Firmware updates completed for node %(node)s', @@ -831,15 +1237,19 @@ def test_continue_updates_last(self, cond_resume_clean_mock, log_mock): ] log_mock.info.assert_has_calls(info_call) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 
'notify_conductor_resume_service', autospec=True) def test_continue_updates_last_service(self, cond_resume_service_mock, - log_mock): + log_mock, validate_mock): self._generate_new_driver_internal_info_service(['bmc']) task = self._test_continue_updates() cond_resume_service_mock.assert_called_once_with(task) + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() info_call = [ mock.call('Firmware updates completed for node %(node)s', @@ -847,12 +1257,15 @@ def test_continue_updates_last_service(self, cond_resume_service_mock, ] log_mock.info.assert_has_calls(info_call) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 'node_power_action', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) def test_continue_updates_more_updates(self, get_system_collection_mock, node_power_action_mock, - log_mock): + log_mock, + validate_mock): cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 0, 'redfish') self._generate_new_driver_internal_info(['bmc', 'bios']) @@ -880,12 +1293,18 @@ def test_continue_updates_more_updates(self, get_system_collection_mock, log_mock.debug.assert_has_calls(debug_calls) self.assertEqual( [{'component': 'bios', 'url': 'https://bios/v1.0.1', - 'task_monitor': '/task/2'}], + 'task_monitor': '/task/2', 'power_timeout': 300}], task.node.driver_internal_info['redfish_fw_updates']) update_service_mock.simple_update.assert_called_once_with( 'https://bios/v1.0.1') - task.node.save.assert_called_once_with() - node_power_action_mock.assert_called_once_with(task, states.REBOOT) + # NOTE(iurygregory): node.save() is called twice: + # 1. Inside _execute_firmware_update via setup methods + # 2. 
In _continue_updates after _execute_firmware_update returns + self.assertEqual(task.node.save.call_count, 2) + # Verify BMC validation was called before continuing to next update + validate_mock.assert_called_once_with(firmware, task.node) + node_power_action_mock.assert_called_once_with(task, states.REBOOT, + 300) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) @@ -941,12 +1360,15 @@ def test__execute_firmware_update_targets(self, update_service_mock.simple_update.assert_called_once_with( 'https://bios/v1.0.1', targets=[mock.ANY]) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) - @mock.patch.object(time, 'sleep', autospec=True) - def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock, + def test__execute_firmware_update_unresponsive_bmc(self, get_sys_collec_mock, - system_mock): + system_mock, + manager_mock, + set_async_mock): cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 1, 'redfish') self._generate_new_driver_internal_info(['bmc']) @@ -958,27 +1380,31 @@ def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock, system_collection_mock.get_members.return_value = resp_obj['Members'] get_sys_collec_mock.return_value = system_collection_mock + # Mock BMC version reading for setup + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + manager_mock.return_value = mock_manager + task_monitor_mock = mock.Mock() task_monitor_mock.task_monitor_uri = '/task/2' update_service_mock = mock.Mock() update_service_mock.simple_update.return_value = task_monitor_mock - firmware = redfish_fw.RedfishFirmware() - settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}] firmware._execute_firmware_update(self.node, update_service_mock, settings) - update_service_mock.simple_update.assert_called_once_with( 'https://bmc/v1.2.3') - sleep_mock.assert_called_once_with( - CONF.redfish.firmware_update_wait_unresponsive_bmc) - + # Verify BMC monitoring setup was called (internally by _execute) + set_async_mock.assert_called_once_with( + self.node, reboot=False, polling=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) - @mock.patch.object(time, 'sleep', autospec=True) def test__execute_firmware_update_unresponsive_bmc_node_override( - self, sleep_mock, get_sys_collec_mock, system_mock): + self, get_sys_collec_mock, system_mock, manager_mock, + set_async_mock): self._generate_new_driver_internal_info(['bmc']) # Set a specific value for firmware_update_unresponsive_bmc_wait for # the node @@ -987,12 +1413,10 @@ def test__execute_firmware_update_unresponsive_bmc_node_override( d_info['firmware_update_unresponsive_bmc_wait'] = 1 self.node.driver_info = d_info self.node.save() - self.assertNotEqual( CONF.redfish.firmware_update_wait_unresponsive_bmc, self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') ) - with open( 'ironic/tests/json_samples/systems_collection_single.json' ) as f: @@ -1001,18 +1425,1101 @@ def test__execute_firmware_update_unresponsive_bmc_node_override( 
system_collection_mock.get_members.return_value = resp_obj['Members'] get_sys_collec_mock.return_value = system_collection_mock + # Mock BMC version reading for setup + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + manager_mock.return_value = mock_manager + task_monitor_mock = mock.Mock() task_monitor_mock.task_monitor_uri = '/task/2' update_service_mock = mock.Mock() update_service_mock.simple_update.return_value = task_monitor_mock - firmware = redfish_fw.RedfishFirmware() settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}] firmware._execute_firmware_update(self.node, update_service_mock, settings) - update_service_mock.simple_update.assert_called_once_with( 'https://bmc/v1.2.3') - sleep_mock.assert_called_once_with( - self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') - ) + # Verify BMC monitoring setup was called (internally by _execute) + set_async_mock.assert_called_once_with( + self.node, reboot=False, polling=True) + + def test__validate_resources_stability_success(self): + """Test successful BMC resource validation with consecutive success.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] # 3 successful attempts + + # Should complete successfully after 3 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times (required success) + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + # Verify sleep was called between validation attempts + expected_calls = [mock.call( + CONF.redfish.firmware_update_validation_interval)] * 2 + sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_timeout(self): + """Test BMC resource validation timeout when not achieved.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True), \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock system always failing + system_mock.side_effect = exception.RedfishConnectionError( + 'timeout') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 500] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + def test__validate_resources_stability_intermittent_failures( + self, mock_log): + """Test BMC resource validation with 
intermittent failures.""" + cfg.CONF.set_override('firmware_update_required_successes', 3, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 10, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock intermittent failures: success, success, fail, + # success, success, success + # When system_mock raises exception, other calls are not made + call_count = 0 + + def system_side_effect(*args): + nonlocal call_count + call_count += 1 + if call_count == 3: # Third call fails + raise exception.RedfishConnectionError('error') + return mock.Mock() + + system_mock.side_effect = system_side_effect + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (6 attempts total) + time_mock.side_effect = [0, 10, 20, 30, 40, 50, 60] + + # Should eventually succeed after counter reset + firmware._validate_resources_stability(task.node) + + # Verify all 6 attempts were made for system + self.assertEqual(system_mock.call_count, 6) + # Manager and chassis called only 5 times (not on failed) + self.assertEqual(manager_mock.call_count, 5) + self.assertEqual(chassis_mock.call_count, 5) + + # Verify verbose logging about BMC recovery was called + expected_log_call = mock.call( + 'BMC resource validation failed for node %(node)s: ' + '%(error)s. 
This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': task.node.uuid, 'error': mock.ANY}) + mock_log.debug.assert_has_calls([expected_log_call]) + + def test__validate_resources_stability_manager_failure(self): + """Test BMC resource validation when Manager resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system success, manager failure + system_mock.return_value = mock.Mock() + manager_mock.side_effect = exception.RedfishError( + 'manager error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 500] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_network_adapters_failure(self): + """Test validation when NetworkAdapters resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system and manager success, NetworkAdapters failure + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + chassis_mock.side_effect = exception.RedfishError( + 'chassis error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 500] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_custom_config(self): + """Test BMC resource validation with custom configuration values.""" + cfg.CONF.set_override('firmware_update_required_successes', 5, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 5, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (5 successful attempts) + time_mock.side_effect = [0, 5, 10, 15, 20, 25] + + # Should complete successfully after 5 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources checked 5 times (custom required) + self.assertEqual(system_mock.call_count, 5) + self.assertEqual(manager_mock.call_count, 5) + 
self.assertEqual(chassis_mock.call_count, 5) + + # Verify sleep was called with custom interval + expected_calls = [mock.call(5)] * 4 # 4 sleeps between 5 attempts + sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_network_adapters_none(self): + """Test validation succeeds when network_adapters is None.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock successful resource responses but network_adapters None + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + chassis_mock.return_value.network_adapters = None + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] + + # Should complete successfully (None network_adapters is OK) + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + def test__validate_resources_stability_network_adapters_missing_attr(self): + """Test validation succeeds when network_adapters is missing.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + # network_adapters raises MissingAttributeError + type(chassis_mock.return_value).network_adapters = ( + mock.PropertyMock( + side_effect=sushy.exceptions.MissingAttributeError)) + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] + + # Should complete successfully (missing network_adapters is OK) + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + def test__validate_resources_stability_badrequest_error(self, mock_log): + """Test BMC resource validation handles BadRequestError correctly.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock BadRequestError from sushy with proper arguments + mock_response = mock.Mock() + mock_response.status_code = 400 + system_mock.side_effect = sushy.exceptions.BadRequestError( + method='GET', url='http://test', 
response=mock_response) + + # Mock time progression: start at 0, try once at 10, time out + # at 500; this allows at least one loop iteration to trigger + # the exception + time_mock.side_effect = [0, 10, 500] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + # Verify verbose logging about BMC recovery was called + expected_log_call = mock.call( + 'BMC resource validation failed for node %(node)s: ' + '%(error)s. This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': task.node.uuid, 'error': mock.ANY}) + mock_log.debug.assert_has_calls([expected_log_call]) + + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_uses_configured_timeout(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_get_system, + mock_get_manager): + """Test BMC firmware update sets up version checking.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0'}] + + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) + self.assertIn('bmc_fw_version_before_update', info) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_uses_bmc_constant(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_get_system, + mock_get_manager): + """Test BMC firmware update detection works with BMC constant.""" + settings = [{'component': redfish_utils.BMC, + 'url': 'http://bmc/v1.0.0'}] + + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + 
task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) + self.assertIn('bmc_fw_version_before_update', info) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_non_bmc_uses_wait_parameter(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test non-BMC firmware update with wait parameter (obsolete).""" + # NOTE: This test is kept for historical reference but the wait + # parameter on BIOS updates is no longer used as BIOS reboots + # immediately when task starts rather than waiting + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0', + 'wait': 120}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True is set for BIOS + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_non_bmc_no_wait_parameter(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test non-BMC firmware update without wait parameter.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0'}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True is set for BIOS + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_mixed_components_with_bmc(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test mixed component update with BIOS and BMC.""" + settings = [ + {'component': 'bios', 'url': 'http://bios/v1.0.0', 'wait': 120}, + {'component': 'bmc', 'url': 'http://bmc/v1.0.0', 'wait': 60} + ] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with 
task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # First component is BIOS, so reboot=True + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_with_explicit_wait(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test BMC update with explicit wait.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'wait': 90}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify wait time is stored + info = task.node.driver_internal_info + fw_updates = info['redfish_fw_updates'] + self.assertEqual(90, fw_updates[0]['wait']) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_no_immediate_reboot(self, mock_get_update_service, + mock_execute_fw_update, + mock_get_system, + mock_get_manager, + mock_set_async_flags): + """Test BMC firmware update does not set immediate reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0'}] + + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=False for BMC updates + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) + self.assertIn('bmc_fw_version_before_update', info) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', 
autospec=True) + def test_update_nic_no_immediate_reboot(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test NIC firmware update sets reboot flag, waits for task.""" + settings = [{'component': 'nic:BCM57414', 'url': 'http://nic/v1.0.0'}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True for NIC updates (reboot is conditional) + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bios_sets_reboot_flag(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test BIOS firmware update sets reboot flag.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0'}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node as if in service step + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True for BIOS updates + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + @mock.patch.object(timeutils, 'parse_isotime', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_get_current_bmc_version', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_bmc_version_check_timeout_sets_reboot_flag( + self, mock_get_update_service, mock_get_bmc_version, + mock_set_async_flags, mock_continue_updates, + mock_parse_isotime, mock_utcnow): + """Test BMC version check timeout sets reboot request flag.""" + start_time = datetime.datetime(2025, 1, 1, 0, 0, 0, + tzinfo=datetime.timezone.utc) + current_time = start_time + datetime.timedelta(seconds=301) + mock_parse_isotime.return_value = start_time + mock_utcnow.return_value = current_time + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'wait': 300, 'task_monitor': '/tasks/1', + 'bmc_check_start_time': '2025-01-01T00:00:00.000000'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BMC version checking in progress + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + + # Mock BMC is unresponsive + mock_get_bmc_version.return_value = None + + # Call the BMC update completion handler + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._handle_bmc_update_completion( + task, mock_get_update_service.return_value, 
+ settings, settings[0]) + + # Verify reboot flag is set + info = task.node.driver_internal_info + self.assertTrue(info.get('firmware_reboot_requested')) + + # Verify async flags updated with reboot=True + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + + # Verify _continue_updates was called + mock_continue_updates.assert_called_once() + + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_nic_completion_sets_reboot_flag( + self, mock_get_task_monitor, mock_get_update_service, + mock_validate_resources, mock_set_async_flags, + mock_continue_updates): + """Test NIC firmware task completion sets reboot request flag.""" + settings = [{'component': 'nic:BCM57414', + 'url': 'http://nic/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with NIC update in progress + # Set nic_needs_post_completion_reboot to simulate hardware + # that started update immediately but needs reboot after completion + settings[0]['nic_needs_post_completion_reboot'] = True + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + + # Mock task completion + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot flag is set + info = task.node.driver_internal_info + self.assertTrue(info.get('firmware_reboot_requested')) + + # Verify _continue_updates was called + mock_continue_updates.assert_called_once() + + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_clear_updates', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_final_update_with_reboot_flag_triggers_reboot( + self, mock_get_update_service, mock_clear_updates, + mock_power_action, mock_resume_clean, validate_mock): + """Test final firmware update with reboot flag triggers reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node as if in cleaning + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Set up final update with reboot requested + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + task.node.set_driver_internal_info( + 'firmware_reboot_requested', True) + + # Call _continue_updates with last firmware + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._continue_updates( + task, 
mock_get_update_service.return_value, settings) + + # Verify reboot was triggered + mock_power_action.assert_called_once_with(task, states.REBOOT) + + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() + + # Verify resume clean was called + mock_resume_clean.assert_called_once_with(task) + + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_clear_updates', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_final_update_without_reboot_flag_no_reboot( + self, mock_get_update_service, mock_clear_updates, + mock_power_action, mock_resume_clean, validate_mock): + """Test final firmware update without reboot flag skips reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node as if in cleaning + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Set up final update WITHOUT reboot requested + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + # Don't set firmware_reboot_requested + + # Call _continue_updates with last firmware + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._continue_updates( + task, mock_get_update_service.return_value, settings) + + # Verify reboot was NOT triggered + mock_power_action.assert_not_called() + + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() + + # Verify resume clean was still called + mock_resume_clean.assert_called_once_with(task) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_reboot_on_task_starting( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action): + """Test BIOS update triggers reboot when task reaches STARTING.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor to return is_processing=True + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = True + mock_get_task_monitor.return_value = mock_task_monitor + + # Mock the task state as STARTING + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_STARTING + mock_task_monitor.get_task.return_value = mock_task + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot was triggered + mock_power_action.assert_called_once_with(task, states.REBOOT, 0) + + # Verify the flag was set to prevent repeated reboots + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertTrue(updated_settings[0].get('bios_reboot_triggered')) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) +
@mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_no_repeated_reboot_after_flag_set( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action): + """Test BIOS update doesn't reboot again after flag is set.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1', + 'bios_reboot_triggered': True}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress and flag already set + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor to return is_processing=True + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = True + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot was NOT triggered again + mock_power_action.assert_not_called() + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_reboot_on_completion_without_prior_reboot( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action, mock_continue_updates, mock_log): + """Test BIOS task completion triggers reboot when not triggered before. + + This test verifies the alternate path where a BIOS firmware update + task completes very quickly (e.g., HPE iLO staging firmware) before + Ironic can trigger a reboot during the STARTING state. In this case, + when the task reaches COMPLETED state and bios_reboot_triggered is + not set, we should: + 1. Trigger a reboot to apply the staged firmware + 2. NOT call _continue_updates (return early) + 3.
Set the bios_reboot_triggered flag + """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + # Note: bios_reboot_triggered is NOT set + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor showing task is completed + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot WAS triggered + mock_power_action.assert_called_once_with(task, states.REBOOT, 0) + + # Verify _continue_updates was NOT called (early return) + mock_continue_updates.assert_not_called() + + # Verify the flag was set to prevent repeated reboots + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertTrue(updated_settings[0].get('bios_reboot_triggered')) + + # Verify LOG.info was called with the correct message + mock_log.info.assert_any_call( + 'BIOS firmware update task completed for node ' + '%(node)s but reboot was not triggered yet. ' + 'Triggering reboot now to apply staged firmware.', + {'node': task.node.uuid}) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_continue_after_completion_with_prior_reboot( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action, mock_continue_updates, mock_log): + """Test BIOS task completion continues when reboot already triggered. + + This test verifies the else path where a BIOS firmware update task + completes and the reboot was already triggered (during STARTING state). + In this case, when the task reaches COMPLETED state and + bios_reboot_triggered is already set, we should: + 1. NOT trigger another reboot + 2. Call _continue_updates to proceed with next firmware + 3.
Clean up the bios_reboot_triggered flag + """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1', + 'bios_reboot_triggered': True}] # Flag already set + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + # Note: bios_reboot_triggered IS set (reboot already happened) + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor showing task is completed + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot was NOT triggered (already happened) + mock_power_action.assert_not_called() + + # Verify _continue_updates WAS called + mock_continue_updates.assert_called_once_with( + firmware_interface, task, mock_get_update_service.return_value, + settings) + + # Verify the flag was cleaned up (popped from settings) + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertIsNone(updated_settings[0].get('bios_reboot_triggered')) + + # Verify LOG.info was called with the correct message + mock_log.info.assert_any_call( + 'BIOS firmware update task completed for node ' + '%(node)s. System was already rebooted. ' + 'Proceeding with continuation.', + {'node': task.node.uuid}) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_handle_task_starting_sets_flag_correctly( + self, mock_get_task_monitor, mock_power_action): + """Test _handle_bios_task_starting sets flag correctly with lock. + + This test verifies that when _handle_bios_task_starting is called: + 1. The bios_reboot_triggered flag is set + 2. task.node reflects the flag immediately after the method returns + + If node = task.node were captured BEFORE upgrade_lock (the bug), + this test would fail because task.node would be stale and not + have the flag. + + With the correct order (upgrade_lock first, then node = task.node), + the test passes because changes are saved to the correct object.
+ """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + task.node.save() + + # Mock task monitor showing STARTING state + mock_task_monitor = mock.Mock() + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_STARTING + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Mock upgrade_lock to simulate what happens in production: + # Replace task.node with a fresh copy from DB + original_upgrade_lock = task.upgrade_lock + def mock_upgrade_lock(): + original_upgrade_lock() + # Simulate production behavior: replace task.node + # with fresh copy + task.node = objects.Node.get(self.context, task.node.uuid) + + with mock.patch.object(task, 'upgrade_lock', + side_effect=mock_upgrade_lock, + autospec=True): + # Call the actual method being tested + firmware_interface = redfish_fw.RedfishFirmware() + result = firmware_interface._handle_bios_task_starting( + task, mock_task_monitor, settings, settings[0]) + + # Verify reboot was triggered + self.assertTrue(result, + 'Method should return True when reboot triggered') + mock_power_action.assert_called_once() + + # CRITICAL TEST: Verify task.node has the flag + # If the order were wrong (node captured before upgrade_lock), + # task.node would be a stale object without the flag + current_settings = task.node.driver_internal_info.get( + 'redfish_fw_updates', [{}]) + self.assertTrue( + current_settings[0].get('bios_reboot_triggered'), + 'Flag must be in task.node after method returns.
' + 'If this fails, node was captured before upgrade_lock.') + + # Verify the flag is also persisted to DB + task.node.refresh() + refreshed_settings = task.node.driver_internal_info.get( + 'redfish_fw_updates', [{}]) + self.assertTrue( + refreshed_settings[0].get('bios_reboot_triggered'), + 'Flag must be persisted to database') diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index 0fc1b8328a..953835c000 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -327,6 +327,90 @@ def test_set_boot_device_http_boot(self, mock_get_system): self.assertNotIn('redfish_uefi_http_url', task.node.driver_internal_info) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_post_retry_success(self, mock_get_system): + """Test successful retry when BMC rejects changes during POST.""" + # Configure minimal retries for faster test + self.config(post_boot_retry_attempts=3, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') + + fake_system = mock.Mock() + # First call fails with POST error, second succeeds + # Create a real BadRequestError with POST error message in str() + mock_response = mock.Mock() + mock_response.status_code = 400 + mock_response.json.return_value = { + 'error': {'message': 'UnableToModifyDuringSystemPOST'} + } + post_error = sushy.exceptions.BadRequestError( + 'POST', 'http://bmc/redfish', mock_response) + fake_system.set_system_boot_options.side_effect = [ + post_error, + None # Success on retry + ] + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.set_boot_device(task, boot_devices.PXE) + + # Should have been called twice (initial + 1 retry) + self.assertEqual(2, + fake_system.set_system_boot_options.call_count) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_post_retry_exhausted(self, mock_get_system): + """Test that POST retry eventually gives up and raises.""" + # Configure minimal retries for faster test + self.config(post_boot_retry_attempts=2, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') + + fake_system = mock.Mock() + # All calls fail with POST error + # Create a real BadRequestError with POST error message in str() + mock_response = mock.Mock() + mock_response.status_code = 400 + mock_response.json.return_value = { + 'error': {'message': 'UnableToModifyDuringSystemPOST'} + } + post_error = sushy.exceptions.BadRequestError( + 'POST', 'http://bmc/redfish', mock_response) + fake_system.set_system_boot_options.side_effect = post_error + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaisesRegex( + exception.RedfishError, 'Redfish set boot device', + task.driver.management.set_boot_device, task, boot_devices.PXE) + + # Should have been called for all retry attempts + self.assertEqual(2, + fake_system.set_system_boot_options.call_count) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_non_post_error_no_retry(self, mock_get_system): + """Test that non-POST errors are not retried.""" + # Configure retries (should not be used) + self.config(post_boot_retry_attempts=3, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') +
+ fake_system = mock.Mock() + # Fail with a generic SushyError (not POST-related) + fake_system.set_system_boot_options.side_effect = ( + sushy.exceptions.SushyError('Some other error') + ) + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaisesRegex( + exception.RedfishError, 'Redfish set boot device', + task.driver.management.set_boot_device, task, boot_devices.PXE) + + # Should have been called only once (no retry for non-POST errors) + fake_system.set_system_boot_options.assert_called_once() + def test_restore_boot_device(self): fake_system = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, diff --git a/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml b/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml new file mode 100644 index 0000000000..ceec71400f --- /dev/null +++ b/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes BMC firmware update timeout issues by using an extended timeout + (default 300 seconds) for BMC firmware updates to handle BMC transitional + states during the firmware update process, unless a different timeout is + specified by the operator. diff --git a/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml new file mode 100644 index 0000000000..49a66b657b --- /dev/null +++ b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml @@ -0,0 +1,24 @@ +--- +fixes: + - | + Reduces the likelihood of intermittent firmware upgrade failures by adding + comprehensive BMC state validation after firmware upgrades for the + Redfish driver. After a firmware update task completes successfully, Ironic + now validates that BMC resources (System, Manager, and NetworkAdapters) are + consistently available before proceeding with subsequent operations. + + This change improves firmware update reliability by requiring a + configurable number of consecutive successful responses from the BMC + resources. The validation process helps ensure the BMC has fully stabilized + after firmware updates before marking the operation as complete. + + New configuration options in the ``[redfish]`` section: + + * ``firmware_update_required_successes`` - Number of consecutive successful + responses required to consider BMC validation successful (default: 3); + setting it to 0 disables validation. + * ``firmware_update_validation_interval`` - Time in seconds to wait between + validation attempts (default: 30). + * ``firmware_update_resource_validation_timeout`` - Maximum time in seconds + to wait for BMC resources to stabilize (default: 480); setting it to 0 + disables validation. diff --git a/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml b/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml new file mode 100644 index 0000000000..af7dc286ed --- /dev/null +++ b/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml @@ -0,0 +1,8 @@ +--- +upgrade: + - | + The default configuration values for + ``[redfish]firmware_update_wait_unresponsive_bmc`` and + ``[redfish]firmware_update_resource_validation_timeout`` have been changed + to ``0`` and ``480`` respectively. For more information, see + https://bugs.launchpad.net/ironic/+bug/2139122.
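The validation options described in the notes above are meant to be tuned per deployment. As a minimal, illustrative ``ironic.conf`` fragment (the values are examples chosen for a slow-to-stabilize BMC, not recommendations):

    [redfish]
    # Require five consecutive good reads of the System, Manager and
    # NetworkAdapters resources before declaring the BMC stable.
    firmware_update_required_successes = 5
    # Pause 15 seconds between validation attempts.
    firmware_update_validation_interval = 15
    # Give the BMC up to 10 minutes to stabilize before failing the step.
    firmware_update_resource_validation_timeout = 600

Setting either ``firmware_update_required_successes`` or ``firmware_update_resource_validation_timeout`` to ``0`` disables the post-upgrade validation phase entirely.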
diff --git a/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml b/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml new file mode 100644 index 0000000000..140e31bb91 --- /dev/null +++ b/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Each firmware component might have different reboot requirements depending + on the hardware. This change adds two new configuration options to help + operators tune the reboot behavior for firmware updates. + ``[redfish]firmware_update_reboot_delay``: wait time in seconds for + component-specific firmware update operations, such as the delay before + rebooting after a BIOS firmware update. + ``[redfish]firmware_update_bmc_version_check_interval``: interval in + seconds between BMC firmware version checks used to verify that a BMC + firmware update has been applied. diff --git a/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml b/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml new file mode 100644 index 0000000000..3df16f94b8 --- /dev/null +++ b/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + NIC firmware components with null or empty firmware package versions + are now filtered out during caching to avoid NOT NULL database + constraint violations. diff --git a/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml b/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml new file mode 100644 index 0000000000..a108500274 --- /dev/null +++ b/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue with BIOS firmware updates on certain BMCs where the + firmware task completes very quickly after staging the firmware, + before Ironic can trigger a reboot. Previously, this would result in the + staged BIOS firmware not being applied because no reboot was triggered. + Now, Ironic detects this condition and triggers the reboot when the task + completes, ensuring the staged firmware is properly applied. diff --git a/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml b/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml new file mode 100644 index 0000000000..5c0114810e --- /dev/null +++ b/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Some BMCs may report that they are not processing the firmware update + while the task is still in the RUNNING, STARTING, or PENDING state. + Previously, Ironic would treat this as completion and stop polling for + the update. Now, Ironic continues polling until the update is complete. diff --git a/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml b/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml new file mode 100644 index 0000000000..379247d195 --- /dev/null +++ b/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml @@ -0,0 +1,17 @@ +--- +fixes: + - | + Fixes an issue where service and clean steps with ``requires_ramdisk=False`` + (such as Redfish firmware updates) would incorrectly fail due to a heartbeat + timeout. These steps operate independently of the ramdisk agent and should + not be subject to heartbeat timeouts. Previously, such steps would fail + after ``service_callback_timeout`` or ``clean_callback_timeout`` (default + 30 minutes) with a misleading error about the ramdisk not running.
+ The fix updates ``provision_updated_at`` during firmware update polling, + preventing the heartbeat timeout mechanism from triggering while the + out-of-band operation is still in progress. For more information, see + `bug 2136276 <https://bugs.launchpad.net/ironic/+bug/2136276>`_. +features: + - | + Adds ``[redfish]firmware_update_overall_timeout`` (default 2 hours) to + limit the total firmware update duration as a safety net for stuck updates. diff --git a/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml b/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml new file mode 100644 index 0000000000..981da0bb92 --- /dev/null +++ b/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes a regression introduced in the firmware upgrade fixes that caused + the intermediate status not to be persisted on the node, resulting in a + reboot loop. diff --git a/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml b/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml new file mode 100644 index 0000000000..b0de9beb65 --- /dev/null +++ b/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue where firmware upgrades would fail on HP iLO Generation 11 + machines when the NetworkAdapters endpoint returns HTTP 400 (Bad Request) + responses during the firmware component caching process. Exception + handling now allows firmware component caching to continue past this + error. diff --git a/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml b/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml new file mode 100644 index 0000000000..22a888904a --- /dev/null +++ b/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes excessive logging when Redfish-managed servers do not have + NetworkAdapters endpoints. Previously, servers without NetworkAdapters + would generate warning-level log messages during firmware component + caching. Now these messages are logged at debug level, since having no + NetworkAdapters is a valid hardware configuration. diff --git a/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml b/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml new file mode 100644 index 0000000000..572553e9ab --- /dev/null +++ b/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + Improves Redfish compatibility with ASRock Rack servers by updating how + boot mode and boot device settings are applied. + + Previously, calls to ``set_boot_device`` and ``set_boot_mode`` only + included minimal parameters (e.g., ``BootSourceOverrideTarget`` and + ``BootSourceOverrideMode``), which were insufficient for certain vendor + implementations like ASRock Rack. + + This fix updates the Redfish driver to send the full payload, including + ``BootSourceOverrideEnabled``, ``BootSourceOverrideTarget``, and + ``BootSourceOverrideMode``, ensuring better compliance and reliability on + these platforms. + diff --git a/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml b/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml new file mode 100644 index 0000000000..e6f9478961 --- /dev/null +++ b/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Adds retry logic for Redfish boot device changes when the BMC rejects + modifications during POST (Power-On Self-Test).
This addresses issues + with HPE iLO BMCs returning ``UnableToModifyDuringSystemPOST`` errors + after firmware updates or reboots. See bug + `2136275 <https://bugs.launchpad.net/ironic/+bug/2136275>`_.
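The retry behavior exercised by the ``test_set_boot_device_post_retry_*`` tests above can be pictured with a small standalone sketch. This is not Ironic's actual implementation (which reads ``[redfish]post_boot_retry_attempts`` and ``[redfish]post_boot_retry_delay`` from configuration); the helper name, its hard-coded defaults, and the backoff cap of six times the base delay are assumptions made for illustration:

    import time

    import sushy

    # Error marker HPE iLO returns while the system is in POST
    POST_ERROR = 'UnableToModifyDuringSystemPOST'

    def set_boot_device_with_post_retry(system, target,
                                        attempts=6, delay=5):
        """Retry boot device changes rejected while the system is in POST.

        Backoff grows exponentially from ``delay`` seconds and is capped
        (here, as an assumption) at six times ``delay``.
        """
        for attempt in range(1, attempts + 1):
            try:
                # sushy System call used by the tests above
                return system.set_system_boot_options(target=target)
            except sushy.exceptions.BadRequestError as e:
                # Only POST-related rejections are retried; any other
                # error propagates immediately, as the unit tests assert.
                if POST_ERROR not in str(e) or attempt == attempts:
                    raise
                time.sleep(min(delay * 2 ** (attempt - 1), delay * 6))

With the defaults above, the sketch waits 5, 10, 20, 30 and 30 seconds between attempts before the sixth and final failure is re-raised.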