From 6da8c6373ff15e3b85d822262726b01d789ee129 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 9 Sep 2025 14:41:58 +1000 Subject: [PATCH 01/15] Fix intermittent Redfish firmware update failures with BMC validation Resolves a bug where firmware updates fail intermittently on some hardware models due to invalid or unstable BMC responses immediately after firmware update completion. The BMC may return inconsistent responses for a period after firmware updates, causing the update process to fail prematurely. This change adds comprehensive BMC state validation that requires multiple consecutive successful responses from System, Manager, and NetworkAdapters resources before considering the firmware update complete. This ensures the BMC has fully stabilized before proceeding. Generated-By: Claude Code Sonnet 4 Change-Id: I5cb72f62d3fc62c3ad750c62924842cef59e79b8 Signed-off-by: Jacob Anders (cherry picked from commit 85ec9d655f7e8ade0a611ceb8d202c841123f05c) --- ironic/conf/redfish.py | 21 ++ ironic/drivers/modules/redfish/firmware.py | 111 +++++++- .../drivers/modules/redfish/test_firmware.py | 241 +++++++++++++++++- ...fter-firmware-update-3d5f8a9e76c24d1b.yaml | 24 ++ 4 files changed, 391 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 26db778e7c..00bc919924 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -95,6 +95,27 @@ default=300, help=_('Number of seconds to wait before proceeding with the ' 'reboot to finish the BMC firmware update step')), + cfg.IntOpt('firmware_update_required_successes', + min=0, + default=3, + help=_('Number of successful responses required to ' + 'consider post-upgrade BMC validation a success. 
' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), + cfg.IntOpt('firmware_update_validation_interval', + min=0, + default=30, + help=_('Timeout (in seconds) to wait between validation ' + 'attempts. Set to 0 for rapid succession retries ' + 'with no delay.')), + cfg.IntOpt('firmware_update_resource_validation_timeout', + min=0, + default=300, + help=_('Timeout (in seconds) to wait for BMC resources ' + '(System, Manager, NetworkAdapters) to become stable ' + 'and consistently available after firmware update. ' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 12c23a509b..9facb99451 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -181,7 +181,6 @@ def retrieve_nic_components(self, task, system): @base.service_step(priority=0, abortable=False, argsinfo=_FW_SETTINGS_ARGSINFO, requires_ramdisk=False) - @base.cache_firmware_components def update(self, task, settings): """Update the Firmware on the node using the settings for components. @@ -280,6 +279,7 @@ def _execute_firmware_update(self, node, update_service, settings): 'seconds before proceeding to reboot the node %(node_uuid)s ' 'to complete the step', {'node_uuid': node.uuid, 'wait_time': wait_unres_bmc}) + # Store task monitor URI for periodic task polling # TODO(iurygregory): Improve the logic here to identify if the BMC # is back, so we don't have to unconditionally wait. # The wait_unres_bmc will be the maximum time to wait. 
@@ -299,10 +299,103 @@ def _execute_firmware_update(self, node, update_service, settings): fw_clean.append(cleanup) node.set_driver_internal_info('firmware_cleanup', fw_clean) + def _validate_resources_stability(self, node): + """Validate that BMC resources are consistently available. + + Requires consecutive successful responses from System, Manager, + and NetworkAdapters resources before considering them stable. + The number of required successes is configured via + CONF.redfish.firmware_update_required_successes. + Timeout is configured via + CONF.redfish.firmware_update_resource_validation_timeout. + + :param node: the Ironic node object + :raises: RedfishError if resources don't stabilize within timeout + """ + timeout = CONF.redfish.firmware_update_resource_validation_timeout + required_successes = CONF.redfish.firmware_update_required_successes + validation_interval = CONF.redfish.firmware_update_validation_interval + + # Skip validation if validation is disabled via configuration + if required_successes == 0 or timeout == 0: + reasons = [] + if required_successes == 0: + reasons.append('required_successes=0') + if timeout == 0: + reasons.append('validation_timeout=0') + + LOG.info('BMC resource validation disabled (%s) for node %(node)s', + ', '.join(reasons), {'node': node.uuid}) + return + + LOG.debug('Starting resource stability validation for node %(node)s ' + '(timeout: %(timeout)s seconds, ' + 'required_successes: %(required)s, ' + 'validation_interval: %(interval)s seconds)', + {'node': node.uuid, 'timeout': timeout, + 'required': required_successes, + 'interval': validation_interval}) + + start_time = time.time() + end_time = start_time + timeout + consecutive_successes = 0 + last_exc = None + + while time.time() < end_time: + try: + # Test System resource + system = redfish_utils.get_system(node) + + # Test Manager resource + redfish_utils.get_manager(node, system) + + # Test NetworkAdapters resource + chassis = redfish_utils.get_chassis(node, 
system) + chassis.network_adapters.get_members() + + # All resources successful + consecutive_successes += 1 + LOG.debug('Resource validation success %(count)d/%(required)d ' + 'for node %(node)s', + {'count': consecutive_successes, + 'required': required_successes, + 'node': node.uuid}) + + if consecutive_successes >= required_successes: + LOG.info('All tested Redfish resources stable and ' + ' available for node %(node)s', + {'node': node.uuid}) + return + + except (exception.RedfishError, + exception.RedfishConnectionError, + sushy.exceptions.BadRequestError) as e: + # Resource not available yet, reset counter + if consecutive_successes > 0: + LOG.debug('Resource validation interrupted for node ' + '%(node)s, resetting success counter ' + '(error: %(error)s)', + {'node': node.uuid, 'error': e}) + consecutive_successes = 0 + last_exc = e + + # Wait before next validation attempt + time.sleep(validation_interval) + # Timeout reached without achieving stability + error_msg = _('BMC resources failed to stabilize within ' + '%(timeout)s seconds for node %(node)s') % { + 'timeout': timeout, 'node': node.uuid} + if last_exc: + error_msg += _(', last error: %(error)s') % {'error': last_exc} + LOG.error(error_msg) + raise exception.RedfishError(error=error_msg) + def _continue_updates(self, task, update_service, settings): """Continues processing the firmware updates Continues to process the firmware updates on the node. + First monitors the current task completion, then validates resource + stability before proceeding to next update or completion. Note that the caller must have an exclusive lock on the node. 
@@ -312,6 +405,7 @@ def _continue_updates(self, task, update_service, settings): """ node = task.node fw_upd = settings[0] + wait_interval = fw_upd.get('wait') if wait_interval: time_now = str(timeutils.utcnow().isoformat()) @@ -450,6 +544,14 @@ def _check_node_redfish_firmware_update(self, task): try: task_monitor = redfish_utils.get_task_monitor( node, current_update['task_monitor']) + except exception.RedfishConnectionError as e: + # If the BMC firmware is being updated, the BMC will be + # unavailable for some amount of time. + LOG.warning('Unable to communicate with task monitor service ' + 'on node %(node)s. Will try again on the next poll. ' + 'Error: %(error)s', + {'node': node.uuid, 'error': e}) + return except exception.RedfishError: # The BMC deleted the Task before we could query it LOG.warning('Firmware update completed for node %(node)s, ' @@ -478,12 +580,15 @@ def _check_node_redfish_firmware_update(self, task): if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED and sushy_task.task_status in [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): - LOG.info('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', + LOG.info('Firmware update task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s. 
' + 'Starting BMC response validation.', {'node': node.uuid, 'firmware_image': current_update['url'], 'messages': ", ".join(messages)}) + # Validate BMC resources are consistently available + self._validate_resources_stability(node) self._continue_updates(task, update_service, settings) else: error_msg = (_('Firmware update failed for node %(node)s, ' diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 883a4a2ece..4e468381eb 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -692,11 +692,13 @@ def test__check_node_firmware_update_fail_servicing( servicing_error_handler_mock.assert_called_once() interface._continue_updates.assert_not_called() + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, - log_mock): + log_mock, validate_mock): task_mock = mock.Mock() task_mock.task_state = sushy.TASK_STATE_COMPLETED task_mock.task_status = sushy.HEALTH_OK @@ -712,13 +714,15 @@ def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, task, interface = self._test__check_node_redfish_firmware_update() task.upgrade_lock.assert_called_once_with() info_calls = [ - mock.call('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', + mock.call('Firmware update task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s. 
' + 'Starting BMC response validation.', {'node': self.node.uuid, 'firmware_image': 'https://bmc/v1.0.1', 'messages': 'Firmware update done'})] log_mock.info.assert_has_calls(info_calls) + validate_mock.assert_called_once() interface._continue_updates.assert_called_once_with( task, get_us_mock.return_value, @@ -1016,3 +1020,234 @@ def test__execute_firmware_update_unresponsive_bmc_node_override( sleep_mock.assert_called_once_with( self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') ) + + def test__validate_resources_stability_success(self): + """Test successful BMC resource validation with consecutive success.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] # 3 successful attempts + + # Should complete successfully after 3 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times (required success) + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + # Verify sleep was called between validation attempts + expected_calls = [mock.call( + CONF.redfish.firmware_update_validation_interval)] * 2 + 
sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_timeout(self): + """Test BMC resource validation timeout when not achieved.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True), \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock system always failing + system_mock.side_effect = exception.RedfishConnectionError( + 'timeout') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] # Exceeds 300 second timeout + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_intermittent_failures(self): + """Test BMC resource validation with intermittent failures.""" + cfg.CONF.set_override('firmware_update_required_successes', 3, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 10, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock intermittent failures: success, success, fail, + # success, success, success + # When system_mock raises exception, other calls are not made + call_count = 0 + + def system_side_effect(*args): + nonlocal call_count + call_count += 1 + if call_count == 3: # 
Third call fails + raise exception.RedfishConnectionError('error') + return mock.Mock() + + system_mock.side_effect = system_side_effect + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (6 attempts total) + time_mock.side_effect = [0, 10, 20, 30, 40, 50, 60] + + # Should eventually succeed after counter reset + firmware._validate_resources_stability(task.node) + + # Verify all 6 attempts were made for system + self.assertEqual(system_mock.call_count, 6) + # Manager and chassis called only 5 times (not on failed) + self.assertEqual(manager_mock.call_count, 5) + self.assertEqual(chassis_mock.call_count, 5) + + def test__validate_resources_stability_manager_failure(self): + """Test BMC resource validation when Manager resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system success, manager failure + system_mock.return_value = mock.Mock() + manager_mock.side_effect = exception.RedfishError( + 'manager error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_network_adapters_failure(self): + """Test validation when NetworkAdapters resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + 
mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system and manager success, NetworkAdapters failure + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + chassis_mock.side_effect = exception.RedfishError( + 'chassis error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_custom_config(self): + """Test BMC resource validation with custom configuration values.""" + cfg.CONF.set_override('firmware_update_required_successes', 5, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 5, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (5 successful attempts) + time_mock.side_effect = [0, 5, 10, 15, 20, 25] + + # Should complete successfully after 5 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources checked 5 times (custom required) 
+ self.assertEqual(system_mock.call_count, 5) + self.assertEqual(manager_mock.call_count, 5) + self.assertEqual(chassis_mock.call_count, 5) + + # Verify sleep was called with custom interval + expected_calls = [mock.call(5)] * 4 # 4 sleeps between 5 + sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_badrequest_error(self): + """Test BMC resource validation handles BadRequestError correctly.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock BadRequestError from sushy with proper arguments + mock_response = mock.Mock() + mock_response.status_code = 400 + system_mock.side_effect = sushy.exceptions.BadRequestError( + 'http://test', mock_response, mock_response) + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) diff --git a/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml new file mode 100644 index 0000000000..49a66b657b --- /dev/null +++ b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml @@ -0,0 +1,24 @@ +--- +fixes: + - | + Reduces likelihood of intermittent firmware upgrade failures by adding + comprehensive BMC state validation after firmware upgrades for the + Redfish driver. After a firmware update task completes successfully, Ironic + now validates that BMC resources (System, Manager, and NetworkAdapters) are + consistently available before proceeding with subsequent operations. 
+ + This change improves firmware update reliability by requiring a + configurable number of consecutive successful responses from the BMC + resources. The validation process helps ensure the BMC has fully stabilized + after firmware updates before marking the operation as complete. + + New configuration options in the ``[redfish]`` section: + + * ``firmware_update_required_successes`` - Number of consecutive successful + responses required to consider BMC validation successful (default: 3), + 0 disables validation + * ``firmware_update_validation_interval`` - Time in seconds to wait between + validation attempts (default: 30) + * ``firmware_update_resource_validation_timeout`` - Maximum time in seconds + to wait for BMC resources to stabilize (default: 300), setting it to 0 + disables validation. From efeabaca771432feb3681ceb9d657851e9e43b1a Mon Sep 17 00:00:00 2001 From: Mohamed-HDD Date: Thu, 11 Sep 2025 15:11:14 +0200 Subject: [PATCH 02/15] Fix : AsRockRack Management via Redfish Add required boot params in Redfish calls for AsRockRack Related-Bug: #2073518 Change-Id: I0610d488eb4392bf335464e685aaadbf28d59529 Signed-off-by: Mohamed EL HADDAD (cherry picked from commit 54977a14d57c8c56e9b25a65e4c8ccd2b9fabb7a) --- ironic/drivers/modules/redfish/management.py | 91 ++++++++++++++++++- ...t-payload-asrockrack-79a9291da8ec5a50.yaml | 14 +++ 2 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 018c70f6f2..ad937f8d60 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -91,6 +91,12 @@ BOOT_DEVICE_PERSISTENT_MAP_REV = {v: k for k, v in BOOT_DEVICE_PERSISTENT_MAP.items()} +VENDORS_REQUIRING_FULL_BOOT_REQUEST = [ + "american megatrends international", + "ami", + "asrockrack" +] + INDICATOR_MAP = { 
sushy.INDICATOR_LED_LIT: indicator_states.ON, sushy.INDICATOR_LED_OFF: indicator_states.OFF, @@ -123,6 +129,10 @@ def _set_boot_device(task, system, device, persistent=False, :param http_boot_url: A string value to be sent to the sushy library, which is sent to the BMC as the url to boot from. :raises: SushyError on an error from the Sushy library + Vendor-specific logic: + - Vendors listed in VENDORS_REQUIRING_FULL_BOOT_REQUEST: + Require setting full boot parameters + (mode, enabled, target) even if unchanged. """ # The BMC handling of the persistent setting is vendor specific. @@ -132,6 +142,13 @@ def _set_boot_device(task, system, device, persistent=False, # persistent setting must be set when setting the boot device # (see https://storyboard.openstack.org/#!/story/2008547). vendor = task.node.properties.get('vendor', None) + LOG.debug("Vendor : %(vendor)s node %(uuid)s", + {'vendor': vendor, 'uuid': task.node.uuid}) + requires_full_boot_request = ( + vendor and any(vendor_id in vendor.lower() + for vendor_id in + VENDORS_REQUIRING_FULL_BOOT_REQUEST) + ) if vendor and vendor.lower() == 'supermicro': enabled = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] LOG.debug('Setting BootSourceOverrideEnable to %(enable)s ' @@ -147,8 +164,35 @@ def _set_boot_device(task, system, device, persistent=False, try: # NOTE(TheJulia): In sushy, it is uri, due to the convention used # in the standard. URL is used internally in ironic. - system.set_system_boot_options(device, enabled=enabled, - http_boot_uri=http_boot_url) + if requires_full_boot_request: + # Some vendors require sending all boot parameters every time + desired_mode = system.boot.get('mode') \ + or sushy.BOOT_SOURCE_MODE_UEFI + desired_enabled = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] + current_enabled = system.boot.get('enabled') \ + or sushy.BOOT_SOURCE_ENABLED_ONCE + current_target = system.boot.get('target') \ + or sushy.BOOT_SOURCE_TARGET_NONE + + LOG.debug('Vendor "%(vendor)s" requires full boot settings. 
' + 'Sending: mode=%(mode)s, enabled=%(enabled)s, ' + 'target=%(target)s for node %(node)s', + {'vendor': vendor, 'mode': desired_mode, + 'enabled': current_enabled, 'target': current_target, + 'node': task.node.uuid}) + + system.set_system_boot_options( + device, + mode=desired_mode, + enabled=enabled, + http_boot_uri=http_boot_url + ) + else: + LOG.debug('Sending minimal Redfish boot device' + ' change for node %(node)s', + {'node': task.node.uuid}) + system.set_system_boot_options(device, enabled=enabled, + http_boot_uri=http_boot_url) except sushy.exceptions.SushyError as e: if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: # NOTE(dtantsur): continuous boot device settings have been @@ -336,13 +380,54 @@ def set_boot_mode(self, task, mode): """ system = redfish_utils.get_system(task.node) + vendor = task.node.properties.get('vendor', None) + requires_full_boot_request = ( + vendor and any(vendor_id in vendor.lower() + for vendor_id in + VENDORS_REQUIRING_FULL_BOOT_REQUEST) + ) + + LOG.debug('Requested %(vendor)s to set boot mode' + ' to "%(mode)s" for node %(node)s', + {'mode': mode, 'node': task.node.uuid, 'vendor': vendor}) + # NOTE(dtantsur): check the readability of the current mode before # modifying anything. I suspect it can become None transiently after # the update, while we need to know if it is supported *at all*. get_mode_unsupported = (system.boot.get('mode') is None) + desired_mode = BOOT_MODE_MAP_REV[mode] + LOG.debug('Current boot mode read from Redfish for ' + 'node %(node)s is: %(mode)s', + {'node': task.node.uuid, + 'mode': desired_mode}) + try: - system.set_system_boot_options(mode=BOOT_MODE_MAP_REV[mode]) + + if requires_full_boot_request: + current_enabled = system.boot.get('enable') \ + or sushy.BOOT_SOURCE_ENABLED_ONCE + current_target = system.boot.get('enable') \ + or sushy.BOOT_SOURCE_TARGET_PXE + + LOG.debug('Vendor "%(vendor)s" requires full boot settings. 
' + 'Sending: mode=%(mode)s, enabled=%(enabled)s, ' + 'target=%(target)s for node %(node)s', + {'vendor': vendor, 'mode': desired_mode, + 'enabled': current_enabled, 'node': task.node.uuid, + 'target': current_target + }) + + system.set_system_boot_options( + mode=desired_mode, + enabled=current_enabled, + target=current_target + ) + else: + LOG.debug('Sending minimal Redfish boot mode ' + 'change for node %(node)s', + {'node': task.node.uuid}) + system.set_system_boot_options(mode=desired_mode) except sushy.exceptions.SushyError as e: error_msg = (_('Setting boot mode to %(mode)s ' 'failed for node %(node)s. ' diff --git a/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml b/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml new file mode 100644 index 0000000000..572553e9ab --- /dev/null +++ b/releasenotes/notes/redfish-full-boot-payload-asrockrack-79a9291da8ec5a50.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + Improved Redfish compatibility with ASRock Rack servers by updating how + boot mode and boot device settings are applied. + + Previously, calls to `set_boot_device` and `set_boot_mode` only included + minimal parameters (e.g., `BootOverrideTarget` and `BootOverrideMode`), which + were insufficient for certain vendor implementations like ASRockRack. + + This fix updates the Redfish driver to send the full payload, including + `BootSourceOverrideEnabled`, `BootSourceOverrideTarget`, `BootSourceOverrideMode`, + ensuring better compliance and reliability on these platforms. + From 511bfb601318c8dd9879fac5ce98d899c44dfc19 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 12 Sep 2025 16:00:31 +1000 Subject: [PATCH 03/15] Make cache_firmware_components more resilient during upgrades There have been reports of firmware upgrades failing on Gen11 iLO machines with GET NetworkAdepters returning 400s responses. 
This change attempts to resolve this by catching the exception relevant to the fault Change-Id: I62095c2b61d14688d2dcbcdcfd29e9391af2c0ba Signed-off-by: Jacob Anders (cherry picked from commit bba3041ccf6397587e51c676781f6a204396bdc5) --- ironic/drivers/modules/redfish/firmware.py | 17 +++- .../drivers/modules/redfish/test_firmware.py | 85 +++++++++++++++++++ ...apters-fix-ilo-gen11-2f43f4fbdb995231.yaml | 7 ++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 9facb99451..aee1f35cc6 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -110,12 +110,25 @@ def cache_firmware_components(self, task): LOG.warning('No manager available to retrieve Firmware ' 'from the bmc of node %s', task.node.uuid) - nic_components = self.retrieve_nic_components(task, system) + nic_components = None + try: + nic_components = self.retrieve_nic_components(task, system) + except (exception.RedfishError, + sushy.exceptions.BadRequestError) as e: + # NOTE(janders) if an exception is raised, log a warning + # with exception details. 
This is important for HP hardware + # which at the time of writing this are known to return 400 + # responses to GET NetworkAdapters while OS isn't fully booted + LOG.warning('Unable to access NetworkAdapters on node ' + '%(node_uuid)s, Error: %(error)s', + {'node_uuid': task.node.uuid, 'error': e}) + # NOTE(janders) if no exception is raised but no NICs are returned, + # state that clearly but in a lower severity message if nic_components == []: LOG.debug('Could not retrieve Firmware Package Version from ' 'NetworkAdapters on node %(node_uuid)s', {'node_uuid': task.node.uuid}) - else: + elif nic_components: settings.extend(nic_components) if not settings: diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 4e468381eb..5a9d176d83 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -267,6 +267,91 @@ def test_get_chassis_redfish_error(self, sync_fw_cmp_mock, system_mock, {'node_uuid': self.node.uuid} ) + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_redfish_connection_error( + self, sync_fw_cmp_mock, manager_mock, system_mock, log_mock): + """Test that RedfishConnectionError during NIC retrieval is handled.""" + system_mock.return_value.identity = "System1" + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.identity = "Manager1" + manager_mock.return_value.firmware_version = '1.0.0' + + sync_fw_cmp_mock.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + 
shared=True) as task: + with mock.patch.object(task.driver.firmware, + 'retrieve_nic_components', + autospec=True) as mock_retrieve: + connection_error = exception.RedfishError( + 'Connection failed') + mock_retrieve.side_effect = connection_error + + task.driver.firmware.cache_firmware_components(task) + + # Verify warning log for exception is called + log_mock.warning.assert_any_call( + 'Unable to access NetworkAdapters on node %(node_uuid)s, ' + 'Error: %(error)s', + {'node_uuid': self.node.uuid, 'error': connection_error} + ) + + # Verify debug log for empty NIC list is NOT called + # (since we caught an exception, not an empty list) + debug_calls = [call for call in log_mock.debug.call_args_list + if 'Could not retrieve Firmware Package Version from ' + 'NetworkAdapters' in str(call)] + self.assertEqual(len(debug_calls), 0) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_sushy_bad_request_error( + self, sync_fw_cmp_mock, manager_mock, system_mock, log_mock): + """Test that sushy BadRequestError during NIC retrieval is handled.""" + system_mock.return_value.identity = "System1" + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.identity = "Manager1" + manager_mock.return_value.firmware_version = '1.0.0' + + sync_fw_cmp_mock.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(task.driver.firmware, + 'retrieve_nic_components', + autospec=True) as mock_retrieve: + bad_request_error = sushy.exceptions.BadRequestError( + method='GET', url='/redfish/v1/Chassis/1/NetworkAdapters', + 
response=mock.Mock(status_code=400)) + mock_retrieve.side_effect = bad_request_error + + task.driver.firmware.cache_firmware_components(task) + + # Verify warning log for exception is called + log_mock.warning.assert_any_call( + 'Unable to access NetworkAdapters on node %(node_uuid)s, ' + 'Error: %(error)s', + {'node_uuid': self.node.uuid, 'error': bad_request_error} + ) + + # Verify debug log for empty NIC list is NOT called + # (since we caught an exception, not an empty list) + debug_calls = [call for call in log_mock.debug.call_args_list + if 'Could not retrieve Firmware Package Version from ' + 'NetworkAdapters' in str(call)] + self.assertEqual(len(debug_calls), 0) + @mock.patch.object(redfish_utils, 'LOG', autospec=True) @mock.patch.object(redfish_utils, '_get_connection', autospec=True) def test_missing_updateservice(self, conn_mock, log_mock): diff --git a/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml b/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml new file mode 100644 index 0000000000..b0de9beb65 --- /dev/null +++ b/releasenotes/notes/networkadapters-fix-ilo-gen11-2f43f4fbdb995231.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue where firmware upgrades would fail on HP iLO Generation 11 + machines when the NetworkAdapters endpoint returns HTTP 400 (Bad Request) + responses during the firmware component caching process. Exception + handling allows the code to continue execution past that issue. From 1ff6d514d836772fd5ac50c1410cfed508e13159 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 23 Sep 2025 09:23:13 +1000 Subject: [PATCH 04/15] Fix power sync timeouts after BMC firmware update Use extended timeout (by default 300 seconds) for BMC firmware updates to handle BMC transitional states during firmware update process, unless a different timeout is specified by the operator. 
Assisted-By: Claude Code Sonnet 4 Change-Id: I2125ff4cdcbd07a89b364968dda4bb60e059121c Signed-off-by: Jacob Anders (cherry picked from commit fbe0e188b2bfa98e4e5f87c87a76c0a7ca47cf05) --- ironic/conf/redfish.py | 7 + ironic/drivers/modules/redfish/firmware.py | 20 ++- .../drivers/modules/redfish/test_firmware.py | 136 ++++++++++++++++++ ...firmware-timeout-fix-328e09ff98f9348f.yaml | 7 + 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 00bc919924..c4b9c05983 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -116,6 +116,13 @@ 'and consistently available after firmware update. ' 'Set to 0 to disable post-upgrade validation ' 'entirely.')), + cfg.IntOpt('firmware_update_bmc_timeout', + min=0, + default=300, + help=_('Timeout (in seconds) for BMC firmware updates. ' + 'BMC firmware updates may need extended time to handle ' + 'BMC transitional states during the firmware update ' + 'process.')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index aee1f35cc6..711530561d 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -220,6 +220,24 @@ def update(self, task, settings): fw_upd = settings[0] wait_interval = fw_upd.get('wait') + + # Check if BMC firmware is being updated - BMC may need extended + # timeout after firmware update as it transitions through states + has_bmc_update = any( + setting.get('component', '') == redfish_utils.BMC + for setting in settings + ) + + # Use extended timeout for BMC updates to handle transitional states + if has_bmc_update and wait_interval is None: + reboot_timeout = CONF.redfish.firmware_update_bmc_timeout + LOG.info('BMC 
firmware update detected, using extended reboot ' + 'timeout of %(timeout)s seconds for node %(node)s to ' + 'handle BMC transitional states', + {'timeout': reboot_timeout, 'node': node.uuid}) + else: + reboot_timeout = wait_interval + deploy_utils.set_async_step_flags( node, reboot=True, @@ -227,7 +245,7 @@ def update(self, task, settings): polling=True ) - return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval, + return deploy_utils.reboot_to_finish_step(task, timeout=reboot_timeout, disable_ramdisk=True) def _execute_firmware_update(self, node, update_service, settings): diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 5a9d176d83..de024cb162 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -25,6 +25,7 @@ from ironic.conductor import task_manager from ironic.conductor import utils as manager_utils from ironic.conf import CONF +from ironic.drivers.modules import deploy_utils from ironic.drivers.modules.redfish import firmware as redfish_fw from ironic.drivers.modules.redfish import firmware_utils from ironic.drivers.modules.redfish import utils as redfish_utils @@ -1336,3 +1337,138 @@ def test__validate_resources_stability_badrequest_error(self): self.assertRaises(exception.RedfishError, firmware._validate_resources_stability, task.node) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_uses_configured_timeout(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test BMC 
firmware update uses configured timeout.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify configured timeout is used for BMC update + mock_reboot_to_finish.assert_called_once_with( + task, timeout=CONF.redfish.firmware_update_bmc_timeout, + disable_ramdisk=True) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_uses_bmc_constant(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test BMC firmware update detection works with BMC constant.""" + settings = [{'component': redfish_utils.BMC, + 'url': 'http://bmc/v1.0.0'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify configured timeout is used + mock_reboot_to_finish.assert_called_once_with( + task, timeout=CONF.redfish.firmware_update_bmc_timeout, + disable_ramdisk=True) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_non_bmc_uses_wait_parameter(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test non-BMC firmware update uses wait parameter.""" + settings = 
[{'component': 'bios', 'url': 'http://bios/v1.0.0', + 'wait': 120}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify wait parameter is used for non-BMC update + mock_reboot_to_finish.assert_called_once_with( + task, timeout=120, disable_ramdisk=True) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_non_bmc_no_wait_parameter(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test non-BMC firmware update without wait parameter uses None.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify None timeout is used for non-BMC without wait parameter + mock_reboot_to_finish.assert_called_once_with( + task, timeout=None, disable_ramdisk=True) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_mixed_components_with_bmc(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test mixed component update with BMC and explicit wait uses wait.""" + settings = [ + {'component': 'bios', 'url': 'http://bios/v1.0.0', 'wait': 120}, + 
{'component': 'bmc', 'url': 'http://bmc/v1.0.0', 'wait': 60} + ] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify explicit wait parameter takes precedence over BMC timeout + mock_reboot_to_finish.assert_called_once_with( + task, timeout=120, + disable_ramdisk=True) + + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_with_explicit_wait(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags, + mock_reboot_to_finish): + """Test BMC update with explicit wait uses wait, not BMC timeout.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'wait': 90}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.firmware.update(task, settings) + + # Verify explicit wait parameter takes precedence over BMC timeout + mock_reboot_to_finish.assert_called_once_with( + task, timeout=90, disable_ramdisk=True) diff --git a/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml b/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml new file mode 100644 index 0000000000..ceec71400f --- /dev/null +++ b/releasenotes/notes/bmc-firmware-timeout-fix-328e09ff98f9348f.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes BMC firmware update timeout issues by using an extended timeout + (default 300 seconds) for BMC firmware updates to handle BMC transitional + states during the firmware update process, unless a different timeout is + specified by the operator.
From 10b2e9a6d33bac78189da9ff9c6d54ebf4ed2772 Mon Sep 17 00:00:00 2001 From: Afonne-CID Date: Mon, 10 Nov 2025 15:21:49 +0100 Subject: [PATCH 05/15] Filter null NIC firmware versions from cache Treat absent firmware package version as non-cacheable to avoid NOT NULL database constraint violation. Closes-Bug: #2130990 Change-Id: Ic2efaa0d53b6923908112c937957a60aa4f1ad9d Signed-off-by: Afonne-CID (cherry picked from commit 5563e5275ff112bbbe363b0683b77cad6a9fecda) --- ironic/drivers/modules/redfish/firmware.py | 2 + .../drivers/modules/redfish/test_firmware.py | 58 +++++++++++++++++++ ...re-versions-in-cache-82a68b4df96591f7.yaml | 6 ++ 3 files changed, 66 insertions(+) create mode 100644 releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 711530561d..55d9e38e56 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -180,6 +180,8 @@ def retrieve_nic_components(self, task, system): for net_adp in chassis.network_adapters.get_members(): for net_adp_ctrl in net_adp.controllers: fw_pkg_v = net_adp_ctrl.firmware_package_version + if not fw_pkg_v: + continue net_adp_fw = {'component': redfish_utils.NIC_COMPONENT_PREFIX + net_adp.identity, 'current_version': fw_pkg_v} nic_list.append(net_adp_fw) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index de024cb162..90243fc7a4 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -353,6 +353,64 @@ def test_retrieve_nic_components_sushy_bad_request_error( 'NetworkAdapters' in str(call)] self.assertEqual(len(debug_calls), 0) + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + 
@mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + @mock.patch.object(objects, 'FirmwareComponent', spec_set=True, + autospec=True) + def test_retrieve_nic_components_invalid_firmware_version( + self, fw_cmp_mock, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that NIC components with missing versions are skipped.""" + for invalid_version in [None, ""]: + fw_cmp_list.reset_mock() + fw_cmp_mock.reset_mock() + log_mock.reset_mock() + + create_list = [{'component': 'bios', 'current_version': 'v1.0.0'}, + {'component': 'bmc', 'current_version': 'v1.0.0'}] + fw_cmp_list.sync_firmware_components.return_value = ( + create_list, [], [] + ) + + bios_component = {'component': 'bios', + 'current_version': 'v1.0.0', + 'node_id': self.node.id} + + bmc_component = {'component': 'bmc', 'current_version': 'v1.0.0', + 'node_id': self.node.id} + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + manager_mock.return_value.firmware_version = "v1.0.0" + system_mock.return_value.bios_version = "v1.0.0" + + netadp_ctrl = mock.MagicMock() + netadp_ctrl.firmware_package_version = invalid_version + netadp = mock.MagicMock() + netadp.identity = 'NIC1' + netadp.controllers = [netadp_ctrl] + net_adapters = mock.MagicMock() + net_adapters.get_members.return_value = [netadp] + chassis_mock.return_value.network_adapters = net_adapters + task.driver.firmware.cache_firmware_components(task) + + fw_cmp_list.sync_firmware_components.assert_called_once_with( + task.context, task.node.id, + [{'component': 'bios', 'current_version': 'v1.0.0'}, + {'component': 'bmc', 'current_version': 'v1.0.0'}]) + + fw_cmp_calls = [ + mock.call(task.context, **bios_component), + mock.call().create(), + mock.call(task.context, **bmc_component), + mock.call().create() + ] + 
fw_cmp_mock.assert_has_calls(fw_cmp_calls) + log_mock.warning.assert_not_called() + @mock.patch.object(redfish_utils, 'LOG', autospec=True) @mock.patch.object(redfish_utils, '_get_connection', autospec=True) def test_missing_updateservice(self, conn_mock, log_mock): diff --git a/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml b/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml new file mode 100644 index 0000000000..3df16f94b8 --- /dev/null +++ b/releasenotes/notes/filter-null-nic-firmware-versions-in-cache-82a68b4df96591f7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + NIC firmware components with null or empty firmware package versions + are now filtered out during caching to avoid NOT NULL database + constraint violations. From df4022261f31f21996516d9f305991059f1289c7 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 7 Nov 2025 05:06:12 +1000 Subject: [PATCH 06/15] Make post-firmware-update reboot conditional on component On most hardware platforms, each firmware component that can be updated has different reboot requirements. In addition to this some platforms are particularly sensitive to reboots happening at the expected time. 
This change attempts to make the reboot behavior dependent on the component being updated in the _execute_firmware_update method, so it works for multi-component scenarios. Assisted-By: Claude Code Sonnet 4.5 Change-Id: Ie4fe72406e3aedb8af246703f13f41e31866f58c Signed-off-by: Jacob Anders Signed-off-by: Iury Gregory Melo Ferreira (cherry picked from commit d90824a394fd07e1624602b748b1b3c42963cf9c) --- ironic/conf/redfish.py | 22 + ironic/drivers/modules/redfish/firmware.py | 772 ++++++++++++++++-- ironic/drivers/modules/redfish/utils.py | 24 + .../drivers/modules/redfish/test_firmware.py | 673 ++++++++++++--- ...ditional-reboot-fwup-6ca14573a2bebba0.yaml | 10 + 5 files changed, 1339 insertions(+), 162 deletions(-) create mode 100644 releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index c4b9c05983..8492aa3775 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -123,6 +123,28 @@ 'BMC firmware updates may need extended time to handle ' 'BMC transitional states during the firmware update ' 'process.')), + cfg.IntOpt('firmware_update_reboot_delay', + min=0, + default=300, + help=_('Default wait time (in seconds) for component-specific ' + 'firmware update operations. Used for: BIOS firmware ' + 'update wait before reboot, BMC firmware version check ' + 'timeout, and NIC firmware task completion timeout.')), + cfg.IntOpt('firmware_update_bmc_version_check_interval', + min=0, + default=30, + help=_('Interval (in seconds) for checking BMC firmware ' + 'version after BMC firmware update. Used to verify ' + 'if BMC firmware has been successfully applied.')), + cfg.IntOpt('firmware_update_nic_starting_wait', + min=0, + default=30, + help=_('Time (in seconds) to wait for a NIC firmware update ' + 'task to progress beyond the STARTING state before ' + 'triggering a reboot. Some NICs need a reboot to ' + 'start applying firmware, while others can begin ' + 'immediately. 
This timeout helps determine which ' + 'behavior the hardware exhibits.')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 55d9e38e56..afc97c6f08 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -18,6 +18,7 @@ from oslo_utils import timeutils import sushy +from ironic.common import async_steps from ironic.common import exception from ironic.common.i18n import _ from ironic.common import metrics_utils @@ -220,35 +221,298 @@ def update(self, task, settings): {'node_uuid': node.uuid, 'settings': settings}) self._execute_firmware_update(node, update_service, settings) - fw_upd = settings[0] + # Store updated settings and save + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + + # Return wait state to keep the step active and let polling handle + # the monitoring and eventual completion/reboot + return async_steps.get_return_state(node) + + def _setup_bmc_update_monitoring(self, node, fw_upd): + """Set up monitoring for BMC firmware update. + + BMC updates do not reboot immediately. Instead, we check the BMC + version periodically. If the version changed, we continue without + reboot. If timeout expires without version change, we trigger a reboot. 
+ + :param node: the Ironic node object + :param fw_upd: firmware update settings dict + """ + # Record current BMC version before update + try: + system = redfish_utils.get_system(node) + manager = redfish_utils.get_manager(node, system) + current_bmc_version = manager.firmware_version + node.set_driver_internal_info( + 'bmc_fw_version_before_update', current_bmc_version) + LOG.debug('BMC version before update for node %(node)s: ' + '%(version)s', + {'node': node.uuid, 'version': current_bmc_version}) + except Exception as e: + LOG.warning('Could not read BMC version before update for ' + 'node %(node)s: %(error)s', + {'node': node.uuid, 'error': e}) + + node.set_driver_internal_info( + 'bmc_fw_check_start_time', + str(timeutils.utcnow().isoformat())) + + LOG.info('BMC firmware update for node %(node)s. ' + 'Monitoring BMC version instead of immediate reboot.', + {'node': node.uuid}) + + # Use wait_interval or default reboot delay wait_interval = fw_upd.get('wait') + if wait_interval is None: + wait_interval = CONF.redfish.firmware_update_reboot_delay + fw_upd['wait'] = wait_interval + # Set wait_start_time so polling can detect when task monitor + # becomes unresponsive and transition to version checking + fw_upd['wait_start_time'] = str(timeutils.utcnow().isoformat()) + # Mark this as a BMC update so we can handle timeouts properly + fw_upd['component_type'] = redfish_utils.BMC + + # BMC: Set async flags without immediate reboot + deploy_utils.set_async_step_flags( + node, + reboot=False, + polling=True + ) + + def _setup_nic_update_monitoring(self, node): + """Set up monitoring for NIC firmware update. + + NIC firmware behavior varies by hardware. Some NICs update immediately, + some need reboot to start. The handler will wait 30s and decide whether + to reboot. 
- # Check if BMC firmware is being updated - BMC may need extended - # timeout after firmware update as it transitions through states - has_bmc_update = any( - setting.get('component', '') == redfish_utils.BMC - for setting in settings + :param node: the Ironic node object + """ + LOG.info('NIC firmware update for node %(node)s. Will monitor ' + 'task state to determine if reboot is needed.', + {'node': node.uuid}) + + # NIC: Set async flags with reboot enabled + # (reboot will be triggered conditionally if hardware needs it) + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True ) - # Use extended timeout for BMC updates to handle transitional states - if has_bmc_update and wait_interval is None: - reboot_timeout = CONF.redfish.firmware_update_bmc_timeout - LOG.info('BMC firmware update detected, using extended reboot ' - 'timeout of %(timeout)s seconds for node %(node)s to ' - 'handle BMC transitional states', - {'timeout': reboot_timeout, 'node': node.uuid}) - else: - reboot_timeout = wait_interval + def _setup_bios_update_monitoring(self, node): + """Set up monitoring for BIOS firmware update. + + BIOS updates require a reboot to apply, so we trigger it as soon + as the update task begins rather than waiting for completion. + + :param node: the Ironic node object + """ + LOG.info('BIOS firmware update for node %(node)s. Will reboot ' + 'when update task starts.', + {'node': node.uuid}) + + # BIOS: Set async flags with reboot enabled + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True + ) + + def _setup_default_update_monitoring(self, node, fw_upd): + """Set up monitoring for unknown/default firmware component types. + + Default behavior for unknown component types uses standard reboot + handling with configurable wait interval. 
+ :param node: the Ironic node object + :param fw_upd: firmware update settings dict + """ + component = fw_upd.get('component', '') + LOG.warning( + 'Unknown component type %(component)s for node %(node)s. ' + 'Using default firmware update behavior.', + {'component': component, 'node': node.uuid}) + + wait_interval = fw_upd.get('wait') + if wait_interval is None: + wait_interval = ( + node.driver_info.get('firmware_update_unresponsive_bmc_wait') + or CONF.redfish.firmware_update_wait_unresponsive_bmc) + fw_upd['wait'] = wait_interval + + # Default: Set async flags with reboot enabled deploy_utils.set_async_step_flags( node, reboot=True, - skip_current_step=True, polling=True ) - return deploy_utils.reboot_to_finish_step(task, timeout=reboot_timeout, - disable_ramdisk=True) + + def _get_current_bmc_version(self, node): + """Get current BMC firmware version. + + Note: BMC may be temporarily unresponsive after firmware update. + Expected exceptions (timeouts, connection refused, HTTP errors) are + caught and logged, returning None to indicate version unavailable. + + :param node: the Ironic node object + :returns: Current BMC firmware version string, or None if BMC + is unresponsive/inaccessible + """ + try: + system = redfish_utils.get_system(node) + manager = redfish_utils.get_manager(node, system) + return manager.firmware_version + except (exception.RedfishError, + exception.RedfishConnectionError, + sushy.exceptions.SushyError) as e: + # BMC unresponsiveness is expected after firmware update + # (timeouts, connection refused, HTTP 4xx/5xx errors) + LOG.debug('BMC temporarily unresponsive for node %(node)s: ' + '%(error)s', {'node': node.uuid, 'error': e}) + return None + + def _handle_bmc_update_completion(self, task, update_service, + settings, current_update): + """Handle BMC firmware update completion with version checking. + + For BMC updates, we don't reboot immediately. Instead, we check + the BMC version periodically. 
If the version changed, we continue + without reboot. If timeout expires without version change, we trigger + a reboot. + + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + # Try to get current BMC version + # Note: BMC may be unresponsive after firmware update - expected + current_version = self._get_current_bmc_version(node) + version_before = node.driver_internal_info.get( + 'bmc_fw_version_before_update') + + # If we can read the version and it changed, update is complete + if (current_version is not None + and version_before is not None + and current_version != version_before): + LOG.info( + 'BMC firmware version for node %(node)s changed from ' + '%(old)s to %(new)s. Update complete. Continuing without ' + 'reboot.', + {'node': node.uuid, 'old': version_before, + 'new': current_version}) + node.del_driver_internal_info('bmc_fw_check_start_time') + node.del_driver_internal_info('bmc_fw_version_before_update') + node.save() + self._continue_updates(task, update_service, settings) + return + + # Check if we've been checking for too long + check_start_time = node.driver_internal_info.get( + 'bmc_fw_check_start_time') + + if check_start_time: + check_start = timeutils.parse_isotime(check_start_time) + elapsed_time = timeutils.utcnow(True) - check_start + timeout = current_update.get( + 'wait', CONF.redfish.firmware_update_reboot_delay) + if elapsed_time.seconds >= timeout: + # Timeout: version didn't change or BMC unresponsive + if (current_version is not None + and version_before is not None + and current_version == version_before): + # Version didn't change - skip reboot + LOG.info( + 'BMC firmware version for node %(node)s did not ' + 'change (still %(version)s). Update appears to be ' + 'a no-op or does not require reboot. 
Continuing ' + 'without reboot.', + {'node': node.uuid, 'version': current_version}) + else: + # Version changed or we can't tell - reboot to apply + LOG.warning( + 'BMC firmware version check timeout expired for ' + 'node %(node)s after %(elapsed)s seconds. ' + 'Will reboot to complete firmware update.', + {'node': node.uuid, 'elapsed': elapsed_time.seconds}) + # Mark that reboot is needed + node.set_driver_internal_info( + 'firmware_reboot_requested', True) + # Enable reboot flag now that we're ready to reboot + deploy_utils.set_async_step_flags( + node, + reboot=True, + polling=True + ) + + node.del_driver_internal_info('bmc_fw_check_start_time') + node.del_driver_internal_info('bmc_fw_version_before_update') + node.save() + self._continue_updates(task, update_service, settings) + return + + # Continue checking - set wait to check again + wait_interval = ( + CONF.redfish.firmware_update_bmc_version_check_interval) + current_update['wait'] = wait_interval + current_update['wait_start_time'] = str( + timeutils.utcnow().isoformat()) + current_update['bmc_version_checking'] = True + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + + LOG.debug('BMC firmware version check continuing for node %(node)s. ' + 'Will check again in %(interval)s seconds.', + {'node': node.uuid, 'interval': wait_interval}) + + def _handle_nic_update_completion(self, task, update_service, settings, + current_update): + """Handle NIC firmware update completion. + + For NIC updates, check if a reboot is needed based on whether the + task went through the Running state (needs reboot after completion) + or if reboot already occurred during the Starting phase. 
+ + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + # Check if reboot is needed (task went to Running state) + needs_reboot = current_update.get( + 'nic_needs_post_completion_reboot', False) + + if needs_reboot: + LOG.info( + 'NIC firmware update task completed for node ' + '%(node)s. Reboot required to apply update.', + {'node': node.uuid}) + + # Mark that reboot is needed + node.set_driver_internal_info( + 'firmware_reboot_requested', True) + + # Clean up flags + current_update.pop('nic_needs_post_completion_reboot', None) + current_update.pop('nic_starting_timestamp', None) + current_update.pop('nic_reboot_triggered', None) + else: + LOG.info( + 'NIC firmware update task completed for node ' + '%(node)s. Reboot already occurred during update ' + 'start.', {'node': node.uuid}) + # Clean up all NIC-related flags + current_update.pop('nic_starting_timestamp', None) + current_update.pop('nic_reboot_triggered', None) + + self._continue_updates(task, update_service, settings) def _execute_firmware_update(self, node, update_service, settings): """Executes the next firmware update to the node @@ -261,6 +525,8 @@ def _execute_firmware_update(self, node, update_service, settings): to be executed. 
""" fw_upd = settings[0] + # Store power timeout to use on reboot operations + fw_upd['power_timeout'] = CONF.redfish.firmware_update_reboot_delay # NOTE(janders) try to get the collection of Systems on the BMC # to determine if there may be more than one System try: @@ -299,27 +565,9 @@ def _execute_firmware_update(self, node, update_service, settings): {'node': node.uuid, 'error': e.message}) raise exception.RedfishError(error=e) - # NOTE(iurygregory): In case we are doing firmware updates we need to - # account for unresponsive BMC, in this case we wait for a set of - # minutes before proceeding to the power actions. - # In case the node has firmware_update_unresponsive_bmc_wait set we - # give priority over the configuration option. - wait_unres_bmc = ( - node.driver_info.get('firmware_update_unresponsive_bmc_wait') - or CONF.redfish.firmware_update_wait_unresponsive_bmc - ) - LOG.debug('BMC firmware update in progress. Waiting %(wait_time)s ' - 'seconds before proceeding to reboot the node %(node_uuid)s ' - 'to complete the step', {'node_uuid': node.uuid, - 'wait_time': wait_unres_bmc}) # Store task monitor URI for periodic task polling - # TODO(iurygregory): Improve the logic here to identify if the BMC - # is back, so we don't have to unconditionally wait. - # The wait_unres_bmc will be the maximum time to wait. - time.sleep(wait_unres_bmc) - LOG.debug('Wait completed. 
Proceeding to reboot the node ' - '%(node_uuid)s to complete the step.', - {'node_uuid': node.uuid}) + # NOTE(janders): Component-specific wait/reboot behavior is now + # handled by the update() method and periodic polling, not here fw_upd['task_monitor'] = task_monitor.task_monitor_uri node.set_driver_internal_info('redfish_fw_updates', settings) @@ -332,6 +580,19 @@ def _execute_firmware_update(self, node, update_service, settings): fw_clean.append(cleanup) node.set_driver_internal_info('firmware_cleanup', fw_clean) + component = fw_upd.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if component_type == redfish_utils.BMC: + self._setup_bmc_update_monitoring(node, fw_upd) + elif component_type == redfish_utils.NIC: + self._setup_nic_update_monitoring(node) + elif component_type == redfish_utils.BIOS: + self._setup_bios_update_monitoring(node) + else: + self._setup_default_update_monitoring(node, fw_upd) + + def _validate_resources_stability(self, node): """Validate that BMC resources are consistently available. 
@@ -458,10 +719,27 @@ def _continue_updates(self, task, update_service, settings): return if len(settings) == 1: + # Last firmware update - check if reboot is needed + reboot_requested = node.driver_internal_info.get( + 'firmware_reboot_requested', False) + self._clear_updates(node) LOG.info('Firmware updates completed for node %(node)s', {'node': node.uuid}) + + # If reboot was requested (e.g., for BMC timeout or NIC + # completion), trigger the reboot before notifying conductor + if reboot_requested: + LOG.info('Rebooting node %(node)s to apply firmware updates', + {'node': node.uuid}) + manager_utils.node_power_action(task, states.REBOOT) + + LOG.debug('Validating BMC responsiveness before resuming ' + 'conductor operations for node %(node)s', + {'node': node.uuid}) + self._validate_resources_stability(node) + if task.node.clean_step: manager_utils.notify_conductor_resume_clean(task) elif task.node.service_step: @@ -470,12 +748,40 @@ def _continue_updates(self, task, update_service, settings): manager_utils.notify_conductor_resume_deploy(task) else: + # Validate BMC resources are stable before continuing next update + LOG.info('Validating BMC responsiveness before continuing ' + 'to next firmware update for node %(node)s', + {'node': node.uuid}) + self._validate_resources_stability(node) + settings.pop(0) self._execute_firmware_update(node, update_service, settings) node.save() - manager_utils.node_power_action(task, states.REBOOT) + + # Only reboot if the component code requested it. + if task.node.clean_step: + reboot_field = async_steps.CLEANING_REBOOT + elif task.node.deploy_step: + reboot_field = async_steps.DEPLOYMENT_REBOOT + elif task.node.service_step: + reboot_field = async_steps.SERVICING_REBOOT + else: + reboot_field = None + + # Default to reboot=True for backwards compatibility. 
+ should_reboot = (node.driver_internal_info.get(reboot_field, True) + if reboot_field else True) + + if should_reboot: + power_timeout = settings[0].get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + else: + LOG.debug('Component requested no immediate reboot for node ' + '%(node)s. Continuing with async polling.', + {'node': node.uuid}) def _clear_updates(self, node): """Clears firmware updates artifacts @@ -490,6 +796,7 @@ def _clear_updates(self, node): firmware_utils.cleanup(node) node.del_driver_internal_info('redfish_fw_updates') node.del_driver_internal_info('firmware_cleanup') + node.del_driver_internal_info('firmware_reboot_requested') node.save() @METRICS.timer('RedfishFirmware._query_update_failed') @@ -528,6 +835,329 @@ def _query_update_status(self, task, manager, context): """Periodic job to check firmware update tasks.""" self._check_node_redfish_firmware_update(task) + def _handle_task_completion(self, task, sushy_task, messages, + update_service, settings, current_update): + """Handle firmware update task completion. 
+ + :param task: a TaskManager instance + :param sushy_task: the sushy task object + :param messages: list of task messages + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED + and sushy_task.task_status in + [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): + LOG.info('Firmware update task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s.', + {'node': node.uuid, + 'firmware_image': current_update['url'], + 'messages': ", ".join(messages)}) + + # Component-specific post-update handling + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if component_type == redfish_utils.BMC: + # BMC: Start version checking instead of immediate reboot + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + elif component_type == redfish_utils.NIC: + # NIC: Handle completion with appropriate reboot behavior + self._handle_nic_update_completion( + task, update_service, settings, current_update) + elif component_type == redfish_utils.BIOS: + # BIOS: Reboot was already triggered when task started, + # just continue with next update + LOG.info('BIOS firmware update task completed for node ' + '%(node)s. System was already rebooted. ' + 'Proceeding with continuation.', + {'node': node.uuid}) + # Clean up the reboot trigger flag + current_update.pop('bios_reboot_triggered', None) + self._continue_updates(task, update_service, settings) + else: + # Default: continue as before + self._continue_updates(task, update_service, settings) + else: + error_msg = (_('Firmware update failed for node %(node)s, ' + 'firmware %(firmware_image)s. 
' + 'Error: %(errors)s') % + {'node': node.uuid, + 'firmware_image': current_update['url'], + 'errors': ", ".join(messages)}) + + self._clear_updates(node) + if task.node.clean_step: + manager_utils.cleaning_error_handler(task, error_msg) + elif task.node.deploy_step: + manager_utils.deploying_error_handler(task, error_msg) + elif task.node.service_step: + manager_utils.servicing_error_handler(task, error_msg) + + def _handle_nic_task_starting(self, task, task_monitor, settings, + current_update): + """Handle NIC firmware update task when it starts. + + NIC firmware behavior varies by hardware: + - Some NICs need reboot to START applying (task stays at Starting) + - Some NICs can start immediately but need reboot to APPLY (goes to + Running, then needs reboot after completion) + + This method waits for the configured time + (CONF.redfish.firmware_update_nic_starting_wait) to determine which + type: + - If still Starting after wait time → trigger reboot to start + - If moves to Running → let it finish, reboot will happen after + completion + + :param task: a TaskManager instance + :param task_monitor: the sushy task monitor + :param settings: firmware update settings + :param current_update: the current firmware update being processed + :returns: True if should stop polling, False to continue + """ + node = task.node + + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + + try: + sushy_task = task_monitor.get_task() + task_state = sushy_task.task_state + + LOG.debug('NIC update task state for node %(node)s: %(state)s', + {'node': node.uuid, 'state': task_state}) + + # If task is Running, mark that reboot will be needed after + # completion and let it continue + if task_state == sushy.TASK_STATE_RUNNING: + LOG.debug('NIC update task for node %(node)s is running. 
' + 'Will wait for completion then reboot.', + {'node': node.uuid}) + # Clear flags since we're past the starting phase + current_update.pop('nic_starting_timestamp', None) + current_update.pop('nic_reboot_triggered', None) + # Mark that reboot will be needed after completion + current_update['nic_needs_post_completion_reboot'] = True + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + return False # Continue polling until completion + + # If task is in STARTING, check if we need to wait or reboot + if task_state == sushy.TASK_STATE_STARTING: + # Check if we already triggered a reboot + if current_update.get('nic_reboot_triggered'): + LOG.debug('NIC firmware update for node %(node)s: ' + 'reboot already triggered, waiting for task ' + 'to progress.', {'node': node.uuid}) + return False # Continue polling + + starting_time = current_update.get('nic_starting_timestamp') + + if not starting_time: + # First time seeing STARTING - record timestamp + current_update['nic_starting_timestamp'] = str( + timeutils.utcnow().isoformat()) + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + LOG.debug('NIC firmware update task for node %(node)s ' + 'is in STARTING state. Waiting to determine if ' + 'reboot is needed to start update.', + {'node': node.uuid}) + return False # Keep polling + + # Check if configured wait time has elapsed + start_time = timeutils.parse_isotime(starting_time) + elapsed = timeutils.utcnow(True) - start_time + nic_starting_wait = ( + CONF.redfish.firmware_update_nic_starting_wait) + + if elapsed.seconds < nic_starting_wait: + # Still within wait window, keep waiting + LOG.debug('NIC update for node %(node)s still in ' + 'STARTING after %(elapsed)s seconds. 
' + 'Waiting...', + {'node': node.uuid, + 'elapsed': elapsed.seconds}) + return False # Keep polling + + # Wait time elapsed and still STARTING - need reboot to start + LOG.info('NIC firmware update task for node %(node)s ' + 'remained in STARTING state for %(wait)s+ seconds. ' + 'Hardware requires reboot to start update. ' + 'Triggering reboot.', + {'node': node.uuid, 'wait': nic_starting_wait}) + + # Mark that we triggered a reboot to prevent repeat reboots + current_update['nic_reboot_triggered'] = True + # Clean up timestamp + current_update.pop('nic_starting_timestamp', None) + node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + + # Trigger the reboot to start update + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + + LOG.info('Reboot initiated for node %(node)s to start ' + 'NIC firmware update', {'node': node.uuid}) + return True # Stop polling, reboot triggered + + except Exception as e: + LOG.warning('Unable to check NIC task state for node ' + '%(node)s: %(error)s. Will retry.', + {'node': node.uuid, 'error': e}) + + return False # Continue polling on error + + def _handle_bios_task_starting(self, task, task_monitor, settings, + current_update): + """Handle BIOS firmware update task when it starts. + + BIOS updates require a reboot to apply the firmware, so we trigger + the reboot as soon as the update task reaches STARTING state rather + than waiting for task completion. 
+ + :param task: a TaskManager instance + :param task_monitor: the sushy task monitor + :param settings: firmware update settings + :param current_update: the current firmware update being processed + :returns: True if reboot was triggered, False otherwise + """ + node = task.node + + if current_update.get('bios_reboot_triggered'): + # Already triggered, just keep polling + return False + + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + + try: + sushy_task = task_monitor.get_task() + LOG.debug('BIOS update task state for node %(node)s: ' + '%(state)s', + {'node': node.uuid, + 'state': sushy_task.task_state}) + + # Check if task has started (STARTING state or beyond) + # TaskState can be: New, Starting, Running, Suspended, + # Interrupted, Pending, Stopping, Completed, Killed, + # Exception, Service, Cancelling, Cancelled + if sushy_task.task_state in [sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_PENDING]: + LOG.info('BIOS firmware update task has started for ' + 'node %(node)s (state: %(state)s). ' + 'Triggering reboot to apply update.', + {'node': node.uuid, + 'state': sushy_task.task_state}) + + # Mark reboot as triggered to avoid repeated reboots + current_update['bios_reboot_triggered'] = True + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + + # Trigger the reboot + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + + LOG.info('Reboot initiated for node %(node)s to apply ' + 'BIOS firmware update', + {'node': node.uuid}) + return True + except Exception as e: + LOG.warning('Unable to check BIOS task state for node ' + '%(node)s: %(error)s. Will retry.', + {'node': node.uuid, 'error': e}) + + return False + + def _handle_wait_completion(self, task, update_service, settings, + current_update): + """Handle firmware update wait completion. 
+ + :param task: a TaskManager instance + :param update_service: the sushy firmware update service + :param settings: firmware update settings + :param current_update: the current firmware update being processed + """ + node = task.node + + # Upgrade lock at the start since we may modify driver_internal_info + task.upgrade_lock() + + # Check if this is BMC version checking + if current_update.get('bmc_version_checking'): + current_update.pop('bmc_version_checking', None) + node.set_driver_internal_info( + 'redfish_fw_updates', settings) + node.save() + # Continue BMC version checking + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + elif current_update.get('component_type') == redfish_utils.BMC: + # BMC update wait expired - check if task is still running + # before transitioning to version checking + task_still_running = False + try: + task_monitor = redfish_utils.get_task_monitor( + node, current_update['task_monitor']) + if task_monitor.is_processing: + task_still_running = True + LOG.debug('BMC firmware update wait expired but task ' + ' still processing for node %(node)s. ' + 'Continuing to monitor task completion.', + {'node': node.uuid}) + except exception.RedfishConnectionError as e: + LOG.debug('Unable to communicate with task monitor for node ' + '%(node)s during wait completion: %(error)s. ' + 'BMC may be resetting, will transition to version ' + 'checking.', {'node': node.uuid, 'error': e}) + except exception.RedfishError as e: + LOG.debug('Task monitor unavailable for node %(node)s: ' + '%(error)s. Task may have completed, transitioning ' + 'to version checking.', + {'node': node.uuid, 'error': e}) + + if task_still_running: + # Task is still running, continue to monitor task completion + # Don't transition to version checking yet. 
+ node.set_driver_internal_info('redfish_fw_updates', settings) + node.save() + return + + # Task completed, deleted or BMC unavailable + # Transition to version checking + LOG.info('BMC firmware update wait expired for node %(node)s. ' + 'Task completed or unavailable. Transitioning to version ' + 'checking mode.', + {'node': node.uuid}) + self._handle_bmc_update_completion( + task, update_service, settings, current_update) + else: + # Regular wait completion - mark reboot needed if this is the + # last update. Note: BIOS components reboot immediately when + # task starts, so they won't use this path. + if len(settings) == 1: + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + # For default/unknown components, reboot may be needed + if component_type is None: + node.set_driver_internal_info( + 'firmware_reboot_requested', True) + node.save() + # Continue with updates + self._continue_updates(task, update_service, settings) + @METRICS.timer('RedfishFirmware._check_node_redfish_firmware_update') def _check_node_redfish_firmware_update(self, task): """Check the progress of running firmware update on a node.""" @@ -563,7 +1193,9 @@ def _check_node_redfish_firmware_update(self, task): current_update.pop('wait', None) current_update.pop('wait_start_time', None) - self._continue_updates(task, update_service, settings) + # Handle wait completion + self._handle_wait_completion( + task, update_service, settings, current_update) else: LOG.debug('Continuing to wait after firmware update ' '%(firmware_image)s on node %(node)s. 
' @@ -595,6 +1227,27 @@ def _check_node_redfish_firmware_update(self, task): self._continue_updates(task, update_service, settings) return + # Special handling for BIOS and NIC updates + component = current_update.get('component', '') + component_type = redfish_utils.get_component_type(component) + + if task_monitor.is_processing and component_type == redfish_utils.BIOS: + # For BIOS, check if task has reached STARTING state + # and trigger reboot immediately + if self._handle_bios_task_starting(task, task_monitor, settings, + current_update): + return # Reboot triggered, done + # Task is still processing, keep polling + return + + if task_monitor.is_processing and component_type == redfish_utils.NIC: + # For NIC, wait 30s to see if hardware needs reboot + if self._handle_nic_task_starting(task, task_monitor, settings, + current_update): + return # Reboot triggered, done + # Task is still processing (or waiting), keep polling + return + if not task_monitor.is_processing: # The last response does not necessarily contain a Task, # so get it @@ -607,38 +1260,17 @@ def _check_node_redfish_firmware_update(self, task): sushy_task.parse_messages() if sushy_task.messages is not None: - messages = [m.message for m in sushy_task.messages] + for m in sushy_task.messages: + msg = m.message + if not msg or msg.lower() in ['unknown', 'unknown error']: + msg = m.message_id + if msg: + messages.append(msg) task.upgrade_lock() - if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED - and sushy_task.task_status in - [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): - LOG.info('Firmware update task completed for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s. 
' - 'Starting BMC response validation.', - {'node': node.uuid, - 'firmware_image': current_update['url'], - 'messages': ", ".join(messages)}) - - # Validate BMC resources are consistently available - self._validate_resources_stability(node) - self._continue_updates(task, update_service, settings) - else: - error_msg = (_('Firmware update failed for node %(node)s, ' - 'firmware %(firmware_image)s. ' - 'Error: %(errors)s') % - {'node': node.uuid, - 'firmware_image': current_update['url'], - 'errors': ", ".join(messages)}) - - self._clear_updates(node) - if task.node.clean_step: - manager_utils.cleaning_error_handler(task, error_msg) - elif task.node.deploy_step: - manager_utils.deploying_error_handler(task, error_msg) - elif task.node.service_step: - manager_utils.servicing_error_handler(task, error_msg) - + self._handle_task_completion(task, sushy_task, messages, + update_service, settings, + current_update) else: LOG.debug('Firmware update in progress for node %(node)s, ' 'firmware %(firmware_image)s.', diff --git a/ironic/drivers/modules/redfish/utils.py b/ironic/drivers/modules/redfish/utils.py index 1504379a6a..af97cc3c63 100644 --- a/ironic/drivers/modules/redfish/utils.py +++ b/ironic/drivers/modules/redfish/utils.py @@ -94,11 +94,35 @@ NIC_COMPONENT_PREFIX = "nic:" "Prefix for NIC Firmware Components" +NIC = "nic" +"NIC Firmware Component type" FIRMWARE_COMPONENTS = [BIOS, BMC] """Firmware Components available to update""" +def get_component_type(component): + """Determine the type of firmware component. + + Note: This helper exists primarily to handle NIC components which use + a prefix pattern (e.g., 'nic:BCM57414', 'nic:adapter1') rather than + exact string matches. This centralizes the component type detection + logic that is used in multiple places (update initiation, task + monitoring, completion handling) and provides a single source of truth + for component type classification. 
+ + :param component: The component name from settings + :returns: One of 'bios', 'bmc', 'nic', or None + """ + if component == BIOS: + return BIOS + elif component == BMC: + return BMC + elif component.startswith(NIC_COMPONENT_PREFIX): + return NIC + return None + + def parse_driver_info(node): """Parse the information required for Ironic to connect to Redfish. diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 90243fc7a4..f3179eccd9 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -50,6 +50,41 @@ def setUp(self): self.node = obj_utils.create_test_node( self.context, driver='redfish', driver_info=INFO_DICT) + def _mock_exc_fwup_side_effect(self, firmware_interface, node, + update_service, settings_list): + """Helper to simulate _execute_firmware_update behavior. + + The real _execute_firmware_update: + 1. Adds a task_monitor field to the settings + 2. Calls component-specific setup methods + + This helper replicates that behavior for tests that mock + this method to avoid JSON serialization issues. 
+ + :param firmware_interface: The firmware interface instance (unused, + but passed by mock framework) + :param node: The node being updated + :param update_service: The update service + :param settings_list: The settings list + """ + settings_list[0]['task_monitor'] = '/redfish/v1/TaskService/Tasks/1' + + # Simulate component-specific setup that now happens inside + # _execute_firmware_update + fw_upd = settings_list[0] + component = fw_upd.get('component', '') + + # Call the actual setup method based on component type + # This ensures the driver_internal_info is set correctly + if component == redfish_utils.BMC: + firmware_interface._setup_bmc_update_monitoring(node, fw_upd) + elif component.startswith(redfish_utils.NIC_COMPONENT_PREFIX): + firmware_interface._setup_nic_update_monitoring(node) + elif component == redfish_utils.BIOS: + firmware_interface._setup_bios_update_monitoring(node) + else: + firmware_interface._setup_default_update_monitoring(node, fw_upd) + def test_get_properties(self): with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: @@ -837,12 +872,13 @@ def test__check_node_firmware_update_fail_servicing( interface._continue_updates.assert_not_called() @mock.patch.object(redfish_fw.RedfishFirmware, - '_validate_resources_stability', autospec=True) + '_handle_bmc_update_completion', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, - log_mock, validate_mock): + log_mock, + bmc_completion_mock): task_mock = mock.Mock() task_mock.task_state = sushy.TASK_STATE_COMPLETED task_mock.task_status = sushy.HEALTH_OK @@ -859,19 +895,23 @@ def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, task.upgrade_lock.assert_called_once_with() info_calls = [ mock.call('Firmware update task completed for 
node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s. ' - 'Starting BMC response validation.', + 'firmware %(firmware_image)s: %(messages)s.', {'node': self.node.uuid, 'firmware_image': 'https://bmc/v1.0.1', 'messages': 'Firmware update done'})] log_mock.info.assert_has_calls(info_calls) - validate_mock.assert_called_once() + # NOTE(iurygregory): _validate_resources_stability is now called + # in _continue_updates before power operations, not in + # _handle_task_completion - interface._continue_updates.assert_called_once_with( - task, get_us_mock.return_value, + # BMC updates now go through _handle_bmc_update_completion + bmc_completion_mock.assert_called_once_with( + interface, task, get_us_mock.return_value, [{'component': 'bmc', 'url': 'https://bmc/v1.0.1', - 'task_monitor': '/task/1'}] + 'task_monitor': '/task/1'}], + {'component': 'bmc', 'url': 'https://bmc/v1.0.1', + 'task_monitor': '/task/1'} ) @mock.patch.object(firmware_utils, 'download_to_temp', autospec=True) @@ -964,14 +1004,19 @@ def test_continue_update_waitting(self, log_mock): ] log_mock.debug.assert_has_calls(debug_call) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', autospec=True) - def test_continue_updates_last(self, cond_resume_clean_mock, log_mock): + def test_continue_updates_last(self, cond_resume_clean_mock, log_mock, + validate_mock): self._generate_new_driver_internal_info(['bmc']) task = self._test_continue_updates() cond_resume_clean_mock.assert_called_once_with(task) + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() info_call = [ mock.call('Firmware updates completed for node %(node)s', @@ -979,15 +1024,19 @@ def test_continue_updates_last(self, cond_resume_clean_mock, log_mock): ] log_mock.info.assert_has_calls(info_call) + 
@mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 'notify_conductor_resume_service', autospec=True) def test_continue_updates_last_service(self, cond_resume_service_mock, - log_mock): + log_mock, validate_mock): self._generate_new_driver_internal_info_service(['bmc']) task = self._test_continue_updates() cond_resume_service_mock.assert_called_once_with(task) + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() info_call = [ mock.call('Firmware updates completed for node %(node)s', @@ -995,12 +1044,15 @@ def test_continue_updates_last_service(self, cond_resume_service_mock, ] log_mock.info.assert_has_calls(info_call) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(manager_utils, 'node_power_action', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) def test_continue_updates_more_updates(self, get_system_collection_mock, node_power_action_mock, - log_mock): + log_mock, + validate_mock): cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 0, 'redfish') self._generate_new_driver_internal_info(['bmc', 'bios']) @@ -1028,12 +1080,18 @@ def test_continue_updates_more_updates(self, get_system_collection_mock, log_mock.debug.assert_has_calls(debug_calls) self.assertEqual( [{'component': 'bios', 'url': 'https://bios/v1.0.1', - 'task_monitor': '/task/2'}], + 'task_monitor': '/task/2', 'power_timeout': 300}], task.node.driver_internal_info['redfish_fw_updates']) update_service_mock.simple_update.assert_called_once_with( 'https://bios/v1.0.1') - task.node.save.assert_called_once_with() - node_power_action_mock.assert_called_once_with(task, states.REBOOT) + # NOTE(iurygregory): node.save() is called twice: + # 1. 
Inside _execute_firmware_update via setup methods + # 2. In _continue_updates after _execute_firmware_update returns + self.assertEqual(task.node.save.call_count, 2) + # Verify BMC validation was called before continuing to next update + validate_mock.assert_called_once_with(firmware, task.node) + node_power_action_mock.assert_called_once_with(task, states.REBOOT, + 300) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) @@ -1089,12 +1147,15 @@ def test__execute_firmware_update_targets(self, update_service_mock.simple_update.assert_called_once_with( 'https://bios/v1.0.1', targets=[mock.ANY]) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) - @mock.patch.object(time, 'sleep', autospec=True) - def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock, + def test__execute_firmware_update_unresponsive_bmc(self, get_sys_collec_mock, - system_mock): + system_mock, + manager_mock, + set_async_mock): cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 1, 'redfish') self._generate_new_driver_internal_info(['bmc']) @@ -1106,27 +1167,31 @@ def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock, system_collection_mock.get_members.return_value = resp_obj['Members'] get_sys_collec_mock.return_value = system_collection_mock + # Mock BMC version reading for setup + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + manager_mock.return_value = mock_manager + task_monitor_mock = mock.Mock() task_monitor_mock.task_monitor_uri = '/task/2' update_service_mock = mock.Mock() update_service_mock.simple_update.return_value = task_monitor_mock - firmware = redfish_fw.RedfishFirmware() - settings = [{'component': 'bmc', 'url': 
'https://bmc/v1.2.3'}] firmware._execute_firmware_update(self.node, update_service_mock, settings) - update_service_mock.simple_update.assert_called_once_with( 'https://bmc/v1.2.3') - sleep_mock.assert_called_once_with( - CONF.redfish.firmware_update_wait_unresponsive_bmc) - + # Verify BMC monitoring setup was called (internally by _execute) + set_async_mock.assert_called_once_with( + self.node, reboot=False, polling=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) - @mock.patch.object(time, 'sleep', autospec=True) def test__execute_firmware_update_unresponsive_bmc_node_override( - self, sleep_mock, get_sys_collec_mock, system_mock): + self, get_sys_collec_mock, system_mock, manager_mock, + set_async_mock): self._generate_new_driver_internal_info(['bmc']) # Set a specific value for firmware_update_unresponsive_bmc_wait for # the node @@ -1135,12 +1200,10 @@ def test__execute_firmware_update_unresponsive_bmc_node_override( d_info['firmware_update_unresponsive_bmc_wait'] = 1 self.node.driver_info = d_info self.node.save() - self.assertNotEqual( CONF.redfish.firmware_update_wait_unresponsive_bmc, self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') ) - with open( 'ironic/tests/json_samples/systems_collection_single.json' ) as f: @@ -1149,21 +1212,24 @@ def test__execute_firmware_update_unresponsive_bmc_node_override( system_collection_mock.get_members.return_value = resp_obj['Members'] get_sys_collec_mock.return_value = system_collection_mock + # Mock BMC version reading for setup + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + manager_mock.return_value = mock_manager + task_monitor_mock = mock.Mock() task_monitor_mock.task_monitor_uri = '/task/2' update_service_mock = mock.Mock() 
update_service_mock.simple_update.return_value = task_monitor_mock - firmware = redfish_fw.RedfishFirmware() settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}] firmware._execute_firmware_update(self.node, update_service_mock, settings) - update_service_mock.simple_update.assert_called_once_with( 'https://bmc/v1.2.3') - sleep_mock.assert_called_once_with( - self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') - ) + # Verify BMC monitoring setup was called (internally by _execute) + set_async_mock.assert_called_once_with( + self.node, reboot=False, polling=True) def test__validate_resources_stability_success(self): """Test successful BMC resource validation with consecutive success.""" @@ -1380,7 +1446,8 @@ def test__validate_resources_stability_badrequest_error(self): shared=True) as task: with mock.patch.object(redfish_utils, 'get_system', autospec=True) as system_mock, \ - mock.patch.object(time, 'time', autospec=True) as time_mock: + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): # Mock BadRequestError from sushy with proper arguments mock_response = mock.Mock() @@ -1396,8 +1463,8 @@ def test__validate_resources_stability_badrequest_error(self): firmware._validate_resources_stability, task.node) - @mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @@ -1405,21 +1472,43 @@ def test__validate_resources_stability_badrequest_error(self): def test_update_bmc_uses_configured_timeout(self, mock_get_update_service, mock_execute_fw_update, mock_set_async_flags, - mock_reboot_to_finish): - """Test BMC firmware update 
uses configured timeout.""" + mock_get_system, + mock_get_manager): + """Test BMC firmware update sets up version checking.""" settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0'}] + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) - - # Verify configured timeout is used for BMC update - mock_reboot_to_finish.assert_called_once_with( - task, timeout=CONF.redfish.firmware_update_bmc_timeout, - disable_ramdisk=True) + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + self.assertIn('bmc_fw_check_start_time', info) + self.assertIn('bmc_fw_version_before_update', info) + self.assertEqual(states.SERVICEWAIT, result) - @mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @@ -1427,106 +1516,506 @@ def test_update_bmc_uses_configured_timeout(self, mock_get_update_service, def test_update_bmc_uses_bmc_constant(self, mock_get_update_service, mock_execute_fw_update, 
mock_set_async_flags, - mock_reboot_to_finish): + mock_get_system, + mock_get_manager): """Test BMC firmware update detection works with BMC constant.""" settings = [{'component': redfish_utils.BMC, 'url': 'http://bmc/v1.0.0'}] + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) - - # Verify configured timeout is used - mock_reboot_to_finish.assert_called_once_with( - task, timeout=CONF.redfish.firmware_update_bmc_timeout, - disable_ramdisk=True) + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + self.assertIn('bmc_fw_check_start_time', info) + self.assertIn('bmc_fw_version_before_update', info) + self.assertEqual(states.SERVICEWAIT, result) - @mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_update_non_bmc_uses_wait_parameter(self, mock_get_update_service, mock_execute_fw_update, - mock_set_async_flags, - mock_reboot_to_finish): - """Test non-BMC firmware update uses wait parameter.""" + mock_set_async_flags): + 
"""Test non-BMC firmware update with wait parameter (obsolete).""" + # NOTE: This test is kept for historical reference but the wait + # parameter on BIOS updates is no longer used as BIOS reboots + # immediately when task starts rather than waiting settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0', 'wait': 120}] + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) - - # Verify wait parameter is used for non-BMC update - mock_reboot_to_finish.assert_called_once_with( - task, timeout=120, disable_ramdisk=True) + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True is set for BIOS + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) - @mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_update_non_bmc_no_wait_parameter(self, mock_get_update_service, mock_execute_fw_update, - mock_set_async_flags, - mock_reboot_to_finish): - """Test non-BMC firmware update without wait parameter uses None.""" + mock_set_async_flags): + """Test non-BMC firmware update without wait parameter.""" settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0'}] + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) - - 
# Verify None timeout is used for non-BMC without wait parameter - mock_reboot_to_finish.assert_called_once_with( - task, timeout=None, disable_ramdisk=True) + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True is set for BIOS + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) - @mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_update_mixed_components_with_bmc(self, mock_get_update_service, mock_execute_fw_update, - mock_set_async_flags, - mock_reboot_to_finish): - """Test mixed component update with BMC and explicit wait uses wait.""" + mock_set_async_flags): + """Test mixed component update with BIOS and BMC.""" settings = [ {'component': 'bios', 'url': 'http://bios/v1.0.0', 'wait': 120}, {'component': 'bmc', 'url': 'http://bmc/v1.0.0', 'wait': 60} ] + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) - - # Verify explicit wait parameter takes precedence over BMC timeout - mock_reboot_to_finish.assert_called_once_with( - task, timeout=120, - disable_ramdisk=True) + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # First component is BIOS, so reboot=True + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + self.assertEqual(states.SERVICEWAIT, result) - 
@mock.patch.object(time, 'sleep', lambda seconds: None) - @mock.patch.object(deploy_utils, 'reboot_to_finish_step', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_update_bmc_with_explicit_wait(self, mock_get_update_service, mock_execute_fw_update, - mock_set_async_flags, - mock_reboot_to_finish): - """Test BMC update with explicit wait uses wait, not BMC timeout.""" + mock_set_async_flags): + """Test BMC update with explicit wait.""" settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', 'wait': 90}] + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # BMC uses version checking, not immediate reboot + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify wait time is stored + info = task.node.driver_internal_info + fw_updates = info['redfish_fw_updates'] + self.assertEqual(90, fw_updates[0]['wait']) + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bmc_no_immediate_reboot(self, mock_get_update_service, + mock_execute_fw_update, + mock_get_system, + mock_get_manager, + mock_set_async_flags): + """Test BMC firmware update does not set immediate 
reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0'}] + + # Mock system + mock_system = mock.Mock() + mock_get_system.return_value = mock_system + + # Mock BMC version reading + mock_manager = mock.Mock() + mock_manager.firmware_version = '1.0.0' + mock_get_manager.return_value = mock_manager + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=False for BMC updates + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=False, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + # Verify BMC version check tracking is set up + info = task.node.driver_internal_info + self.assertIn('bmc_fw_check_start_time', info) + self.assertIn('bmc_fw_version_before_update', info) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_nic_no_immediate_reboot(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test NIC firmware update sets reboot flag, waits for task.""" + settings = [{'component': 'nic:BCM57414', 'url': 'http://nic/v1.0.0'}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True for NIC updates (reboot is conditional) + 
mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_execute_firmware_update', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_update_bios_sets_reboot_flag(self, mock_get_update_service, + mock_execute_fw_update, + mock_set_async_flags): + """Test BIOS firmware update sets reboot flag.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.0'}] + + # add task_monitor to the side effect + mock_execute_fw_update.side_effect = self._mock_exc_fwup_side_effect + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node as if in service step + task.node.service_step = {'step': 'update', + 'interface': 'firmware'} + result = task.driver.firmware.update(task, settings) + + # Verify reboot=True for BIOS updates + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + # Verify we return wait state to keep step active + self.assertEqual(states.SERVICEWAIT, result) + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + @mock.patch.object(timeutils, 'parse_isotime', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_get_current_bmc_version', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_bmc_version_check_timeout_sets_reboot_flag( + self, mock_get_update_service, mock_get_bmc_version, + mock_set_async_flags, mock_continue_updates, + mock_parse_isotime, mock_utcnow): + """Test BMC version check timeout sets reboot request flag.""" + import 
datetime + start_time = datetime.datetime(2025, 1, 1, 0, 0, 0, + tzinfo=datetime.timezone.utc) + current_time = start_time + datetime.timedelta(seconds=301) + mock_parse_isotime.return_value = start_time + mock_utcnow.return_value = current_time + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'wait': 300, 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BMC version checking in progress + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + task.node.set_driver_internal_info( + 'bmc_fw_check_start_time', '2025-01-01T00:00:00.000000') + + # Mock BMC is unresponsive + mock_get_bmc_version.return_value = None + + # Call the BMC update completion handler + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._handle_bmc_update_completion( + task, mock_get_update_service.return_value, + settings, settings[0]) + + # Verify reboot flag is set + info = task.node.driver_internal_info + self.assertTrue(info.get('firmware_reboot_requested')) + + # Verify async flags updated with reboot=True + mock_set_async_flags.assert_called_once_with( + task.node, + reboot=True, + polling=True + ) + + # Verify _continue_updates was called + mock_continue_updates.assert_called_once() + + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_nic_completion_sets_reboot_flag( + self, mock_get_task_monitor, mock_get_update_service, + mock_validate_resources, mock_set_async_flags, + mock_continue_updates): + """Test NIC firmware task completion sets reboot request flag.""" + settings = [{'component': 
'nic:BCM57414', + 'url': 'http://nic/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with NIC update in progress + # Set nic_needs_post_completion_reboot to simulate hardware + # that started update immediately but needs reboot after completion + settings[0]['nic_needs_post_completion_reboot'] = True + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + + # Mock task completion + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot flag is set + info = task.node.driver_internal_info + self.assertTrue(info.get('firmware_reboot_requested')) + + # Verify _continue_updates was called + mock_continue_updates.assert_called_once() + + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_clear_updates', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_final_update_with_reboot_flag_triggers_reboot( + self, mock_get_update_service, mock_clear_updates, + mock_power_action, mock_resume_clean, validate_mock): + """Test final firmware update with reboot flag triggers reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + 
shared=False) as task: + # Set up node as if in cleaning + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Set up final update with reboot requested + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + task.node.set_driver_internal_info( + 'firmware_reboot_requested', True) + + # Call _continue_updates with last firmware + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._continue_updates( + task, mock_get_update_service.return_value, settings) + + # Verify reboot was triggered + mock_power_action.assert_called_once_with(task, states.REBOOT) + + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() + + # Verify resume clean was called + mock_resume_clean.assert_called_once_with(task) + + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_clear_updates', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_final_update_without_reboot_flag_no_reboot( + self, mock_get_update_service, mock_clear_updates, + mock_power_action, mock_resume_clean, validate_mock): + """Test final firmware update without reboot flag skips reboot.""" + settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node as if in cleaning + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Set up final update WITHOUT reboot requested + task.node.set_driver_internal_info( + 'redfish_fw_updates', settings) + # Don't set firmware_reboot_requested + + # Call _continue_updates with last firmware + firmware_interface = redfish_fw.RedfishFirmware() 
+ firmware_interface._continue_updates( + task, mock_get_update_service.return_value, settings) + + # Verify reboot was NOT triggered + mock_power_action.assert_not_called() + + # Verify BMC validation was called before resuming conductor + validate_mock.assert_called_once() + + # Verify resume clean was still called + mock_resume_clean.assert_called_once_with(task) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_reboot_on_task_starting( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action): + """Test BIOS update triggers reboot when task reaches STARTING.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: - task.driver.firmware.update(task, settings) + # Set up node with BIOS update in progress + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor to return is_processing=True + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = True + mock_get_task_monitor.return_value = mock_task_monitor + + # Mock the task state as STARTING + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_STARTING + mock_task_monitor.get_task.return_value = mock_task + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot was triggered + mock_power_action.assert_called_once_with(task, states.REBOOT, 0) + + # Verify the flag was set to prevent repeated reboots + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertTrue(updated_settings[0].get('bios_reboot_triggered')) + + 
@mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_no_repeated_reboot_after_flag_set( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action): + """Test BIOS update doesn't reboot again after flag is set.""" + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1', + 'bios_reboot_triggered': True}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress and flag already set + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor to return is_processing=True + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = True + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) - # Verify explicit wait parameter takes precedence over BMC timeout - mock_reboot_to_finish.assert_called_once_with( - task, timeout=90, disable_ramdisk=True) + # Verify reboot was NOT triggered again + mock_power_action.assert_not_called() diff --git a/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml b/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml new file mode 100644 index 0000000000..140e31bb91 --- /dev/null +++ b/releasenotes/notes/conditional-reboot-fwup-6ca14573a2bebba0.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Each firmware component might have different reboot requirements depending + on the hardware, this change adds two new configuration options to help + operators configure the reboot behavior for firmware updates. 
+ ``[redfish]firmware_update_reboot_delay``: reboot delay for firmware + updates of components. + ``[redfish]firmware_update_bmc_version_check_interval``: delay in seconds + after the firmware update fails before the node is rebooted. From 2534d873d449fd5afb59d4e92f08c76cb5e3bbe9 Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Mon, 15 Dec 2025 00:31:37 -0300 Subject: [PATCH 07/15] Fix firmware update failure when task remains in RUNNING state Some BMCs (particularly HPE iLO) may return is_processing=False while the firmware update task is still in RUNNING, STARTING, or PENDING state. The previous code incorrectly treated this as task completion and entered the completion handler, which only recognizes COMPLETED as success. This resulted in firmware updates being marked as failed with blank error messages when the BMC had no error messages to report for an ongoing task. Assisted-By: Claude Code Sonnet 4.5 Closes-Bug: #2136089 Signed-off-by: Iury Gregory Melo Ferreira Change-Id: I8b61fea63b8af0cf4c3245758538eeb36a7a5b04 (cherry picked from commit 326c4e9be9f7cd10ff4619348edddcb2d1aa0824) (cherry picked from commit f6231ae9cc08684fd6e5257f2cabb790bf74b3bd) --- ironic/drivers/modules/redfish/firmware.py | 12 +++++++ .../drivers/modules/redfish/test_firmware.py | 34 +++++++++++++++++++ ...task-remains-running-bbe30ab1059baf32.yaml | 7 ++++ 3 files changed, 53 insertions(+) create mode 100644 releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index afc97c6f08..da611fd683 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -1253,6 +1253,18 @@ def _check_node_redfish_firmware_update(self, task): # so get it sushy_task = task_monitor.get_task() + # NOTE(iurygregory): Some BMCs (particularly HPE iLO) may return + # is_processing=False while the task is still in RUNNING, 
STARTING, + # or PENDING state. Only treat it as completion if the task state + # indicates it's actually finished. + if sushy_task.task_state in [sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_PENDING]: + LOG.debug('Firmware update task for node %(node)s is in ' + '%(state)s state. Continuing to poll.', + {'node': node.uuid, 'state': sushy_task.task_state}) + return + # Only parse the messages if the BMC did not return parsed # messages messages = [] diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index f3179eccd9..e9a634ea99 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -813,6 +813,40 @@ def test__check_update_in_progress(self, tm_mock, get_us_mock, log_mock): interface._continue_updates.assert_not_called() + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test__check_update_task_state(self, tm_mock, get_us_mock, log_mock): + """Test task with is_processing=False but still in active state. + + Some BMCs (particularly HPE iLO) may return is_processing=False + while the task is still in RUNNING, STARTING, or PENDING state. + The update should continue polling and not be treated as complete. 
+ """ + self._generate_new_driver_internal_info(['bmc']) + + # Test each of the three active states + for task_state in [sushy.TASK_STATE_RUNNING, + sushy.TASK_STATE_STARTING, + sushy.TASK_STATE_PENDING]: + log_mock.reset_mock() + + tm_mock.return_value.is_processing = False + mock_task = tm_mock.return_value.get_task.return_value + mock_task.task_state = task_state + mock_task.task_status = sushy.HEALTH_OK + + _, interface = self._test__check_node_redfish_firmware_update() + + # Verify the new debug log message + debug_calls = [ + mock.call('Firmware update task for node %(node)s is in ' + '%(state)s state. Continuing to poll.', + {'node': self.node.uuid, 'state': task_state})] + + log_mock.debug.assert_has_calls(debug_calls) + interface._continue_updates.assert_not_called() + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) diff --git a/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml b/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml new file mode 100644 index 0000000000..5c0114810e --- /dev/null +++ b/releasenotes/notes/fix-firmware-update-when-task-remains-running-bbe30ab1059baf32.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Some BMCs maybe return they are not processing the firmware update when + the task is still in RUNNING, STARTING, or PENDING state. Previously, + Ironic will treat it as complete and stop polling for the update. + Now, Ironic will continue polling for the update until it is complete. 
From 32c843b501240fc5b5ca66b4a65b3ee1c4b2ecdc Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Mon, 15 Dec 2025 00:25:18 -0300 Subject: [PATCH 08/15] Fix BIOS firmware not flashing when task completes before first poll Some BMCs (particularly HPE iLO) complete BIOS firmware update tasks very quickly (within 20-30 seconds) by staging the firmware, but the firmware is only applied on the next reboot. If the task completes before Ironic's first poll (which happens ~60 seconds after task creation), the _handle_bios_task_starting() method never runs and no reboot is triggered. This results in: - BIOS firmware staged but not applied - Code incorrectly logs "System was already rebooted" - BIOS version remains unchanged Observed behavior: - HPE iLO: Task created at 20:56:23, completed at 20:56:46 (23s) - First poll at 20:57:24 found task already completed - No reboot was triggered, BIOS firmware remained at old version Assisted-By: Claude Code Sonnet 4.5 Closes-Bug: #2136088 Signed-off-by: Iury Gregory Melo Ferreira Change-Id: Idb2cc5a6a1f9415f1ad5b5e36616abe44cb51861 (cherry picked from commit 2306d90f8c6f597a0b4563219e34a8c74951b285) (cherry picked from commit ba35f1a6c9bd58b8923d323f7d1aaeb8a05bb3b8) --- ironic/drivers/modules/redfish/firmware.py | 37 +++-- .../drivers/modules/redfish/test_firmware.py | 126 ++++++++++++++++++ .../fix-bios-fw-flash-190220f3beccf2f3.yaml | 9 ++ 3 files changed, 163 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index da611fd683..0b0bf477b7 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -870,15 +870,34 @@ def _handle_task_completion(self, task, sushy_task, messages, self._handle_nic_update_completion( task, update_service, settings, current_update) elif component_type == redfish_utils.BIOS: - # BIOS: 
Reboot was already triggered when task started, - # just continue with next update - LOG.info('BIOS firmware update task completed for node ' - '%(node)s. System was already rebooted. ' - 'Proceeding with continuation.', - {'node': node.uuid}) - # Clean up the reboot trigger flag - current_update.pop('bios_reboot_triggered', None) - self._continue_updates(task, update_service, settings) + # BIOS: Check if reboot was actually triggered + # Some BMCs (e.g., HPE iLO) complete the BIOS firmware task + # very quickly (staging the firmware) before Ironic can poll + # and trigger the reboot. In this case, we need to trigger + # the reboot now to actually apply the firmware. + if not current_update.get('bios_reboot_triggered'): + LOG.info('BIOS firmware update task completed for node ' + '%(node)s but reboot was not triggered yet. ' + 'Triggering reboot now to apply staged firmware.', + {'node': node.uuid}) + current_update['bios_reboot_triggered'] = True + node.set_driver_internal_info('redfish_fw_updates', + settings) + node.save() + power_timeout = current_update.get('power_timeout', 0) + manager_utils.node_power_action(task, states.REBOOT, + power_timeout) + return + else: + # Reboot was already triggered when task started, + # just continue with next update + LOG.info('BIOS firmware update task completed for node ' + '%(node)s. System was already rebooted. 
' + 'Proceeding with continuation.', + {'node': node.uuid}) + # Clean up the reboot trigger flag + current_update.pop('bios_reboot_triggered', None) + self._continue_updates(task, update_service, settings) else: # Default: continue as before self._continue_updates(task, update_service, settings) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index e9a634ea99..4f5ac7373f 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -2053,3 +2053,129 @@ def test_bios_no_repeated_reboot_after_flag_set( # Verify reboot was NOT triggered again mock_power_action.assert_not_called() + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_reboot_on_completion_without_prior_reboot( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action, mock_continue_updates, mock_log): + """Test BIOS task completion triggers reboot when not triggered before. + + This test verifies the alternate path where a BIOS firmware update + task completes very quickly (e.g., HPE iLO staging firmware) before + Ironic can trigger a reboot during the STARTING state. In this case, + when the task reaches COMPLETED state and bios_reboot_triggered is + not set, we should: + 1. Trigger a reboot to apply the staged firmware + 2. NOT call _continue_updates (return early) + 3. 
Set the bios_reboot_triggered flag + """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + # Note: bios_reboot_triggered is NOT set + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor showing task is completed + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot WAS triggered + mock_power_action.assert_called_once_with(task, states.REBOOT, 0) + + # Verify _continue_updates was NOT called (early return) + mock_continue_updates.assert_not_called() + + # Verify the flag was set to prevent repeated reboots + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertTrue(updated_settings[0].get('bios_reboot_triggered')) + + # Verify LOG.info was called with the correct message + mock_log.info.assert_any_call( + 'BIOS firmware update task completed for node ' + '%(node)s but reboot was not triggered yet. 
' + 'Triggering reboot now to apply staged firmware.', + {'node': task.node.uuid}) + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_fw.RedfishFirmware, '_continue_updates', + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_continue_after_completion_with_prior_reboot( + self, mock_get_task_monitor, mock_get_update_service, + mock_power_action, mock_continue_updates, mock_log): + """Test BIOS task completion continues when reboot already triggered. + + This test verifies the else path where a BIOS firmware update task + completes and the reboot was already triggered (during STARTING state). + In this case, when the task reaches COMPLETED state and + bios_reboot_triggered is already set, we should: + 1. NOT trigger another reboot + 2. Call _continue_updates to proceed with next firmware + 3. 
Clean up the bios_reboot_triggered flag + """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1', + 'bios_reboot_triggered': True}] # Flag already set + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + # Note: bios_reboot_triggered IS set (reboot already happened) + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + + # Mock task monitor showing task is completed + mock_task_monitor = mock.Mock() + mock_task_monitor.is_processing = False + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_COMPLETED + mock_task.task_status = sushy.HEALTH_OK + mock_task.messages = [] + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Call the check method + firmware_interface = redfish_fw.RedfishFirmware() + firmware_interface._check_node_redfish_firmware_update(task) + + # Verify reboot was NOT triggered (already happened) + mock_power_action.assert_not_called() + + # Verify _continue_updates WAS called + mock_continue_updates.assert_called_once_with( + firmware_interface, task, mock_get_update_service.return_value, + settings) + + # Verify the flag was cleaned up (popped from settings) + updated_settings = task.node.driver_internal_info[ + 'redfish_fw_updates'] + self.assertIsNone(updated_settings[0].get('bios_reboot_triggered')) + + # Verify LOG.info was called with the correct message + mock_log.info.assert_any_call( + 'BIOS firmware update task completed for node ' + '%(node)s. System was already rebooted. 
' + 'Proceeding with continuation.', + {'node': task.node.uuid}) diff --git a/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml b/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml new file mode 100644 index 0000000000..a108500274 --- /dev/null +++ b/releasenotes/notes/fix-bios-fw-flash-190220f3beccf2f3.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue with BIOS firmware updates on certain BMCs where the + firmware task completes very quickly after staging the firmware, + before Ironic can trigger a reboot. Previously, this would result in the + staged BIOS firmware not being applied because no reboot was triggered. + Now, Ironic detects this condition and triggers the reboot when the task + completes, ensuring the staged firmware is properly applied. From 728203285c0394df9dcce6b14e25dfda8c63978c Mon Sep 17 00:00:00 2001 From: Jay Faulkner Date: Tue, 6 Jan 2026 10:31:13 -0800 Subject: [PATCH 09/15] Silence loud logging when no NetworkAdapters This reduces logging when NetworkAdapters are missing from a redfish bmc from warning level to debug level. This resolves an issue where loud logging was reporting on hardware without redfish NetworkAdapters support. 
Generated-by: Claude-code 2.0 Closes-bug: #2133727 Signed-off-by: Jay Faulkner Change-Id: If48757c6ec4a1f7978bd973830020161c55922e4 (cherry picked from commit 18bedb69fd28b761f3a66c4234128704fc6e97b1) (cherry picked from commit 4cbcb93b237b3f0179f8d00c7f46004d1122e575) --- ironic/drivers/modules/redfish/firmware.py | 33 ++++- .../drivers/modules/redfish/test_firmware.py | 131 +++++++++++++++++- ...ork-adapters-logging-46b8908b319f0bdd.yaml | 8 ++ 3 files changed, 165 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 0b0bf477b7..b1f29bc0f5 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -173,12 +173,26 @@ def retrieve_nic_components(self, task, system): try: chassis = redfish_utils.get_chassis(task.node, system) except exception.RedfishError: - LOG.warning('No chassis available to retrieve NetworkAdapters ' - 'firmware information on node %(node_uuid)s', - {'node_uuid': task.node.uuid}) + LOG.debug('No chassis available to retrieve NetworkAdapters ' + 'firmware information on node %(node_uuid)s', + {'node_uuid': task.node.uuid}) + return nic_list + + try: + network_adapters = chassis.network_adapters + if network_adapters is None: + LOG.debug('NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': task.node.uuid}) + return nic_list + adapters = network_adapters.get_members() + except sushy.exceptions.MissingAttributeError: + LOG.debug('NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': task.node.uuid}) return nic_list - for net_adp in chassis.network_adapters.get_members(): + for net_adp in adapters: for net_adp_ctrl in net_adp.controllers: fw_pkg_v = net_adp_ctrl.firmware_package_version if not fw_pkg_v: @@ -643,9 +657,16 @@ def _validate_resources_stability(self, 
node): # Test Manager resource redfish_utils.get_manager(node, system) - # Test NetworkAdapters resource + # Test Chassis and NetworkAdapters resource (if available) + # Some systems may not have NetworkAdapters, which is valid chassis = redfish_utils.get_chassis(node, system) - chassis.network_adapters.get_members() + try: + network_adapters = chassis.network_adapters + if network_adapters is not None: + network_adapters.get_members() + except sushy.exceptions.MissingAttributeError: + # NetworkAdapters not available is acceptable + pass # All resources successful consecutive_successes += 1 diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 4f5ac7373f..5c4d49850d 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -297,7 +297,7 @@ def test_get_chassis_redfish_error(self, sync_fw_cmp_mock, system_mock, shared=True) as task: task.driver.firmware.cache_firmware_components(task) - log_mock.warning.assert_any_call( + log_mock.debug.assert_any_call( 'No chassis available to retrieve NetworkAdapters firmware ' 'information on node %(node_uuid)s', {'node_uuid': self.node.uuid} @@ -446,6 +446,70 @@ def test_retrieve_nic_components_invalid_firmware_version( fw_cmp_mock.assert_has_calls(fw_cmp_calls) log_mock.warning.assert_not_called() + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_network_adapters_none( + self, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that None network_adapters is handled gracefully.""" + fw_cmp_list.sync_firmware_components.return_value = ( + 
[{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.firmware_version = '1.0.0' + # network_adapters is None + chassis_mock.return_value.network_adapters = None + + task.driver.firmware.cache_firmware_components(task) + + # Should log at debug level, not warning + log_mock.debug.assert_any_call( + 'NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': self.node.uuid} + ) + log_mock.warning.assert_not_called() + + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) + @mock.patch.object(redfish_utils, 'get_chassis', autospec=True) + @mock.patch.object(objects, 'FirmwareComponentList', autospec=True) + def test_retrieve_nic_components_missing_attribute_error( + self, fw_cmp_list, chassis_mock, manager_mock, + system_mock, log_mock): + """Test that MissingAttributeError is handled gracefully.""" + fw_cmp_list.sync_firmware_components.return_value = ( + [{'component': 'bios', 'current_version': '1.0.0'}, + {'component': 'bmc', 'current_version': '1.0.0'}], + [], []) + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + system_mock.return_value.bios_version = '1.0.0' + manager_mock.return_value.firmware_version = '1.0.0' + # network_adapters raises MissingAttributeError + type(chassis_mock.return_value).network_adapters = ( + mock.PropertyMock( + side_effect=sushy.exceptions.MissingAttributeError)) + + task.driver.firmware.cache_firmware_components(task) + + # Should log at debug level, not warning + log_mock.debug.assert_any_call( + 'NetworkAdapters not available on chassis for ' + 'node %(node_uuid)s', + {'node_uuid': self.node.uuid} + ) + 
log_mock.warning.assert_not_called() + @mock.patch.object(redfish_utils, 'LOG', autospec=True) @mock.patch.object(redfish_utils, '_get_connection', autospec=True) def test_missing_updateservice(self, conn_mock, log_mock): @@ -1472,6 +1536,71 @@ def test__validate_resources_stability_custom_config(self): expected_calls = [mock.call(5)] * 4 # 4 sleeps between 5 sleep_mock.assert_has_calls(expected_calls) + def test__validate_resources_stability_network_adapters_none(self): + """Test validation succeeds when network_adapters is None.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock successful resource responses but network_adapters None + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + chassis_mock.return_value.network_adapters = None + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] + + # Should complete successfully (None network_adapters is OK) + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + def test__validate_resources_stability_network_adapters_missing_attr(self): + """Test validation succeeds when network_adapters is missing.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as 
system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + # network_adapters raises MissingAttributeError + type(chassis_mock.return_value).network_adapters = ( + mock.PropertyMock( + side_effect=sushy.exceptions.MissingAttributeError)) + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] + + # Should complete successfully (missing network_adapters is OK) + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + def test__validate_resources_stability_badrequest_error(self): """Test BMC resource validation handles BadRequestError correctly.""" firmware = redfish_fw.RedfishFirmware() diff --git a/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml b/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml new file mode 100644 index 0000000000..22a888904a --- /dev/null +++ b/releasenotes/notes/noisy-network-adapters-logging-46b8908b319f0bdd.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes excessive logging when Redfish-managed servers do not have + NetworkAdapters endpoints. Previously, servers without NetworkAdapters + would generate warning-level log messages during firmware component + caching. Now these messages are logged at debug level since having no + NetworkAdapters is a valid hardware configuration. 
From f0e8b9e897834a6425c85a0ee183958e1b50383c Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Sun, 14 Dec 2025 23:56:50 -0300 Subject: [PATCH 10/15] Fix BMC resource validation to catch all sushy exceptions The _validate_resources_stability() function was only catching sushy.exceptions.BadRequestError, but Dell iDRAC can return sushy.exceptions.ServerSideError (HTTP 500) and sushy.exceptions.ConnectionError when BMC resources are temporarily unavailable during firmware updates. When these uncaught exceptions occurred, they would crash the periodic task and prevent notify_conductor_resume_service() from being called, leaving the node stuck in servicing state even though the firmware update had completed successfully. Closes-Bug: #2136087 Change-Id: I3b5806cc4fb055d4264bb6ae9008f57d8c1e0cc1 Signed-off-by: Iury Gregory Melo Ferreira (cherry picked from commit dce19076d507200ee4ae47b17e4472fe4a892978) (cherry picked from commit 11b97221a2c061bad26c88363f888b91eb8e8dbb) --- ironic/drivers/modules/redfish/firmware.py | 6 +++- .../drivers/modules/redfish/test_firmware.py | 29 ++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index b1f29bc0f5..2e78ec77c6 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -684,7 +684,11 @@ def _validate_resources_stability(self, node): except (exception.RedfishError, exception.RedfishConnectionError, - sushy.exceptions.BadRequestError) as e: + sushy.exceptions.SushyError) as e: + LOG.debug('BMC resource validation failed for node %(node)s: ' + '%(error)s. 
This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': node.uuid, 'error': e}) # Resource not available yet, reset counter if consecutive_successes > 0: LOG.debug('Resource validation interrupted for node ' diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 5c4d49850d..aa091a8ca0 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -1392,7 +1392,9 @@ def test__validate_resources_stability_timeout(self): firmware._validate_resources_stability, task.node) - def test__validate_resources_stability_intermittent_failures(self): + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + def test__validate_resources_stability_intermittent_failures( + self, mock_log): """Test BMC resource validation with intermittent failures.""" cfg.CONF.set_override('firmware_update_required_successes', 3, 'redfish') @@ -1441,6 +1443,14 @@ def system_side_effect(*args): self.assertEqual(manager_mock.call_count, 5) self.assertEqual(chassis_mock.call_count, 5) + # Verify verbose logging about BMC recovery was called + expected_log_call = mock.call( + 'BMC resource validation failed for node %(node)s: ' + '%(error)s. 
This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': task.node.uuid, 'error': mock.ANY}) + mock_log.debug.assert_has_calls([expected_log_call]) + def test__validate_resources_stability_manager_failure(self): """Test BMC resource validation when Manager resource fails.""" firmware = redfish_fw.RedfishFirmware() @@ -1601,7 +1611,8 @@ def test__validate_resources_stability_network_adapters_missing_attr(self): self.assertEqual(manager_mock.call_count, 3) self.assertEqual(chassis_mock.call_count, 3) - def test__validate_resources_stability_badrequest_error(self): + @mock.patch.object(redfish_fw, 'LOG', autospec=True) + def test__validate_resources_stability_badrequest_error(self, mock_log): """Test BMC resource validation handles BadRequestError correctly.""" firmware = redfish_fw.RedfishFirmware() @@ -1618,14 +1629,24 @@ def test__validate_resources_stability_badrequest_error(self): system_mock.side_effect = sushy.exceptions.BadRequestError( 'http://test', mock_response, mock_response) - # Mock time progression to exceed timeout - time_mock.side_effect = [0, 350] + # Mock time progression: start at 0, try once at 10, timeout + # at 350, this allows at least one loop iteration to trigger + # the exception + time_mock.side_effect = [0, 10, 350] # Should raise RedfishError due to timeout self.assertRaises(exception.RedfishError, firmware._validate_resources_stability, task.node) + # Verify verbose logging about BMC recovery was called + expected_log_call = mock.call( + 'BMC resource validation failed for node %(node)s: ' + '%(error)s. 
This may indicate the BMC is still ' + 'restarting or recovering from firmware update.', + {'node': task.node.uuid, 'error': mock.ANY}) + mock_log.debug.assert_has_calls([expected_log_call]) + @mock.patch.object(redfish_utils, 'get_manager', autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) From 00e22166356e8efd24f409ac37ff7a10362d29f1 Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Tue, 16 Dec 2025 00:40:25 -0300 Subject: [PATCH 11/15] Fix heartbeat for steps with requires_ramdisk=False Service and clean steps that have requires_ramdisk=False operate independently of the ramdisk agent (e.g., Redfish firmware updates). These steps should not be subject to heartbeat timeouts since they do not require the agent to be running. Previously, the _check_servicewait_timeouts and _check_cleanwait_timeouts periodic tasks would timeout and fail these steps if no agent heartbeat was received for service_callback_timeout (default 30 minutes), even though the step was successfully executing via out-of-band mechanisms. This caused firmware updates taking longer than 30 minutes to fail with: "Timeout reached while servicing the node. Please check if the ramdisk responsible for the servicing is running on the node." The error message was particularly misleading because the step explicitly declared requires_ramdisk=False, meaning no ramdisk was expected. 
Closes-Bug: #2136276 Signed-off-by: Iury Gregory Melo Ferreira Signed-off-by: Jacob Anders Assisted-by: Claude (Anthropic) version 4.5 Change-Id: I1cade32e1dce57441e83cbc9f0b07d9ee5e0ec01 (cherry picked from commit 54ca5a3b4c5d26be95762aabda2e1572ba2ad120) (cherry picked from commit c05c035f52fc0c22c459ef9530dc75ba138a0fbb) (cherry picked from commit 6d3bec365791af863d64015823e2da975df04a18) --- ironic/conf/redfish.py | 8 ++ ironic/drivers/modules/redfish/firmware.py | 51 +++++++- .../drivers/modules/redfish/test_firmware.py | 115 ++++++++++++++++++ ...out-no-ramdisk-steps-732f3da7faa28f94.yaml | 17 +++ 4 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 8492aa3775..b55a90f816 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -145,6 +145,14 @@ 'start applying firmware, while others can begin ' 'immediately. This timeout helps determine which ' 'behavior the hardware exhibits.')), + cfg.IntOpt('firmware_update_overall_timeout', + min=0, + default=7200, + help=_('Maximum time (in seconds) allowed for the entire ' + 'firmware update operation to complete. This provides ' + 'a safety net for firmware updates that get stuck. ' + 'Set to 0 to disable this timeout (not recommended). 
' + 'Default is 7200 seconds (2 hours).')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 2e78ec77c6..7be2aab74f 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -235,8 +235,11 @@ def update(self, task, settings): {'node_uuid': node.uuid, 'settings': settings}) self._execute_firmware_update(node, update_service, settings) - # Store updated settings and save + # Store updated settings and start time for overall timeout tracking node.set_driver_internal_info('redfish_fw_updates', settings) + node.set_driver_internal_info( + 'redfish_fw_update_start_time', + timeutils.utcnow().isoformat()) node.save() # Return wait state to keep the step active and let polling handle @@ -820,6 +823,7 @@ def _clear_updates(self, node): """ firmware_utils.cleanup(node) node.del_driver_internal_info('redfish_fw_updates') + node.del_driver_internal_info('redfish_fw_update_start_time') node.del_driver_internal_info('firmware_cleanup') node.del_driver_internal_info('firmware_reboot_requested') node.save() @@ -1202,12 +1206,50 @@ def _handle_wait_completion(self, task, update_service, settings, # Continue with updates self._continue_updates(task, update_service, settings) + def _check_overall_timeout(self, task): + """Check if firmware update has exceeded overall timeout. 
+ + :param task: A TaskManager instance + :returns: True if timeout exceeded and error was handled, + False otherwise + """ + node = task.node + overall_timeout = CONF.redfish.firmware_update_overall_timeout + if overall_timeout <= 0: + return False + + start_time_str = node.driver_internal_info.get( + 'redfish_fw_update_start_time') + if not start_time_str: + return False + + start_time = timeutils.parse_isotime(start_time_str) + elapsed = timeutils.utcnow(True) - start_time + if elapsed.total_seconds() < overall_timeout: + return False + + msg = (_('Firmware update on node %(node)s has exceeded ' + 'the overall timeout of %(timeout)s seconds. ' + 'Elapsed time: %(elapsed)s seconds.') + % {'node': node.uuid, + 'timeout': overall_timeout, + 'elapsed': int(elapsed.total_seconds())}) + LOG.error(msg) + task.upgrade_lock() + self._clear_updates(node) + manager_utils.servicing_error_handler(task, msg, traceback=False) + return True + @METRICS.timer('RedfishFirmware._check_node_redfish_firmware_update') def _check_node_redfish_firmware_update(self, task): """Check the progress of running firmware update on a node.""" node = task.node + # Check overall timeout for firmware update operation + if self._check_overall_timeout(task): + return + settings = node.driver_internal_info['redfish_fw_updates'] current_update = settings[0] @@ -1222,6 +1264,13 @@ def _check_node_redfish_firmware_update(self, task): {'node': node.uuid, 'error': e}) return + # Touch provisioning to indicate progress is being monitored. + # This prevents heartbeat timeout from triggering for steps that + # don't require the ramdisk agent (requires_ramdisk=False). + # Note: Only touch after successful BMC communication to ensure + # the process eventually times out if the BMC is unresponsive. 
+ node.touch_provisioning() + wait_start_time = current_update.get('wait_start_time') if wait_start_time: wait_start = timeutils.parse_isotime(wait_start_time) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index aa091a8ca0..c89f116a79 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -778,6 +778,121 @@ def _test__check_node_redfish_firmware_update(self): firmware._check_node_redfish_firmware_update(task) return task, firmware + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_check_calls_touch_provisioning(self, mock_task_monitor, + mock_get_update_service): + """Test _check_node_redfish_firmware_update calls touch_provisioning. + + This prevents heartbeat timeouts for firmware updates that don't + require the ramdisk agent (requires_ramdisk=False). By calling + touch_provisioning on each poll, we keep provision_updated_at fresh. + """ + self._generate_new_driver_internal_info(['bmc']) + + # Mock task still in progress + mock_task_monitor.return_value.is_processing = True + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was called + mock_touch.assert_called_once_with() + + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_check_skips_touch_provisioning_on_conn_error( + self, mock_get_update_service): + """Test touch_provisioning is NOT called when BMC connection fails. + + When the BMC is unresponsive, we should NOT update + provision_updated_at. 
This ensures the process eventually times + out if the BMC never recovers, rather than being kept alive. + """ + self._generate_new_driver_internal_info(['bmc']) + + # Mock connection error + mock_get_update_service.side_effect = exception.RedfishConnectionError( + 'Connection failed') + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was NOT called on connection error + mock_touch.assert_not_called() + + @mock.patch.object(redfish_fw.manager_utils, 'servicing_error_handler', + autospec=True) + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + def test_check_overall_timeout_exceeded(self, mock_get_update_service, + mock_error_handler): + """Test firmware update fails when overall timeout is exceeded. + + This ensures firmware updates don't run indefinitely - if the + overall timeout is exceeded, the update should fail with an error. 
+ """ + self._generate_new_driver_internal_info(['bmc']) + + # Set start time to 3 hours ago (exceeds 2 hour default timeout) + past_time = (timeutils.utcnow() + - datetime.timedelta(hours=3)).isoformat() + self.node.set_driver_internal_info('redfish_fw_update_start_time', + past_time) + self.node.save() + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + firmware._check_node_redfish_firmware_update(task) + + # Verify error handler was called with timeout message + mock_error_handler.assert_called_once() + call_args = mock_error_handler.call_args + self.assertIn('exceeded', call_args[0][1].lower()) + self.assertIn('timeout', call_args[0][1].lower()) + + # Verify the firmware update info was cleaned up + task.node.refresh() + self.assertIsNone( + task.node.driver_internal_info.get('redfish_fw_updates')) + self.assertIsNone( + task.node.driver_internal_info.get( + 'redfish_fw_update_start_time')) + + @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_check_overall_timeout_not_exceeded(self, mock_task_monitor, + mock_get_update_service): + """Test firmware update continues when timeout not exceeded.""" + self._generate_new_driver_internal_info(['bmc']) + + # Set start time to 1 hour ago (within 2 hour default timeout) + past_time = (timeutils.utcnow() + - datetime.timedelta(hours=1)).isoformat() + self.node.set_driver_internal_info('redfish_fw_update_start_time', + past_time) + self.node.save() + + # Mock task still in progress + mock_task_monitor.return_value.is_processing = True + + firmware = redfish_fw.RedfishFirmware() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + with mock.patch.object(task.node, 'touch_provisioning', + autospec=True) as mock_touch: + firmware._check_node_redfish_firmware_update(task) + + # Verify touch_provisioning was called 
(update continues) + mock_touch.assert_called_once_with() + @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) def test_check_conn_error(self, get_us_mock, log_mock): diff --git a/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml b/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml new file mode 100644 index 0000000000..379247d195 --- /dev/null +++ b/releasenotes/notes/fix-heartbeat-timeout-no-ramdisk-steps-732f3da7faa28f94.yaml @@ -0,0 +1,17 @@ +--- +fixes: + - | + Fixes an issue where service and clean steps with ``requires_ramdisk=False`` + (such as Redfish firmware updates) would incorrectly fail due to heartbeat + timeout. These steps operate independently of the ramdisk agent and should + not be subject to heartbeat timeouts. Previously, such steps would fail + after ``service_callback_timeout`` or ``clean_callback_timeout`` (default + 30 minutes) with a misleading error about the ramdisk not running. + The fix updates ``provision_updated_at`` during firmware update polling, + preventing the heartbeat timeout mechanism from triggering while the + out-of-band operation is still in progress. For more information, see + `bug 2136276 `_. +features: + - | + Adds ``[redfish]firmware_update_overall_timeout`` (default 2 hours) to + limit total firmware update duration as a safety net for stuck updates. From df680d93a2d1e7dcbb89e8b046a0ae34aa8e4c61 Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Mon, 15 Dec 2025 23:15:48 -0300 Subject: [PATCH 12/15] Add retry logic for boot device changes during POST After firmware updates complete and the system reboots, Ironic attempts to tear down from service by resetting the boot device. However, HPE iLO BMCs reject boot device modifications while the system is in POST (Power-On Self-Test), returning an UnableToModifyDuringSystemPOST error. 
This change adds retry functionality to assist the operators dealing with such scenarios. Closes-Bug: #2136275 Signed-off-by: Iury Gregory Melo Ferreira Signed-off-by: Jacob Anders Assisted-By: Claude Opus 4.5 Change-Id: I714030f433d6730a99f9f68cf60ce330e9d43c76 (cherry picked from commit 2d253a04f563bb93ad8a08b9bd4be1ac8aa2857e) (cherry picked from commit 9a4788bdea7f419484b8fdc07fff6783033b0ac2) (cherry picked from commit 6809dd2c086ec6fa46c967acfea229b9c1c2b515) --- ironic/conf/redfish.py | 15 ++++ ironic/drivers/modules/redfish/management.py | 56 +++++++++++-- .../modules/redfish/test_management.py | 84 +++++++++++++++++++ ...fish-post-boot-retry-8da100f84e6fcacf.yaml | 8 ++ 4 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index b55a90f816..63355864de 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -213,6 +213,21 @@ '$default_inspection_hooks. Hooks can be added before ' 'or after the defaults like this: ' '"prehook,$default_hooks,posthook".')), + cfg.IntOpt('post_boot_retry_attempts', + min=1, + default=6, + help=_('Maximum number of retry attempts when BMC rejects ' + 'boot device changes during POST (Power-On Self-Test). ' + 'Some BMCs (e.g. HPE iLO) reject boot device ' + 'modifications while the system is in POST after a ' + 'firmware update or reboot.')), + cfg.IntOpt('post_boot_retry_delay', + min=1, + default=5, + help=_('Minimum delay in seconds between retry attempts ' + 'for POST-related boot device errors. 
Exponential ' + 'backoff is applied, starting from this value up to ' + '6x this value.')), ] diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index ad937f8d60..051abd03e2 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -22,6 +22,7 @@ from oslo_log import log from oslo_utils import timeutils import sushy +import tenacity from ironic.common import boot_devices from ironic.common import boot_modes @@ -116,6 +117,24 @@ }} +def _is_during_post_error(exc): + """Check if a Sushy exception is an 'UnableToModifyDuringSystemPOST'. + + HPE iLO BMCs reject boot device changes while the system is in POST + (Power-On Self-Test), typically after a firmware update or reboot. + + :param exc: An exception instance + :returns: True if this is a POST-related error, False otherwise + """ + if not isinstance(exc, sushy.exceptions.BadRequestError): + return False + is_post_error = 'UnableToModifyDuringSystemPOST' in str(exc) + if is_post_error: + LOG.debug('Detected UnableToModifyDuringSystemPOST error from BMC, ' + 'will trigger retry logic. Error: %s', exc) + return is_post_error + + def _set_boot_device(task, system, device, persistent=False, http_boot_url=None): """An internal routine to set the boot device. @@ -161,14 +180,34 @@ def _set_boot_device(task, system, device, persistent=False, # NOTE(etingof): this can be racy, esp if BMC is not RESTful enabled = (desired_enabled if desired_enabled != current_enabled else None) - try: + + # Logging callback for retry attempts (closure captures task) + def _log_post_boot_retry(retry_state): + LOG.warning('BMC is in POST, unable to modify boot device for ' + 'node %(node)s. 
Retrying in %(delay).1f seconds ' + '(attempt %(attempt)d/%(total)d)', + {'node': task.node.uuid, + 'delay': retry_state.next_action.sleep, + 'attempt': retry_state.attempt_number, + 'total': CONF.redfish.post_boot_retry_attempts}) + + @tenacity.retry( + retry=tenacity.retry_if_exception(_is_during_post_error), + stop=tenacity.stop_after_attempt(CONF.redfish.post_boot_retry_attempts), + wait=tenacity.wait_exponential( + multiplier=1, + min=CONF.redfish.post_boot_retry_delay, + max=CONF.redfish.post_boot_retry_delay * 6), + before_sleep=_log_post_boot_retry, + reraise=True) + def _do_set_boot_options(): # NOTE(TheJulia): In sushy, it is uri, due to the convention used # in the standard. URL is used internally in ironic. if requires_full_boot_request: # Some vendors require sending all boot parameters every time desired_mode = system.boot.get('mode') \ or sushy.BOOT_SOURCE_MODE_UEFI - desired_enabled = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] + BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] current_enabled = system.boot.get('enabled') \ or sushy.BOOT_SOURCE_ENABLED_ONCE current_target = system.boot.get('target') \ @@ -178,9 +217,8 @@ def _set_boot_device(task, system, device, persistent=False, 'Sending: mode=%(mode)s, enabled=%(enabled)s, ' 'target=%(target)s for node %(node)s', {'vendor': vendor, 'mode': desired_mode, - 'enabled': current_enabled, 'target': current_target, - 'node': task.node.uuid}) - + 'enabled': current_enabled, + 'target': current_target, 'node': task.node.uuid}) system.set_system_boot_options( device, mode=desired_mode, @@ -188,11 +226,13 @@ def _set_boot_device(task, system, device, persistent=False, http_boot_uri=http_boot_url ) else: - LOG.debug('Sending minimal Redfish boot device' - ' change for node %(node)s', - {'node': task.node.uuid}) + LOG.debug('Sending minimal Redfish boot device change for ' + 'node %(node)s', {'node': task.node.uuid}) system.set_system_boot_options(device, enabled=enabled, http_boot_uri=http_boot_url) + + try: + 
_do_set_boot_options() except sushy.exceptions.SushyError as e: if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: # NOTE(dtantsur): continuous boot device settings have been diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index 0fc1b8328a..953835c000 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -327,6 +327,90 @@ def test_set_boot_device_http_boot(self, mock_get_system): self.assertNotIn('redfish_uefi_http_url', task.node.driver_internal_info) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_post_retry_success(self, mock_get_system): + """Test successful retry when BMC rejects changes during POST.""" + # Configure minimal retries for faster test + self.config(post_boot_retry_attempts=3, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') + + fake_system = mock.Mock() + # First call fails with POST error, second succeeds + # Create a real BadRequestError with POST error message in str() + mock_response = mock.Mock() + mock_response.status_code = 400 + mock_response.json.return_value = { + 'error': {'message': 'UnableToModifyDuringSystemPOST'} + } + post_error = sushy.exceptions.BadRequestError( + 'POST', 'http://bmc/redfish', mock_response) + fake_system.set_system_boot_options.side_effect = [ + post_error, + None # Success on retry + ] + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.set_boot_device(task, boot_devices.PXE) + + # Should have been called twice (initial + 1 retry) + self.assertEqual(2, + fake_system.set_system_boot_options.call_count) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_post_retry_exhausted(self, mock_get_system): + """Test that POST retry 
eventually gives up and raises.""" + # Configure minimal retries for faster test + self.config(post_boot_retry_attempts=2, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') + + fake_system = mock.Mock() + # All calls fail with POST error + # Create a real BadRequestError with POST error message in str() + mock_response = mock.Mock() + mock_response.status_code = 400 + mock_response.json.return_value = { + 'error': {'message': 'UnableToModifyDuringSystemPOST'} + } + post_error = sushy.exceptions.BadRequestError( + 'POST', 'http://bmc/redfish', mock_response) + fake_system.set_system_boot_options.side_effect = post_error + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaisesRegex( + exception.RedfishError, 'Redfish set boot device', + task.driver.management.set_boot_device, task, boot_devices.PXE) + + # Should have been called for all retry attempts + self.assertEqual(2, + fake_system.set_system_boot_options.call_count) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_non_post_error_no_retry(self, mock_get_system): + """Test that non-POST errors are not retried.""" + # Configure retries (should not be used) + self.config(post_boot_retry_attempts=3, group='redfish') + self.config(post_boot_retry_delay=1, group='redfish') + + fake_system = mock.Mock() + # Fail with a generic SushyError (not POST-related) + fake_system.set_system_boot_options.side_effect = ( + sushy.exceptions.SushyError('Some other error') + ) + mock_get_system.return_value = fake_system + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaisesRegex( + exception.RedfishError, 'Redfish set boot device', + task.driver.management.set_boot_device, task, boot_devices.PXE) + + # Should have been called only once (no retry for non-POST errors) + fake_system.set_system_boot_options.assert_called_once() + 
def test_restore_boot_device(self): fake_system = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, diff --git a/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml b/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml new file mode 100644 index 0000000000..e6f9478961 --- /dev/null +++ b/releasenotes/notes/redfish-post-boot-retry-8da100f84e6fcacf.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Adds retry logic for Redfish boot device changes when BMC rejects + modifications during POST (Power-On Self-Test). This addresses issues + with HPE iLO BMCs returning ``UnableToModifyDuringSystemPOST`` errors + after firmware updates or reboots. See bug + `2136275 `_. From 50b4e06f45f512d8c58a557887ca898a828df7c2 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Thu, 22 Jan 2026 22:47:08 +1000 Subject: [PATCH 13/15] Refactor firmware update temp field handling Follow-up to https://review.opendev.org/c/openstack/ironic/+/966344 - Add constants for temporary field names to prevent typos - Add _clean_temp_fields() helper for centralized cleanup - Move BMC check start time from driver_internal_info to settings dict - Change task completion log from INFO to DEBUG Assisted-By: Claude Opus 4.5 Change-Id: I9394fc3ada08f14ad44dc59839f4f6b97f63a1f5 Signed-off-by: Jacob Anders (cherry picked from commit 07102b6c0b70e024112849058e8247619324ec1e) --- ironic/drivers/modules/redfish/firmware.py | 118 +++++++++++------- .../drivers/modules/redfish/test_firmware.py | 23 ++-- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 7be2aab74f..06621235f5 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -36,6 +36,16 @@ METRICS = metrics_utils.get_metrics_logger(__name__) +# Temporary field names stored in node.driver_internal_info +BMC_FW_VERSION_BEFORE_UPDATE = 'bmc_fw_version_before_update' 
+FIRMWARE_REBOOT_REQUESTED = 'firmware_reboot_requested' + +# Temporary field names stored in fw_upd/current_update settings dict +NIC_NEEDS_POST_COMPLETION_REBOOT = 'nic_needs_post_completion_reboot' +NIC_STARTING_TIMESTAMP = 'nic_starting_timestamp' +NIC_REBOOT_TRIGGERED = 'nic_reboot_triggered' +BIOS_REBOOT_TRIGGERED = 'bios_reboot_triggered' + class RedfishFirmware(base.FirmwareInterface): @@ -246,6 +256,18 @@ def update(self, task, settings): # the monitoring and eventual completion/reboot return async_steps.get_return_state(node) + def _clean_temp_fields(self, node): + """Clean up temporary fields used during firmware update monitoring. + + This ensures no stale data interferes with new firmware updates. + + :param node: the Ironic node object + """ + # BMC-related temp fields + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) + # General firmware temp fields + node.del_driver_internal_info(FIRMWARE_REBOOT_REQUESTED) + def _setup_bmc_update_monitoring(self, node, fw_upd): """Set up monitoring for BMC firmware update. 
@@ -256,13 +278,16 @@ def _setup_bmc_update_monitoring(self, node, fw_upd): :param node: the Ironic node object :param fw_upd: firmware update settings dict """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + # Record current BMC version before update try: system = redfish_utils.get_system(node) manager = redfish_utils.get_manager(node, system) current_bmc_version = manager.firmware_version node.set_driver_internal_info( - 'bmc_fw_version_before_update', current_bmc_version) + BMC_FW_VERSION_BEFORE_UPDATE, current_bmc_version) LOG.debug('BMC version before update for node %(node)s: ' '%(version)s', {'node': node.uuid, 'version': current_bmc_version}) @@ -271,10 +296,6 @@ def _setup_bmc_update_monitoring(self, node, fw_upd): 'node %(node)s: %(error)s', {'node': node.uuid, 'error': e}) - node.set_driver_internal_info( - 'bmc_fw_check_start_time', - str(timeutils.utcnow().isoformat())) - LOG.info('BMC firmware update for node %(node)s. ' 'Monitoring BMC version instead of immediate reboot.', {'node': node.uuid}) @@ -284,9 +305,11 @@ def _setup_bmc_update_monitoring(self, node, fw_upd): if wait_interval is None: wait_interval = CONF.redfish.firmware_update_reboot_delay fw_upd['wait'] = wait_interval - # Set wait_start_time so polling can detect when task monitor - # becomes unresponsive and transition to version checking - fw_upd['wait_start_time'] = str(timeutils.utcnow().isoformat()) + # Set wait_start_time for polling interval and bmc_check_start_time + # for total timeout tracking (wait_start_time gets updated each poll) + start_time = str(timeutils.utcnow().isoformat()) + fw_upd['wait_start_time'] = start_time + fw_upd['bmc_check_start_time'] = start_time # Mark this as a BMC update so we can handle timeouts properly fw_upd['component_type'] = redfish_utils.BMC @@ -306,6 +329,9 @@ def _setup_nic_update_monitoring(self, node): :param node: the Ironic node object """ + # Clean any stale temp fields from previous updates + 
self._clean_temp_fields(node) + LOG.info('NIC firmware update for node %(node)s. Will monitor ' 'task state to determine if reboot is needed.', {'node': node.uuid}) @@ -326,6 +352,9 @@ def _setup_bios_update_monitoring(self, node): :param node: the Ironic node object """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + LOG.info('BIOS firmware update for node %(node)s. Will reboot ' 'when update task starts.', {'node': node.uuid}) @@ -346,6 +375,9 @@ def _setup_default_update_monitoring(self, node, fw_upd): :param node: the Ironic node object :param fw_upd: firmware update settings dict """ + # Clean any stale temp fields from previous updates + self._clean_temp_fields(node) + component = fw_upd.get('component', '') LOG.warning( 'Unknown component type %(component)s for node %(node)s. ' @@ -411,7 +443,7 @@ def _handle_bmc_update_completion(self, task, update_service, # Note: BMC may be unresponsive after firmware update - expected current_version = self._get_current_bmc_version(node) version_before = node.driver_internal_info.get( - 'bmc_fw_version_before_update') + BMC_FW_VERSION_BEFORE_UPDATE) # If we can read the version and it changed, update is complete if (current_version is not None @@ -423,15 +455,13 @@ def _handle_bmc_update_completion(self, task, update_service, 'reboot.', {'node': node.uuid, 'old': version_before, 'new': current_version}) - node.del_driver_internal_info('bmc_fw_check_start_time') - node.del_driver_internal_info('bmc_fw_version_before_update') + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) node.save() self._continue_updates(task, update_service, settings) return # Check if we've been checking for too long - check_start_time = node.driver_internal_info.get( - 'bmc_fw_check_start_time') + check_start_time = current_update.get('bmc_check_start_time') if check_start_time: check_start = timeutils.parse_isotime(check_start_time) @@ -459,7 +489,7 @@ def _handle_bmc_update_completion(self, 
task, update_service, {'node': node.uuid, 'elapsed': elapsed_time.seconds}) # Mark that reboot is needed node.set_driver_internal_info( - 'firmware_reboot_requested', True) + FIRMWARE_REBOOT_REQUESTED, True) # Enable reboot flag now that we're ready to reboot deploy_utils.set_async_step_flags( node, @@ -467,8 +497,7 @@ def _handle_bmc_update_completion(self, task, update_service, polling=True ) - node.del_driver_internal_info('bmc_fw_check_start_time') - node.del_driver_internal_info('bmc_fw_version_before_update') + node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) node.save() self._continue_updates(task, update_service, settings) return @@ -504,7 +533,7 @@ def _handle_nic_update_completion(self, task, update_service, settings, # Check if reboot is needed (task went to Running state) needs_reboot = current_update.get( - 'nic_needs_post_completion_reboot', False) + NIC_NEEDS_POST_COMPLETION_REBOOT, False) if needs_reboot: LOG.info( @@ -514,20 +543,20 @@ def _handle_nic_update_completion(self, task, update_service, settings, # Mark that reboot is needed node.set_driver_internal_info( - 'firmware_reboot_requested', True) + FIRMWARE_REBOOT_REQUESTED, True) # Clean up flags - current_update.pop('nic_needs_post_completion_reboot', None) - current_update.pop('nic_starting_timestamp', None) - current_update.pop('nic_reboot_triggered', None) + current_update.pop(NIC_NEEDS_POST_COMPLETION_REBOOT, None) + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) else: LOG.info( 'NIC firmware update task completed for node ' '%(node)s. 
Reboot already occurred during update ' 'start.', {'node': node.uuid}) # Clean up all NIC-related flags - current_update.pop('nic_starting_timestamp', None) - current_update.pop('nic_reboot_triggered', None) + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) self._continue_updates(task, update_service, settings) @@ -749,7 +778,7 @@ def _continue_updates(self, task, update_service, settings): if len(settings) == 1: # Last firmware update - check if reboot is needed reboot_requested = node.driver_internal_info.get( - 'firmware_reboot_requested', False) + FIRMWARE_REBOOT_REQUESTED, False) self._clear_updates(node) @@ -825,7 +854,8 @@ def _clear_updates(self, node): node.del_driver_internal_info('redfish_fw_updates') node.del_driver_internal_info('redfish_fw_update_start_time') node.del_driver_internal_info('firmware_cleanup') - node.del_driver_internal_info('firmware_reboot_requested') + # Clean all temporary fields used during firmware update monitoring + self._clean_temp_fields(node) node.save() @METRICS.timer('RedfishFirmware._query_update_failed') @@ -880,11 +910,11 @@ def _handle_task_completion(self, task, sushy_task, messages, if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED and sushy_task.task_status in [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): - LOG.info('Firmware update task completed for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s.', - {'node': node.uuid, - 'firmware_image': current_update['url'], - 'messages': ", ".join(messages)}) + LOG.debug('Redfish task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s.', + {'node': node.uuid, + 'firmware_image': current_update['url'], + 'messages': ", ".join(messages)}) # Component-specific post-update handling component = current_update.get('component', '') @@ -904,12 +934,12 @@ def _handle_task_completion(self, task, sushy_task, messages, # very quickly (staging the firmware) before Ironic can poll # and trigger the 
reboot. In this case, we need to trigger # the reboot now to actually apply the firmware. - if not current_update.get('bios_reboot_triggered'): + if not current_update.get(BIOS_REBOOT_TRIGGERED): LOG.info('BIOS firmware update task completed for node ' '%(node)s but reboot was not triggered yet. ' 'Triggering reboot now to apply staged firmware.', {'node': node.uuid}) - current_update['bios_reboot_triggered'] = True + current_update[BIOS_REBOOT_TRIGGERED] = True node.set_driver_internal_info('redfish_fw_updates', settings) node.save() @@ -925,7 +955,7 @@ def _handle_task_completion(self, task, sushy_task, messages, 'Proceeding with continuation.', {'node': node.uuid}) # Clean up the reboot trigger flag - current_update.pop('bios_reboot_triggered', None) + current_update.pop(BIOS_REBOOT_TRIGGERED, None) self._continue_updates(task, update_service, settings) else: # Default: continue as before @@ -987,10 +1017,10 @@ def _handle_nic_task_starting(self, task, task_monitor, settings, 'Will wait for completion then reboot.', {'node': node.uuid}) # Clear flags since we're past the starting phase - current_update.pop('nic_starting_timestamp', None) - current_update.pop('nic_reboot_triggered', None) + current_update.pop(NIC_STARTING_TIMESTAMP, None) + current_update.pop(NIC_REBOOT_TRIGGERED, None) # Mark that reboot will be needed after completion - current_update['nic_needs_post_completion_reboot'] = True + current_update[NIC_NEEDS_POST_COMPLETION_REBOOT] = True node.set_driver_internal_info('redfish_fw_updates', settings) node.save() return False # Continue polling until completion @@ -998,17 +1028,17 @@ def _handle_nic_task_starting(self, task, task_monitor, settings, # If task is in STARTING, check if we need to wait or reboot if task_state == sushy.TASK_STATE_STARTING: # Check if we already triggered a reboot - if current_update.get('nic_reboot_triggered'): + if current_update.get(NIC_REBOOT_TRIGGERED): LOG.debug('NIC firmware update for node %(node)s: ' 'reboot 
already triggered, waiting for task ' 'to progress.', {'node': node.uuid}) return False # Continue polling - starting_time = current_update.get('nic_starting_timestamp') + starting_time = current_update.get(NIC_STARTING_TIMESTAMP) if not starting_time: # First time seeing STARTING - record timestamp - current_update['nic_starting_timestamp'] = str( + current_update[NIC_STARTING_TIMESTAMP] = str( timeutils.utcnow().isoformat()) node.set_driver_internal_info( 'redfish_fw_updates', settings) @@ -1042,9 +1072,9 @@ def _handle_nic_task_starting(self, task, task_monitor, settings, {'node': node.uuid, 'wait': nic_starting_wait}) # Mark that we triggered a reboot to prevent repeat reboots - current_update['nic_reboot_triggered'] = True + current_update[NIC_REBOOT_TRIGGERED] = True # Clean up timestamp - current_update.pop('nic_starting_timestamp', None) + current_update.pop(NIC_STARTING_TIMESTAMP, None) node.set_driver_internal_info('redfish_fw_updates', settings) node.save() @@ -1080,7 +1110,7 @@ def _handle_bios_task_starting(self, task, task_monitor, settings, """ node = task.node - if current_update.get('bios_reboot_triggered'): + if current_update.get(BIOS_REBOOT_TRIGGERED): # Already triggered, just keep polling return False @@ -1108,7 +1138,7 @@ def _handle_bios_task_starting(self, task, task_monitor, settings, 'state': sushy_task.task_state}) # Mark reboot as triggered to avoid repeated reboots - current_update['bios_reboot_triggered'] = True + current_update[BIOS_REBOOT_TRIGGERED] = True node.set_driver_internal_info( 'redfish_fw_updates', settings) node.save() @@ -1201,7 +1231,7 @@ def _handle_wait_completion(self, task, update_service, settings, # For default/unknown components, reboot may be needed if component_type is None: node.set_driver_internal_info( - 'firmware_reboot_requested', True) + FIRMWARE_REBOOT_REQUESTED, True) node.save() # Continue with updates self._continue_updates(task, update_service, settings) diff --git 
a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index c89f116a79..451f04db2e 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -1106,14 +1106,14 @@ def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, task, interface = self._test__check_node_redfish_firmware_update() task.upgrade_lock.assert_called_once_with() - info_calls = [ - mock.call('Firmware update task completed for node %(node)s, ' + debug_calls = [ + mock.call('Redfish task completed for node %(node)s, ' 'firmware %(firmware_image)s: %(messages)s.', {'node': self.node.uuid, 'firmware_image': 'https://bmc/v1.0.1', 'messages': 'Firmware update done'})] - log_mock.info.assert_has_calls(info_calls) + log_mock.debug.assert_has_calls(debug_calls) # NOTE(iurygregory): _validate_resources_stability is now called # in _continue_updates before power operations, not in # _handle_task_completion @@ -1802,7 +1802,9 @@ def test_update_bmc_uses_configured_timeout(self, mock_get_update_service, ) # Verify BMC version check tracking is set up info = task.node.driver_internal_info - self.assertIn('bmc_fw_check_start_time', info) + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) self.assertIn('bmc_fw_version_before_update', info) self.assertEqual(states.SERVICEWAIT, result) @@ -1847,7 +1849,9 @@ def test_update_bmc_uses_bmc_constant(self, mock_get_update_service, ) # Verify BMC version check tracking is set up info = task.node.driver_internal_info - self.assertIn('bmc_fw_check_start_time', info) + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) self.assertIn('bmc_fw_version_before_update', info) self.assertEqual(states.SERVICEWAIT, result) @@ -2014,7 +2018,9 @@ 
def test_update_bmc_no_immediate_reboot(self, mock_get_update_service, # Verify BMC version check tracking is set up info = task.node.driver_internal_info - self.assertIn('bmc_fw_check_start_time', info) + fw_updates = info.get('redfish_fw_updates', []) + self.assertEqual(1, len(fw_updates)) + self.assertIn('bmc_check_start_time', fw_updates[0]) self.assertIn('bmc_fw_version_before_update', info) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @@ -2094,15 +2100,14 @@ def test_bmc_version_check_timeout_sets_reboot_flag( mock_parse_isotime.return_value = start_time mock_utcnow.return_value = current_time settings = [{'component': 'bmc', 'url': 'http://bmc/v1.0.0', - 'wait': 300, 'task_monitor': '/tasks/1'}] + 'wait': 300, 'task_monitor': '/tasks/1', + 'bmc_check_start_time': '2025-01-01T00:00:00.000000'}] with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: # Set up node with BMC version checking in progress task.node.set_driver_internal_info( 'redfish_fw_updates', settings) - task.node.set_driver_internal_info( - 'bmc_fw_check_start_time', '2025-01-01T00:00:00.000000') # Mock BMC is unresponsive mock_get_bmc_version.return_value = None From 722cc305ed28a3d29370aa06661368c1c644f08c Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Tue, 27 Jan 2026 01:08:14 -0300 Subject: [PATCH 14/15] Change default value of some redfish firmware update config This commit changes the default values for the following config options: - [redfish]firmware_update_wait_unresponsive_bmc - [redfish]firmware_update_resource_validation_timeout Closes-Bug: #2139122 Change-Id: Ie964df68e3bf4e0421edf4fedab7748603baf4e0 Signed-off-by: Iury Gregory Melo Ferreira (cherry picked from commit cb7e0a50a344001ae6b0e865e4b8b63dda5e7696) --- ironic/conf/redfish.py | 4 ++-- .../unit/drivers/modules/redfish/test_firmware.py | 10 +++++----- releasenotes/notes/bug-2139122-c379997d1943ead6.yaml | 8 ++++++++ 3 files changed, 15 insertions(+), 7 
deletions(-) create mode 100644 releasenotes/notes/bug-2139122-c379997d1943ead6.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 63355864de..530de73e89 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -92,7 +92,7 @@ 'failed firmware update tasks')), cfg.IntOpt('firmware_update_wait_unresponsive_bmc', min=0, - default=300, + default=0, help=_('Number of seconds to wait before proceeding with the ' 'reboot to finish the BMC firmware update step')), cfg.IntOpt('firmware_update_required_successes', @@ -110,7 +110,7 @@ 'with no delay.')), cfg.IntOpt('firmware_update_resource_validation_timeout', min=0, - default=300, + default=480, help=_('Timeout (in seconds) to wait for BMC resources ' '(System, Manager, NetworkAdapters) to become stable ' 'and consistently available after firmware update. ' diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 451f04db2e..268a36e9ef 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -1500,7 +1500,7 @@ def test__validate_resources_stability_timeout(self): 'timeout') # Mock time progression to exceed timeout - time_mock.side_effect = [0, 350] # Exceeds 300 second timeout + time_mock.side_effect = [0, 500] # Should raise RedfishError due to timeout self.assertRaises(exception.RedfishError, @@ -1584,7 +1584,7 @@ def test__validate_resources_stability_manager_failure(self): 'manager error') # Mock time progression to exceed timeout - time_mock.side_effect = [0, 350] + time_mock.side_effect = [0, 500] # Should raise RedfishError due to timeout self.assertRaises(exception.RedfishError, @@ -1612,7 +1612,7 @@ def test__validate_resources_stability_network_adapters_failure(self): 'chassis error') # Mock time progression to exceed timeout - time_mock.side_effect = [0, 350] + time_mock.side_effect = [0, 500] # Should raise 
RedfishError due to timeout self.assertRaises(exception.RedfishError, @@ -1745,9 +1745,9 @@ def test__validate_resources_stability_badrequest_error(self, mock_log): 'http://test', mock_response, mock_response) # Mock time progression: start at 0, try once at 10, timeout - # at 350, this allows at least one loop iteration to trigger + # at 500, this allows at least one loop iteration to trigger # the exception - time_mock.side_effect = [0, 10, 350] + time_mock.side_effect = [0, 10, 500] # Should raise RedfishError due to timeout self.assertRaises(exception.RedfishError, diff --git a/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml b/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml new file mode 100644 index 0000000000..af7dc286ed --- /dev/null +++ b/releasenotes/notes/bug-2139122-c379997d1943ead6.yaml @@ -0,0 +1,8 @@ +--- +upgrade: + - | + The default configuration value for + ``[redfish]firmware_update_wait_unresponsive_bmc`` and for + ``[redfish]firmware_update_resource_validation_timeout`` has been changed + to ``0`` and ``480`` respectively. For more information, see + https://bugs.launchpad.net/ironic/+bug/2139122. From 1e002e39af172805bc63c7a12c63c958627963df Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Fri, 30 Jan 2026 13:16:37 -0300 Subject: [PATCH 15/15] Call task.upgrade_lock at begin of _handle functions We saw cases in our testing where the newer information of the node object wasn't being saved. The task.upgrade_lock reloads the node from the DB, but the node variabble would point to the old node object. 
Signed-off-by: Iury Gregory Melo Ferreira Change-Id: Iebef487f7411845958fee19381e58eb448894d82 (cherry picked from commit 54bb9cc14ae66db7c1f3ea8d08a37b1483cfbc17) --- ironic/drivers/modules/redfish/firmware.py | 10 +-- .../drivers/modules/redfish/test_firmware.py | 74 +++++++++++++++++++ ...x-node-info-fwupdate-23778908c67d1bd6.yaml | 6 ++ 3 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 06621235f5..dd49c79475 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -998,10 +998,9 @@ def _handle_nic_task_starting(self, task, task_monitor, settings, :param current_update: the current firmware update being processed :returns: True if should stop polling, False to continue """ - node = task.node - # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() + node = task.node try: sushy_task = task_monitor.get_task() @@ -1108,14 +1107,14 @@ def _handle_bios_task_starting(self, task, task_monitor, settings, :param current_update: the current firmware update being processed :returns: True if reboot was triggered, False otherwise """ - node = task.node - if current_update.get(BIOS_REBOOT_TRIGGERED): # Already triggered, just keep polling return False # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() + node = task.node + try: sushy_task = task_monitor.get_task() @@ -1168,10 +1167,9 @@ def _handle_wait_completion(self, task, update_service, settings, :param settings: firmware update settings :param current_update: the current firmware update being processed """ - node = task.node - # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() + node = task.node # Check if this is BMC version checking if 
current_update.get('bmc_version_checking'): diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 268a36e9ef..edf9b41076 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -2449,3 +2449,77 @@ def test_bios_continue_after_completion_with_prior_reboot( '%(node)s. System was already rebooted. ' 'Proceeding with continuation.', {'node': task.node.uuid}) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) + def test_bios_handle_task_starting_sets_flag_correctly( + self, mock_get_task_monitor, mock_power_action): + """Test _handle_bios_task_starting sets flag correctly with lock. + + This test verifies that when _handle_bios_task_starting is called: + 1. The bios_reboot_triggered flag is set + 2. task.node reflects the flag immediately after the method returns + + If node = task.node were captured BEFORE upgrade_lock (the bug), + this test would fail because task.node would be stale and not + have the flag. + + With the correct order (upgrade_lock first, then node = task.node), + the test passes because changes are saved to the correct object. 
+ """ + settings = [{'component': 'bios', 'url': 'http://bios/v1.0.1', + 'task_monitor': '/tasks/1'}] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + # Set up node with BIOS update in progress + task.node.set_driver_internal_info('redfish_fw_updates', settings) + task.node.clean_step = {'step': 'update', 'interface': 'firmware'} + task.node.save() + + # Mock task monitor showing STARTING state + mock_task_monitor = mock.Mock() + mock_task = mock.Mock() + mock_task.task_state = sushy.TASK_STATE_STARTING + mock_task_monitor.get_task.return_value = mock_task + mock_get_task_monitor.return_value = mock_task_monitor + + # Mock upgrade_lock to simulate what happens in production: + # Replace task.node with a fresh copy from DB + original_upgrade_lock = task.upgrade_lock + def mock_upgrade_lock(): + original_upgrade_lock() + # Simulate production behavior: replace task.node + # with fresh copy + task.node = objects.Node.get(self.context, task.node.uuid) + + with mock.patch.object(task, 'upgrade_lock', + side_effect=mock_upgrade_lock, + autospec=True): + # Call the actual method being tested + firmware_interface = redfish_fw.RedfishFirmware() + result = firmware_interface._handle_bios_task_starting( + task, mock_task_monitor, settings, settings[0]) + + # Verify reboot was triggered + self.assertTrue(result, + 'Method should return True when reboot triggered') + mock_power_action.assert_called_once() + + # CRITICAL TEST: Verify task.node has the flag + # If the order were wrong (node captured before upgrade_lock), + # task.node would be a stale object without the flag + current_settings = task.node.driver_internal_info.get( + 'redfish_fw_updates', [{}]) + self.assertTrue( + current_settings[0].get('bios_reboot_triggered'), + 'Flag must be in task.node after method returns. 
' + 'If this fails, node was captured before upgrade_lock.') + + # Verify the flag is also persisted to DB + task.node.refresh() + refreshed_settings = task.node.driver_internal_info.get( + 'redfish_fw_updates', [{}]) + self.assertTrue( + refreshed_settings[0].get('bios_reboot_triggered'), + 'Flag must be persisted to database') diff --git a/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml b/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml new file mode 100644 index 0000000000..981da0bb92 --- /dev/null +++ b/releasenotes/notes/fix-node-info-fwupdate-23778908c67d1bd6.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes a regression introduced in the firmware upgrade fixes that caused + the intermediate status not to get persisted on the node, causing a + reboot loop.