diff --git a/docs/changelog.md b/docs/changelog.md index 120c95352..fb6061fc4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2026-03-30 + +- Fixed stale `systemd` service files left behind when a unit is removed from a co-located machine. The service is now disabled, the service file removed, and the unit data cleaned up on stop. + ## 2026-03-25 - Removed `mongodb` relation. MongoDB-based reactive runner spawning is no longer supported. **If you have an active MongoDB relation, remove it with `juju remove-relation` before upgrading.** diff --git a/github-runner-manager/pyproject.toml b/github-runner-manager/pyproject.toml index a5877fdd6..2655cb0e7 100644 --- a/github-runner-manager/pyproject.toml +++ b/github-runner-manager/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.17.0" +version = "0.17.1" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] diff --git a/src/charm.py b/src/charm.py index 8d9f7bf7b..8d84eb116 100755 --- a/src/charm.py +++ b/src/charm.py @@ -448,7 +448,7 @@ def _log_charm_status(self) -> None: def _on_stop(self, _: StopEvent) -> None: """Handle the stopping of the charm.""" self._manager_client.flush_runner(busy=True) - manager_service.stop(self.unit.name) + manager_service.cleanup(self.unit.name) def _install_deps(self) -> None: """Install dependences for the charm.""" diff --git a/src/manager_service.py b/src/manager_service.py index 5cce925da..a5e2ddbd4 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -6,6 +6,7 @@ import fcntl import json import logging +import shutil import socket import textwrap from pathlib import Path @@ -39,6 +40,7 @@ _INSTALL_ERROR_MESSAGE = "Unable to install github-runner-manager package from source" _SERVICE_SETUP_ERROR_MESSAGE = "Unable to enable or start the github-runner-manager application" _SERVICE_STOP_ERROR_MESSAGE = "Unable to stop the github-runner-manager application" +_SERVICE_CLEANUP_ERROR_MESSAGE = "Unable to clean up the github-runner-manager service" logger = logging.getLogger(__name__) @@ -267,6 +269,34 @@ def stop(unit_name: str) -> None: _stop(unit_name) +def cleanup(unit_name: str) -> None: + """Stop, disable, and remove all artifacts for a unit's service. + + Used during unit removal to ensure no stale systemd service or data remains + on the machine. + + Args: + unit_name: The Juju unit name. + + Raises: + RunnerManagerApplicationStopError: Failed to clean up the service. + """ + _stop(unit_name) + instance_service = _instance_service_name(unit_name) + service_file = SYSTEMD_SERVICE_PATH / f"{instance_service}.service" + try: + systemd.service_disable(instance_service) + except SystemdError: + logger.warning("Failed to disable %s, unit may already be absent", instance_service) + try: + service_file.unlink(missing_ok=True) + systemd.daemon_reload() + except (SystemdError, OSError) as err: + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from err + + _remove_unit_data(unit_name) + + def _stop(unit_name: str) -> None: """Stop the GitHub runner manager service. @@ -375,3 +405,24 @@ def _setup_service_file(unit_name: str, config_file: Path, log_file: Path, log_l ) service_path.parent.mkdir(parents=True, exist_ok=True) service_path.write_text(service_file_content, "utf-8") + + +def _remove_unit_data(unit_name: str) -> None: + """Remove the per-unit data directory and log file. + + Raises: + RunnerManagerApplicationStopError: Failed to remove unit data. + """ + normalized = _normalized_unit(unit_name) + unit_dir = GITHUB_RUNNER_MANAGER_SERVICE_DIR / normalized + try: + shutil.rmtree(unit_dir) + except FileNotFoundError: + pass + except OSError as exc: + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from exc + log_file = GITHUB_RUNNER_MANAGER_SERVICE_LOG_DIR / f"{normalized}.log" + try: + log_file.unlink(missing_ok=True) + except OSError as exc: + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from exc diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 625c1bc9e..b50f9d563 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -307,11 +307,11 @@ def test__on_config_changed_no_flush(monkeypatch: pytest.MonkeyPatch, config_opt harness.charm._manager_client.flush_runner.assert_not_called() -def test_on_stop_busy_flush_and_stop_service(harness: Harness, monkeypatch: pytest.MonkeyPatch): +def test_on_stop_busy_flush_and_cleanup_service(harness: Harness, monkeypatch: pytest.MonkeyPatch): """ arrange: Set up charm with Openstack mode and runner scaler mock. act: Trigger stop event. - assert: Runner scaler mock flushes the runners using busy mode. + assert: Runners are flushed in busy mode and service cleanup is called. """ state_mock = MagicMock() harness.charm._setup_state = MagicMock(return_value=state_mock) @@ -324,7 +324,7 @@ def test_on_stop_busy_flush_and_stop_service(harness: Harness, monkeypatch: pyte harness.charm._on_stop(mock_event) manager_client_mock.flush_runner.assert_called_once_with(busy=True) - mock_manager_service.stop.assert_called_once() + mock_manager_service.cleanup.assert_called_once() @pytest.mark.parametrize( diff --git a/tests/unit/test_manager_service.py b/tests/unit/test_manager_service.py index 8795ad8c4..dacfc23c6 100644 --- a/tests/unit/test_manager_service.py +++ b/tests/unit/test_manager_service.py @@ -190,6 +190,85 @@ def test_stop_with_stopped_service(mock_systemd: MagicMock): mock_systemd.service_stop.assert_not_called() +def test_cleanup_removes_service_and_data(mock_systemd: MagicMock, patched_paths: PatchedPaths): + """ + arrange: A running service with service file, data dir, and log file on disk. + act: Run cleanup. + assert: Service is stopped, disabled, service file removed, daemon reloaded, + data dir and log file removed. + """ + unit_name = "test-unit/0" + normalized = "test-unit-0" + instance_service = f"github-runner-manager@{normalized}" + mock_systemd.service_running.return_value = True + + service_file = patched_paths.systemd_service_path / f"{instance_service}.service" + service_file.parent.mkdir(parents=True, exist_ok=True) + service_file.write_text("mock service", encoding="utf-8") + + unit_dir = patched_paths.service_dir / normalized + unit_dir.mkdir(parents=True, exist_ok=True) + (unit_dir / "config.yaml").write_text("mock config", encoding="utf-8") + (unit_dir / "http_port").write_text("55555", encoding="utf-8") + + log_file = patched_paths.service_log_dir / f"{normalized}.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + log_file.write_text("mock log", encoding="utf-8") + + manager_service.cleanup(unit_name) + + mock_systemd.service_stop.assert_called_once_with(instance_service) + mock_systemd.service_disable.assert_called_once_with(instance_service) + mock_systemd.daemon_reload.assert_called_once() + assert not service_file.exists() + assert not unit_dir.exists() + assert not log_file.exists() + + +def test_cleanup_idempotent_missing_files(mock_systemd: MagicMock, patched_paths: PatchedPaths): + """ + arrange: Service is not running and no files exist on disk. + act: Run cleanup. + assert: No errors raised; disable is attempted, daemon_reload still called. + """ + mock_systemd.service_running.return_value = False + mock_systemd.service_disable.side_effect = SystemdError("unit not found") + + manager_service.cleanup(unit_name="test-unit/0") + + mock_systemd.service_stop.assert_not_called() + mock_systemd.service_disable.assert_called_once() + mock_systemd.daemon_reload.assert_called_once() + + +def test_cleanup_daemon_reload_error(mock_systemd: MagicMock): + """ + arrange: systemd.daemon_reload raises SystemdError. + act: Run cleanup. + assert: RunnerManagerApplicationStopError is raised. + """ + mock_systemd.service_running.return_value = False + mock_systemd.daemon_reload.side_effect = SystemdError("Mock error") + + with pytest.raises(manager_service.RunnerManagerApplicationStopError): + manager_service.cleanup(unit_name="test-unit/0") + + +def test_cleanup_rmtree_permission_error( + mock_systemd: MagicMock, patched_paths: PatchedPaths, monkeypatch +): + """ + arrange: shutil.rmtree raises PermissionError. + act: Run cleanup. + assert: RunnerManagerApplicationStopError is raised. + """ + mock_systemd.service_running.return_value = False + monkeypatch.setattr("shutil.rmtree", MagicMock(side_effect=PermissionError("denied"))) + + with pytest.raises(manager_service.RunnerManagerApplicationStopError): + manager_service.cleanup(unit_name="test-unit/0") + + # 2026-01-19 Skip the mocks fixture to test the actual ensure_http_port_for_unit implementation. # The mocks fixture (see conftest.py) normally patches this function to return 55555. @pytest.mark.nomocks