From 85449ff71617c732e1ff6b9cdf15362538d83096 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 10:19:23 +0200 Subject: [PATCH 1/7] fix(stop): clean up systemd service and data on unit removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On co-located machines with multiple github-runner charms, removing a unit left its systemd service file enabled. Systemd would restart the stale service, which — if the binary was still compatible — silently kept managing runners with outdated configuration. The stop hook now disables the service, removes the service file, reloads systemd, and cleans up the per-unit data and log files. --- docs/changelog.md | 4 ++ github-runner-manager/pyproject.toml | 2 +- src/charm.py | 2 +- src/manager_service.py | 38 +++++++++++++++++ tests/unit/test_charm.py | 2 +- tests/unit/test_manager_service.py | 63 ++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 120c953524..f32b979369 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2026-03-30 + +- Fixed stale systemd service files left behind when a unit is removed from a co-located machine. The service is now disabled, the service file removed, and per-unit data cleaned up on stop. + ## 2026-03-25 - Removed `mongodb` relation. MongoDB-based reactive runner spawning is no longer supported. **If you have an active MongoDB relation, remove it with `juju remove-relation` before upgrading.** diff --git a/github-runner-manager/pyproject.toml b/github-runner-manager/pyproject.toml index a5877fdd63..2655cb0e73 100644 --- a/github-runner-manager/pyproject.toml +++ b/github-runner-manager/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.17.0" +version = "0.17.1" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] diff --git a/src/charm.py b/src/charm.py index 8d9f7bf7b0..8d84eb1165 100755 --- a/src/charm.py +++ b/src/charm.py @@ -448,7 +448,7 @@ def _log_charm_status(self) -> None: def _on_stop(self, _: StopEvent) -> None: """Handle the stopping of the charm.""" self._manager_client.flush_runner(busy=True) - manager_service.stop(self.unit.name) + manager_service.cleanup(self.unit.name) def _install_deps(self) -> None: """Install dependences for the charm.""" diff --git a/src/manager_service.py b/src/manager_service.py index 5cce925da7..ab58253784 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -6,6 +6,7 @@ import fcntl import json import logging +import shutil import socket import textwrap from pathlib import Path @@ -267,6 +268,34 @@ def stop(unit_name: str) -> None: _stop(unit_name) +def cleanup(unit_name: str) -> None: + """Stop, disable, and remove all artifacts for a unit's service. + + Used during unit removal to ensure no stale systemd service or data remains + on the machine. + + Args: + unit_name: The Juju unit name. + + Raises: + RunnerManagerApplicationStopError: Failed to clean up the service. + """ + _stop(unit_name) + instance_service = _instance_service_name(unit_name) + try: + systemd.service_disable(instance_service) + normalized = _normalized_unit(unit_name) + service_file = ( + SYSTEMD_SERVICE_PATH / f"{GITHUB_RUNNER_MANAGER_SERVICE_NAME}@{normalized}.service" + ) + service_file.unlink(missing_ok=True) + systemd.daemon_reload() + except SystemdError as err: + raise RunnerManagerApplicationStopError(_SERVICE_STOP_ERROR_MESSAGE) from err + + _remove_unit_data(unit_name) + + def _stop(unit_name: str) -> None: """Stop the GitHub runner manager service. @@ -375,3 +404,12 @@ def _setup_service_file(unit_name: str, config_file: Path, log_file: Path, log_l ) service_path.parent.mkdir(parents=True, exist_ok=True) service_path.write_text(service_file_content, "utf-8") + + +def _remove_unit_data(unit_name: str) -> None: + """Remove the per-unit data directory and log file.""" + normalized = _normalized_unit(unit_name) + unit_dir = GITHUB_RUNNER_MANAGER_SERVICE_DIR / normalized + shutil.rmtree(unit_dir, ignore_errors=True) + log_file = GITHUB_RUNNER_MANAGER_SERVICE_LOG_DIR / f"{normalized}.log" + log_file.unlink(missing_ok=True) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 625c1bc9ed..5543e91f67 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -324,7 +324,7 @@ def test_on_stop_busy_flush_and_stop_service(harness: Harness, monkeypatch: pyte harness.charm._on_stop(mock_event) manager_client_mock.flush_runner.assert_called_once_with(busy=True) - mock_manager_service.stop.assert_called_once() + mock_manager_service.cleanup.assert_called_once() @pytest.mark.parametrize( diff --git a/tests/unit/test_manager_service.py b/tests/unit/test_manager_service.py index 8795ad8c4e..193ac22619 100644 --- a/tests/unit/test_manager_service.py +++ b/tests/unit/test_manager_service.py @@ -190,6 +190,69 @@ def test_stop_with_stopped_service(mock_systemd: MagicMock): mock_systemd.service_stop.assert_not_called() +def test_cleanup_removes_service_and_data(mock_systemd: MagicMock, patched_paths: PatchedPaths): + """ + arrange: A running service with service file, data dir, and log file on disk. + act: Run cleanup. + assert: Service is stopped, disabled, service file removed, daemon reloaded, + data dir and log file removed. + """ + unit_name = "test-unit/0" + normalized = "test-unit-0" + instance_service = f"github-runner-manager@{normalized}" + mock_systemd.service_running.return_value = True + + service_file = patched_paths.systemd_service_path / f"{instance_service}.service" + service_file.parent.mkdir(parents=True, exist_ok=True) + service_file.write_text("mock service", encoding="utf-8") + + unit_dir = patched_paths.service_dir / normalized + unit_dir.mkdir(parents=True, exist_ok=True) + (unit_dir / "config.yaml").write_text("mock config", encoding="utf-8") + (unit_dir / "http_port").write_text("55555", encoding="utf-8") + + log_file = patched_paths.service_log_dir / f"{normalized}.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + log_file.write_text("mock log", encoding="utf-8") + + manager_service.cleanup(unit_name) + + mock_systemd.service_stop.assert_called_once_with(instance_service) + mock_systemd.service_disable.assert_called_once_with(instance_service) + mock_systemd.daemon_reload.assert_called_once() + assert not service_file.exists() + assert not unit_dir.exists() + assert not log_file.exists() + + +def test_cleanup_idempotent_missing_files(mock_systemd: MagicMock, patched_paths: PatchedPaths): + """ + arrange: Service is not running and no files exist on disk. + act: Run cleanup. + assert: No errors raised; disable and daemon_reload still called. + """ + mock_systemd.service_running.return_value = False + + manager_service.cleanup(unit_name="test-unit/0") + + mock_systemd.service_stop.assert_not_called() + mock_systemd.service_disable.assert_called_once() + mock_systemd.daemon_reload.assert_called_once() + + +def test_cleanup_systemd_error(mock_systemd: MagicMock): + """ + arrange: systemd.service_disable raises SystemdError. + act: Run cleanup. + assert: RunnerManagerApplicationStopError is raised. + """ + mock_systemd.service_running.return_value = False + mock_systemd.service_disable.side_effect = SystemdError("Mock error") + + with pytest.raises(manager_service.RunnerManagerApplicationStopError): + manager_service.cleanup(unit_name="test-unit/0") + + # 2026-01-19 Skip the mocks fixture to test the actual ensure_http_port_for_unit implementation. # The mocks fixture (see conftest.py) normally patches this function to return 55555. @pytest.mark.nomocks From 1ed29a60106cb087379453ba5804d399072cbbd3 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 11:03:41 +0200 Subject: [PATCH 2/7] fix(docs): address vale lint findings in changelog Replace 'per-unit' with 'the unit' and backtick-quote 'systemd'. --- docs/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index f32b979369..fb6061fc40 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,7 +4,7 @@ This changelog documents user-relevant changes to the GitHub runner charm. ## 2026-03-30 -- Fixed stale systemd service files left behind when a unit is removed from a co-located machine. The service is now disabled, the service file removed, and per-unit data cleaned up on stop. +- Fixed stale `systemd` service files left behind when a unit is removed from a co-located machine. The service is now disabled, the service file removed, and the unit data cleaned up on stop. ## 2026-03-25 From 48b4a10d0467b31451d30dd2c25bf363c08ace74 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 12:53:30 +0200 Subject: [PATCH 3/7] fix(cleanup): surface filesystem errors instead of silently ignoring Catch OSError from service file unlink in cleanup(), and replace shutil.rmtree(ignore_errors=True) with explicit FileNotFoundError handling so permission errors surface as RunnerManagerApplicationStopError. --- src/manager_service.py | 15 ++++++++++++--- tests/unit/test_manager_service.py | 15 +++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/manager_service.py b/src/manager_service.py index ab58253784..868939aafc 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -290,7 +290,7 @@ def cleanup(unit_name: str) -> None: ) service_file.unlink(missing_ok=True) systemd.daemon_reload() - except SystemdError as err: + except (SystemdError, OSError) as err: raise RunnerManagerApplicationStopError(_SERVICE_STOP_ERROR_MESSAGE) from err _remove_unit_data(unit_name) @@ -407,9 +407,18 @@ def _setup_service_file(unit_name: str, config_file: Path, log_file: Path, log_l def _remove_unit_data(unit_name: str) -> None: - """Remove the per-unit data directory and log file.""" + """Remove the per-unit data directory and log file. + + Raises: + RunnerManagerApplicationStopError: Failed to remove unit data. + """ normalized = _normalized_unit(unit_name) unit_dir = GITHUB_RUNNER_MANAGER_SERVICE_DIR / normalized - shutil.rmtree(unit_dir, ignore_errors=True) + try: + shutil.rmtree(unit_dir) + except FileNotFoundError: + pass + except OSError as exc: + raise RunnerManagerApplicationStopError(_SERVICE_STOP_ERROR_MESSAGE) from exc log_file = GITHUB_RUNNER_MANAGER_SERVICE_LOG_DIR / f"{normalized}.log" log_file.unlink(missing_ok=True) diff --git a/tests/unit/test_manager_service.py b/tests/unit/test_manager_service.py index 193ac22619..6c1de619d0 100644 --- a/tests/unit/test_manager_service.py +++ b/tests/unit/test_manager_service.py @@ -253,6 +253,21 @@ def test_cleanup_systemd_error(mock_systemd: MagicMock): manager_service.cleanup(unit_name="test-unit/0") +def test_cleanup_rmtree_permission_error( + mock_systemd: MagicMock, patched_paths: PatchedPaths, monkeypatch +): + """ + arrange: shutil.rmtree raises PermissionError. + act: Run cleanup. + assert: RunnerManagerApplicationStopError is raised. + """ + mock_systemd.service_running.return_value = False + monkeypatch.setattr("shutil.rmtree", MagicMock(side_effect=PermissionError("denied"))) + + with pytest.raises(manager_service.RunnerManagerApplicationStopError): + manager_service.cleanup(unit_name="test-unit/0") + + # 2026-01-19 Skip the mocks fixture to test the actual ensure_http_port_for_unit implementation. # The mocks fixture (see conftest.py) normally patches this function to return 55555. @pytest.mark.nomocks From 2503d33d9a2ae655a98ccb6d4fef57919a777635 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 13:51:58 +0200 Subject: [PATCH 4/7] fix(cleanup): use dedicated error message for cleanup failures Distinguish cleanup errors from stop errors in log output to make diagnosing unit removal failures easier. --- src/manager_service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/manager_service.py b/src/manager_service.py index 868939aafc..a0f5951dcf 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -40,6 +40,7 @@ _INSTALL_ERROR_MESSAGE = "Unable to install github-runner-manager package from source" _SERVICE_SETUP_ERROR_MESSAGE = "Unable to enable or start the github-runner-manager application" _SERVICE_STOP_ERROR_MESSAGE = "Unable to stop the github-runner-manager application" +_SERVICE_CLEANUP_ERROR_MESSAGE = "Unable to clean up the github-runner-manager service" logger = logging.getLogger(__name__) @@ -291,7 +292,7 @@ def cleanup(unit_name: str) -> None: service_file.unlink(missing_ok=True) systemd.daemon_reload() except (SystemdError, OSError) as err: - raise RunnerManagerApplicationStopError(_SERVICE_STOP_ERROR_MESSAGE) from err + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from err _remove_unit_data(unit_name) @@ -419,6 +420,6 @@ def _remove_unit_data(unit_name: str) -> None: except FileNotFoundError: pass except OSError as exc: - raise RunnerManagerApplicationStopError(_SERVICE_STOP_ERROR_MESSAGE) from exc + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from exc log_file = GITHUB_RUNNER_MANAGER_SERVICE_LOG_DIR / f"{normalized}.log" log_file.unlink(missing_ok=True) From afc09eb8adb693c63fd96f931543c8a493a40992 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 14:14:11 +0200 Subject: [PATCH 5/7] fix(cleanup): guard service_disable and wrap log file unlink Skip service_disable when the service file is already absent to ensure idempotent cleanup. Wrap log file unlink in OSError handling for consistent error reporting. --- src/manager_service.py | 16 ++++++++++------ tests/unit/test_manager_service.py | 11 +++++++---- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/manager_service.py b/src/manager_service.py index a0f5951dcf..588e47fb4c 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -283,12 +283,13 @@ def cleanup(unit_name: str) -> None: """ _stop(unit_name) instance_service = _instance_service_name(unit_name) + normalized = _normalized_unit(unit_name) + service_file = ( + SYSTEMD_SERVICE_PATH / f"{GITHUB_RUNNER_MANAGER_SERVICE_NAME}@{normalized}.service" + ) try: - systemd.service_disable(instance_service) - normalized = _normalized_unit(unit_name) - service_file = ( - SYSTEMD_SERVICE_PATH / f"{GITHUB_RUNNER_MANAGER_SERVICE_NAME}@{normalized}.service" - ) + if service_file.exists(): + systemd.service_disable(instance_service) service_file.unlink(missing_ok=True) systemd.daemon_reload() except (SystemdError, OSError) as err: @@ -422,4 +423,7 @@ def _remove_unit_data(unit_name: str) -> None: except OSError as exc: raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from exc log_file = GITHUB_RUNNER_MANAGER_SERVICE_LOG_DIR / f"{normalized}.log" - log_file.unlink(missing_ok=True) + try: + log_file.unlink(missing_ok=True) + except OSError as exc: + raise RunnerManagerApplicationStopError(_SERVICE_CLEANUP_ERROR_MESSAGE) from exc diff --git a/tests/unit/test_manager_service.py b/tests/unit/test_manager_service.py index 6c1de619d0..143f19b77e 100644 --- a/tests/unit/test_manager_service.py +++ b/tests/unit/test_manager_service.py @@ -229,25 +229,28 @@ def test_cleanup_idempotent_missing_files(mock_systemd: MagicMock, patched_paths """ arrange: Service is not running and no files exist on disk. act: Run cleanup. - assert: No errors raised; disable and daemon_reload still called. + assert: No errors raised; disable is skipped, daemon_reload still called. """ mock_systemd.service_running.return_value = False manager_service.cleanup(unit_name="test-unit/0") mock_systemd.service_stop.assert_not_called() - mock_systemd.service_disable.assert_called_once() + mock_systemd.service_disable.assert_not_called() mock_systemd.daemon_reload.assert_called_once() -def test_cleanup_systemd_error(mock_systemd: MagicMock): +def test_cleanup_systemd_error(mock_systemd: MagicMock, patched_paths: PatchedPaths): """ - arrange: systemd.service_disable raises SystemdError. + arrange: Service file exists and systemd.service_disable raises SystemdError. act: Run cleanup. assert: RunnerManagerApplicationStopError is raised. """ mock_systemd.service_running.return_value = False mock_systemd.service_disable.side_effect = SystemdError("Mock error") + service_file = patched_paths.systemd_service_path / "github-runner-manager@test-unit-0.service" + service_file.parent.mkdir(parents=True, exist_ok=True) + service_file.write_text("mock service", encoding="utf-8") with pytest.raises(manager_service.RunnerManagerApplicationStopError): manager_service.cleanup(unit_name="test-unit/0") From 9fb84adaf2e33707f239a90e280847e33c477ae2 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 14:23:12 +0200 Subject: [PATCH 6/7] refactor(cleanup): derive service file path from instance name Reuse instance_service to build the service file path, removing duplicated naming logic. Rename charm test to reflect cleanup behavior. --- src/manager_service.py | 5 +---- tests/unit/test_charm.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/manager_service.py b/src/manager_service.py index 588e47fb4c..65ce8fe44b 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -283,10 +283,7 @@ def cleanup(unit_name: str) -> None: """ _stop(unit_name) instance_service = _instance_service_name(unit_name) - normalized = _normalized_unit(unit_name) - service_file = ( - SYSTEMD_SERVICE_PATH / f"{GITHUB_RUNNER_MANAGER_SERVICE_NAME}@{normalized}.service" - ) + service_file = SYSTEMD_SERVICE_PATH / f"{instance_service}.service" try: if service_file.exists(): systemd.service_disable(instance_service) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 5543e91f67..b50f9d5636 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -307,11 +307,11 @@ def test__on_config_changed_no_flush(monkeypatch: pytest.MonkeyPatch, config_opt harness.charm._manager_client.flush_runner.assert_not_called() -def test_on_stop_busy_flush_and_stop_service(harness: Harness, monkeypatch: pytest.MonkeyPatch): +def test_on_stop_busy_flush_and_cleanup_service(harness: Harness, monkeypatch: pytest.MonkeyPatch): """ arrange: Set up charm with Openstack mode and runner scaler mock. act: Trigger stop event. - assert: Runner scaler mock flushes the runners using busy mode. + assert: Runners are flushed in busy mode and service cleanup is called. """ state_mock = MagicMock() harness.charm._setup_state = MagicMock(return_value=state_mock) From dbdb4bf99adf429b0bb379878b26da6b375879a2 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 30 Mar 2026 15:00:52 +0200 Subject: [PATCH 7/7] fix(cleanup): call service_disable unconditionally with non-fatal error Call service_disable regardless of service file presence to clean up dangling enablement symlinks from partial manual cleanup. Treat SystemdError from disable as non-fatal (log warning), matching the pattern in _disable_legacy_service(). --- src/manager_service.py | 6 ++++-- tests/unit/test_manager_service.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/manager_service.py b/src/manager_service.py index 65ce8fe44b..a5e2ddbd42 100644 --- a/src/manager_service.py +++ b/src/manager_service.py @@ -285,8 +285,10 @@ def cleanup(unit_name: str) -> None: instance_service = _instance_service_name(unit_name) service_file = SYSTEMD_SERVICE_PATH / f"{instance_service}.service" try: - if service_file.exists(): - systemd.service_disable(instance_service) + systemd.service_disable(instance_service) + except SystemdError: + logger.warning("Failed to disable %s, unit may already be absent", instance_service) + try: service_file.unlink(missing_ok=True) systemd.daemon_reload() except (SystemdError, OSError) as err: diff --git a/tests/unit/test_manager_service.py b/tests/unit/test_manager_service.py index 143f19b77e..dacfc23c61 100644 --- a/tests/unit/test_manager_service.py +++ b/tests/unit/test_manager_service.py @@ -229,28 +229,26 @@ def test_cleanup_idempotent_missing_files(mock_systemd: MagicMock, patched_paths """ arrange: Service is not running and no files exist on disk. act: Run cleanup. - assert: No errors raised; disable is skipped, daemon_reload still called. + assert: No errors raised; disable is attempted, daemon_reload still called. """ mock_systemd.service_running.return_value = False + mock_systemd.service_disable.side_effect = SystemdError("unit not found") manager_service.cleanup(unit_name="test-unit/0") mock_systemd.service_stop.assert_not_called() - mock_systemd.service_disable.assert_not_called() + mock_systemd.service_disable.assert_called_once() mock_systemd.daemon_reload.assert_called_once() -def test_cleanup_systemd_error(mock_systemd: MagicMock, patched_paths: PatchedPaths): +def test_cleanup_daemon_reload_error(mock_systemd: MagicMock): """ - arrange: Service file exists and systemd.service_disable raises SystemdError. + arrange: systemd.daemon_reload raises SystemdError. act: Run cleanup. assert: RunnerManagerApplicationStopError is raised. """ mock_systemd.service_running.return_value = False - mock_systemd.service_disable.side_effect = SystemdError("Mock error") - service_file = patched_paths.systemd_service_path / "github-runner-manager@test-unit-0.service" - service_file.parent.mkdir(parents=True, exist_ok=True) - service_file.write_text("mock service", encoding="utf-8") + mock_systemd.daemon_reload.side_effect = SystemdError("Mock error") with pytest.raises(manager_service.RunnerManagerApplicationStopError): manager_service.cleanup(unit_name="test-unit/0")