Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ This changelog documents user-relevant changes to the GitHub runner charm.

## 2026-03-20

- The pressure reconciler is now always used, replacing the legacy reconcile mode. When no planner relation is configured, it uses `base-virtual-machines` as static pressure to maintain the configured minimum runner count.
- HTTP server endpoints (`/runner/check`, `/runner/flush`) now use `RunnerManager` directly instead of the legacy `RunnerScaler`.
- Set GitHub API pagination page size to 100 (up from the PyGithub default of 30), reducing API calls when listing runners.

## 2026-03-18
Expand Down
2 changes: 1 addition & 1 deletion github-runner-manager/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

[project]
name = "github-runner-manager"
version = "0.15.2"
version = "0.16.0"
authors = [
{ name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" },
]
Expand Down
49 changes: 21 additions & 28 deletions github-runner-manager/src/github_runner_manager/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from github_runner_manager.manager.pressure_reconciler import (
PressureReconciler,
build_pressure_reconciler,
build_runner_manager,
)
from github_runner_manager.reconcile_service import start_reconcile_service
from github_runner_manager.thread_manager import ThreadManager

version = importlib.metadata.version("github-runner-manager")
Expand Down Expand Up @@ -94,19 +94,12 @@ def handle_shutdown(
default="INFO",
help="The log level for the application.",
)
@click.option(
"--python-path",
type=str,
required=False,
help="The PYTHONPATH to access the github-runner-manager library.",
)
# The entry point for the CLI will be tested with integration test.
def main( # pylint: disable=too-many-arguments, too-many-positional-arguments
def main(
config_file: TextIO,
host: str,
port: int,
debug: bool,
python_path: str | None,
log_level: str,
) -> None: # pragma: no cover
"""Start the reconcile service.
Expand All @@ -116,8 +109,10 @@ def main( # pylint: disable=too-many-arguments, too-many-positional-arguments
host: The hostname to listen on for the HTTP server
port: The port to listen on the HTTP server.
debug: Whether to start the application in debug mode.
python_path: PYTHONPATH to access the github-runner-manager library.
log_level: The log level.

Raises:
ClickException: If no non-reactive combinations are configured.
"""
logging.basicConfig(
level=log_level,
Expand All @@ -128,30 +123,28 @@ def main( # pylint: disable=too-many-arguments, too-many-positional-arguments
config = ApplicationConfiguration.from_yaml_file(StringIO(config_file.read()))
lock = Lock()

combinations = config.non_reactive_configuration.combinations
if not combinations:
raise click.ClickException("No non-reactive combinations configured.")
runner_manager = build_runner_manager(config, combinations[0])
pressure_reconciler = build_pressure_reconciler(config, runner_manager, lock)

thread_manager = ThreadManager()
http_server_args = FlaskArgs(host=host, port=port, debug=debug)
thread_manager.add_thread(
target=partial(start_http_server, config, lock, http_server_args),
target=partial(start_http_server, runner_manager, lock, http_server_args),
daemon=True,
)

if config.planner_url and config.planner_token:
pressure_reconciler = build_pressure_reconciler(config, lock)
shutdown = partial(
handle_shutdown,
pressure_reconciler=pressure_reconciler,
thread_manager=thread_manager,
)
signal.signal(signal.SIGTERM, shutdown)
signal.signal(signal.SIGINT, shutdown)
thread_manager.add_thread(target=pressure_reconciler.start_create_loop, daemon=True)
thread_manager.add_thread(target=pressure_reconciler.start_reconcile_loop, daemon=True)
# Legacy mode is still supported for deployments without planner config.
else:
thread_manager.add_thread(
target=partial(start_reconcile_service, config, python_path, lock),
daemon=True,
)
shutdown = partial(
handle_shutdown,
pressure_reconciler=pressure_reconciler,
thread_manager=thread_manager,
)
signal.signal(signal.SIGTERM, shutdown)
signal.signal(signal.SIGINT, shutdown)
thread_manager.add_thread(target=pressure_reconciler.start_create_loop, daemon=True)
thread_manager.add_thread(target=pressure_reconciler.start_reconcile_loop, daemon=True)

thread_manager.start()
thread_manager.raise_on_error()
41 changes: 20 additions & 21 deletions github-runner-manager/src/github_runner_manager/http_server.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
# Copyright 2026 Canonical Ltd.
# See LICENSE file for licensing details.

"""The HTTP server for github-runner-manager.
"""The HTTP server for github-runner-manager."""

The HTTP server for request to the github-runner-manager.
"""

import dataclasses
import json
from dataclasses import dataclass
from threading import Lock

from flask import Flask, request
from prometheus_client import generate_latest

from github_runner_manager.configuration import ApplicationConfiguration
from github_runner_manager.errors import CloudError, LockError
from github_runner_manager.manager.runner_manager import FlushMode
from github_runner_manager.reconcile_service import get_runner_scaler
from github_runner_manager.manager.runner_manager import FlushMode, RunnerManager

APP_CONFIG_NAME = "app_config"
OPENSTACK_CONFIG_NAME = "openstack_config"
RUNNER_MANAGER_CONFIG_NAME = "runner_manager"

app = Flask(__name__)

Expand All @@ -45,15 +38,23 @@ def check_runner() -> tuple[str, int]:
Returns:
Information on the runners in JSON format.
"""
app_config: ApplicationConfiguration = app.config[APP_CONFIG_NAME]
runner_manager: RunnerManager = app.config[RUNNER_MANAGER_CONFIG_NAME]
app.logger.info("Checking runners...")
runner_scaler = get_runner_scaler(app_config)
try:
runner_info = runner_scaler.get_runner_info()
runner_info = runner_manager.get_runner_info()
except CloudError as err:
app.logger.exception("Cloud error encountered while getting runner info")
return (str(err), 500)
return (json.dumps(dataclasses.asdict(runner_info)), 200)

response = {
"online": runner_info.online,
"busy": runner_info.busy,
"offline": runner_info.offline,
"unknown": runner_info.unknown,
"runners": list(runner_info.runners),
"busy_runners": list(runner_info.busy_runners),
}
return (json.dumps(response), 200)


@app.route("/runner/flush", methods=["POST"])
Expand All @@ -66,7 +67,7 @@ def flush_runner() -> tuple[str, int]:
Returns:
A empty response.
"""
app_config = app.config[APP_CONFIG_NAME]
runner_manager: RunnerManager = app.config[RUNNER_MANAGER_CONFIG_NAME]

flush_busy_str = request.args.get("flush-busy")
flush_busy = False
Expand All @@ -76,15 +77,13 @@ def flush_runner() -> tuple[str, int]:
lock = _get_lock()
with lock:
app.logger.info("Flushing runners...")
runner_scaler = get_runner_scaler(app_config)
app.logger.info("Flushing busy: %s", flush_busy)
flush_mode = FlushMode.FLUSH_BUSY if flush_busy else FlushMode.FLUSH_IDLE
try:
num_flushed = runner_scaler.flush(flush_mode)
runner_manager.flush_runners(flush_mode)
except CloudError as err:
app.logger.exception("Cloud error encountered while flushing runners")
return (str(err), 500)
app.logger.info("Flushed %s runners", num_flushed)
return ("", 204)


Expand Down Expand Up @@ -130,22 +129,22 @@ class FlaskArgs:


def start_http_server(
app_config: ApplicationConfiguration,
runner_manager: RunnerManager,
lock: Lock,
flask_args: FlaskArgs,
) -> None:
"""Start the HTTP server for interacting with the github-runner-manager service.

Args:
app_config: The application configuration.
runner_manager: The runner manager for managing runners.
lock: The lock representing modification access to the managed set of runners.
flask_args: The arguments for the flask HTTP server.
"""
app.logger.info("Starting the server...")
# The lock is passed from the caller, hence the need to update the global variable.
global _lock # pylint: disable=global-statement
_lock = lock
app.config[APP_CONFIG_NAME] = app_config
app.config[RUNNER_MANAGER_CONFIG_NAME] = runner_manager
app.run(
host=flask_args.host,
port=flask_args.port,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class PressureReconciler: # pylint: disable=too-few-public-methods,too-many-ins
def __init__(
self,
manager: RunnerManager,
planner_client: PlannerClient,
planner_client: PlannerClient | None,
config: PressureReconcilerConfig,
lock: Lock,
) -> None:
Expand All @@ -122,6 +122,7 @@ def __init__(
manager: Runner manager interface for creating, cleaning up,
and listing runners.
planner_client: Client used to stream pressure updates.
None when no planner relation is configured.
config: Reconciler configuration.
lock: Shared lock to serialize operations with other reconcile loops.
"""
Expand All @@ -140,6 +141,15 @@ def start_create_loop(self) -> None:
with self._lock:
self._runner_count = len(self._manager.get_runners())
logger.info("Create loop: initial sync, _runner_count=%s", self._runner_count)
if self._planner is None:
self._last_pressure = self._config.min_pressure
logger.info(
"Create loop: no planner configured, using min_pressure=%s",
self._config.min_pressure,
)
self._handle_create_runners(self._config.min_pressure)
self._stop.wait()
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `_handle_timer_reconcile` that runs periodically will handle creating/deleting runners.

return
while not self._stop.is_set():
try:
for update in self._planner.stream_pressure(self._config.flavor_name):
Expand Down Expand Up @@ -394,31 +404,38 @@ def _desired_total_from_pressure(self, pressure: int) -> int:
return total


def build_pressure_reconciler(config: ApplicationConfiguration, lock: Lock) -> PressureReconciler:
def build_pressure_reconciler(
config: ApplicationConfiguration, manager: RunnerManager, lock: Lock
) -> PressureReconciler:
"""Construct a PressureReconciler from application configuration.

Args:
config: Application configuration.
manager: The runner manager to use for creating, cleaning up, and listing runners.
lock: Shared lock to serialize operations with other reconcile loops.

Raises:
ValueError: If no non-reactive combinations are configured.
ValueError: If planner configuration is partial (only one of URL/token set).

Returns:
A fully constructed PressureReconciler.
"""
combinations = config.non_reactive_configuration.combinations
if not combinations:
first = config.non_reactive_configuration.combinations[0]
planner_client: PlannerClient | None = None
has_url = bool(config.planner_url)
has_token = bool(config.planner_token)
if has_url != has_token:
raise ValueError(
"Cannot build PressureReconciler: no non-reactive combinations configured."
"Partial planner configuration: both planner_url and planner_token must be set"
" or both unset."
)
if has_url and has_token:
planner_client = PlannerClient(
PlannerConfiguration(base_url=config.planner_url, token=config.planner_token)
)
first = combinations[0]
manager = _build_runner_manager(config, first)
return PressureReconciler(
manager=manager,
planner_client=PlannerClient(
PlannerConfiguration(base_url=config.planner_url, token=config.planner_token)
),
planner_client=planner_client,
config=PressureReconcilerConfig(
flavor_name=config.name,
reconcile_interval=config.reconcile_interval,
Expand All @@ -429,7 +446,7 @@ def build_pressure_reconciler(config: ApplicationConfiguration, lock: Lock) -> P
)


def _build_runner_manager(
def build_runner_manager(
config: ApplicationConfiguration, combination: NonReactiveCombination
) -> RunnerManager:
"""Build a RunnerManager from application config and a flavor/image combination.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2026 Canonical Ltd.
# See LICENSE file for licensing details.

# RunnerInfo is duplicated in runner_scaler (legacy), which will be removed in a follow-up PR.
# pylint: disable=duplicate-code
"""Module for managing the GitHub self-hosted runners hosted on cloud instances."""

import copy
Expand Down Expand Up @@ -45,6 +47,27 @@
IssuedMetricEventsStats = dict[Type[metric_events.Event], int]


@dataclass(frozen=True)
class RunnerInfo:
    """Immutable summary of the runners, grouped by state.

    Attributes:
        online: How many runners are in the online state.
        busy: How many runners are in the busy state.
        offline: How many runners are in the offline state.
        unknown: How many runners are in an unknown state.
        runners: Names of the runners that are online.
        busy_runners: Names of the runners that are busy.
    """

    # Field order is part of the positional constructor signature; keep it stable.
    online: int
    busy: int
    offline: int
    unknown: int
    runners: tuple[str, ...]
    busy_runners: tuple[str, ...]


class FlushMode(Enum):
"""Strategy for flushing runners.

Expand Down Expand Up @@ -263,6 +286,42 @@ def get_runners(self) -> tuple[RunnerInstance, ...]:
for vm in vms
)

def get_runner_info(self) -> RunnerInfo:
    """Summarise the platform state of every runner returned by get_runners.

    Busy and idle runners both count as online; busy runners are additionally
    tallied on their own. Any state other than busy, idle or offline is
    reported as unknown.

    Returns:
        The per-state counts plus the names of the online and busy runners.
    """
    # Review note from the change discussion: runner_scaler will be removed in a
    # follow-up PR, so this aggregation lives in runner_manager.
    online_names: list[str] = []
    busy_names: list[str] = []
    offline_count = 0
    unknown_count = 0
    for instance in self.get_runners():
        state = instance.platform_state
        if state == PlatformRunnerState.BUSY:
            # A busy runner is also an online runner.
            online_names.append(instance.name)
            busy_names.append(instance.name)
        elif state == PlatformRunnerState.IDLE:
            online_names.append(instance.name)
        elif state == PlatformRunnerState.OFFLINE:
            offline_count += 1
        else:
            unknown_count += 1
    # Each online/busy runner contributes exactly one name, so the counts are
    # the lengths of the collected name lists.
    return RunnerInfo(
        online=len(online_names),
        busy=len(busy_names),
        offline=offline_count,
        unknown=unknown_count,
        runners=tuple(online_names),
        busy_runners=tuple(busy_names),
    )

def delete_runners(self, num: int) -> IssuedMetricEventsStats:
"""Delete up to `num` runners, preferring idle ones over busy.

Expand Down
Loading
Loading