From 6fdda6533ed8aac993c7a781821f5ff1016a32f2 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 20 Mar 2026 13:13:17 +0100 Subject: [PATCH 1/8] REVERT ME: disable other tests for efficiency --- .github/workflows/e2e_test.yaml | 48 -------- .github/workflows/integration_test.yaml | 113 +++++++++--------- .github/workflows/test.yaml | 13 -- .../workflows/test_github_runner_manager.yaml | 75 ------------ .github/workflows/tics.yaml | 34 ------ 5 files changed, 57 insertions(+), 226 deletions(-) delete mode 100644 .github/workflows/e2e_test.yaml delete mode 100644 .github/workflows/test.yaml delete mode 100644 .github/workflows/test_github_runner_manager.yaml delete mode 100644 .github/workflows/tics.yaml diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml deleted file mode 100644 index 2a0ac5a2ad..0000000000 --- a/.github/workflows/e2e_test.yaml +++ /dev/null @@ -1,48 +0,0 @@ -name: End-to-End tests - -on: - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - # test option values defined at test/conftest.py are passed on via repository secret - # INTEGRATION_TEST_ARGS to operator-workflows automatically. - openstack-integration-end-to-end-test: - name: end-to-end test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.6/stable - provider: lxd - test-tox-env: integration-juju3.6 - modules: '["test_e2e"]' - # INTEGRATION_TOKEN, OS_PASSWORD are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ - # mapping. See CONTRIBUTING.md for more details. 
- extra-arguments: | - -m=openstack \ - --log-format="%(asctime)s %(levelname)s %(message)s" \ - --path="${{ vars.INTEGRATION_TEST_PATH }}" \ - --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ - --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ - --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ - --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ - --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ - --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ - --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ - --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" - self-hosted-runner: true - self-hosted-runner-label: pfe-ci - - required_status_checks: - name: Required E2E Test Status Checks - runs-on: ubuntu-latest - needs: - - openstack-integration-end-to-end-test - if: always() && !cancelled() - timeout-minutes: 5 - steps: - - run: | - [ '${{ needs.openstack-integration-end-to-end-test.result }}' = 'success' ] || (echo openstack-integration-end-to-end-test failed && false) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 836f6ba4dd..b2bbc9ed8c 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -12,32 +12,31 @@ concurrency: cancel-in-progress: true jobs: - openstack-integration-tests-private-endpoint: - name: Integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - with: - juju-channel: 3.6/stable - provider: lxd - test-tox-env: integration-juju3.6 - modules: '["test_multi_unit_same_machine", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_upgrade"]' - # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ - # mapping. See CONTRIBUTING.md for more details. 
- extra-arguments: | - -m=openstack \ - --log-format="%(asctime)s %(levelname)s %(message)s" \ - --path="${{ vars.INTEGRATION_TEST_PATH }}" \ - --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ - --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ - --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ - --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ - --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ - --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ - --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ - --openstack-image-id="${{ vars.INTEGRATION_TEST_IMAGE_ID }}" \ - --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" - self-hosted-runner: true - self-hosted-runner-label: pfe-ci + # DIAGNOSTIC: Commented out to isolate test_prometheus_metrics hang. + # openstack-integration-tests-private-endpoint: + # name: Integration test using private-endpoint + # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + # secrets: inherit + # with: + # juju-channel: 3.6/stable + # provider: lxd + # test-tox-env: integration-juju3.6 + # modules: '["test_multi_unit_same_machine", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_upgrade"]' + # extra-arguments: | + # -m=openstack \ + # --log-format="%(asctime)s %(levelname)s %(message)s" \ + # --path="${{ vars.INTEGRATION_TEST_PATH }}" \ + # --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ + # --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ + # --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ + # --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ + # --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ + # --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ + # --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ + # 
--openstack-image-id="${{ vars.INTEGRATION_TEST_IMAGE_ID }}" \ + # --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" + # self-hosted-runner: true + # self-hosted-runner-label: pfe-ci openstack-integration-tests-cross-controller-private-endpoint: name: Cross controller integration test using private-endpoint uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main @@ -65,38 +64,40 @@ jobs: --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" self-hosted-runner: true self-hosted-runner-label: pfe-ci - openstack-integration-tests-runner-bases: - name: Runner tests across bases - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main - secrets: inherit - strategy: - matrix: - base: ["22.04", "24.04"] - with: - juju-channel: 3.6/stable - provider: lxd - test-tox-env: integration-juju3.6 - modules: '["test_charm_runner"]' - extra-arguments: | - -m=openstack \ - --log-format="%(asctime)s %(levelname)s %(message)s" \ - --path="${{ vars.INTEGRATION_TEST_PATH }}" \ - --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ - --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ - --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ - --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ - --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ - --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ - --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ - --openstack-image-id="${{ vars.INTEGRATION_TEST_IMAGE_ID }}" \ - --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" \ - --base="${{ matrix.base }}" - self-hosted-runner: true - self-hosted-runner-label: pfe-ci + # DIAGNOSTIC: Commented out to isolate test_prometheus_metrics hang. 
+ # openstack-integration-tests-runner-bases: + # name: Runner tests across bases + # uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + # secrets: inherit + # strategy: + # matrix: + # base: ["22.04", "24.04"] + # with: + # juju-channel: 3.6/stable + # provider: lxd + # test-tox-env: integration-juju3.6 + # modules: '["test_charm_runner"]' + # extra-arguments: | + # -m=openstack \ + # --log-format="%(asctime)s %(levelname)s %(message)s" \ + # --path="${{ vars.INTEGRATION_TEST_PATH }}" \ + # --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ + # --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ + # --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ + # --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ + # --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ + # --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ + # --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ + # --openstack-image-id="${{ vars.INTEGRATION_TEST_IMAGE_ID }}" \ + # --dockerhub-mirror="${{ vars.INTEGRATION_TEST_DOCKERHUB_MIRROR }}" \ + # --base="${{ matrix.base }}" + # self-hosted-runner: true + # self-hosted-runner-label: pfe-ci allure-report: if: ${{ (success() || failure()) && github.event_name == 'schedule' }} needs: - - openstack-integration-tests-private-endpoint + # DIAGNOSTIC: Only cross-controller job active. 
+ # - openstack-integration-tests-private-endpoint - openstack-integration-tests-cross-controller-private-endpoint - - openstack-integration-tests-runner-bases + # - openstack-integration-tests-runner-bases uses: canonical/operator-workflows/.github/workflows/allure_report.yaml@main diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml deleted file mode 100644 index c88ff8b1f4..0000000000 --- a/.github/workflows/test.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: Tests - -on: - pull_request: - -jobs: - unit-tests: - uses: canonical/operator-workflows/.github/workflows/test.yaml@main - secrets: inherit - with: - self-hosted-runner: true - self-hosted-runner-label: edge - vale-style-check: false diff --git a/.github/workflows/test_github_runner_manager.yaml b/.github/workflows/test_github_runner_manager.yaml deleted file mode 100644 index cdbecc563f..0000000000 --- a/.github/workflows/test_github_runner_manager.yaml +++ /dev/null @@ -1,75 +0,0 @@ -name: Tests for github-runner-manager - -on: - pull_request: - -jobs: - application-unit-tests: - name: Unit tests for github-runner-manager - uses: canonical/operator-workflows/.github/workflows/test.yaml@main - secrets: inherit - with: - self-hosted-runner: true - self-hosted-runner-label: edge - working-directory: ./github-runner-manager/ - application-integration-tests: - name: Integration tests for github-runner-manager - runs-on: [self-hosted, pfe-ci] - strategy: - fail-fast: false - matrix: - test-module: - - test_debug_ssh - - test_metrics - - test_planner_runner - steps: - - name: Checkout code - uses: actions/checkout@v6.0.2 - - name: Setup Astral UV - uses: astral-sh/setup-uv@v7 - - name: Install tox with UV - run: uv tool install tox --with tox-uv - - name: configure DockerHub mirror - run: | - echo "{\"registry-mirrors\": [\"$DOCKERHUB_MIRROR\"]}" | sudo tee /etc/docker/daemon.json - sudo systemctl restart docker.service - - name: Run integration tests - ${{ matrix.test-module }} - 
working-directory: ./github-runner-manager/ - env: - # GitHub configuration - # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_1 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_1 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_2 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_2 }} - # OpenStack configuration - # OS_AUTH_URL, OS_PROJECT_DOMAIN_NAME, OS_PROJECT_NAME, OS_USER_DOMAIN_NAME, OS_USERNAME, OS_PASSWORD, OS_NETWORK, OS_REGION_NAME - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_3 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_3 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_4 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_4 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_5 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_5 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_6 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_6 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_7 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_7 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_8 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_8 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_9 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_9 }} - ${{ vars.INTEGRATION_TEST_SECRET_ENV_NAME_10 }}: ${{ secrets.INTEGRATION_TEST_SECRET_ENV_VALUE_10 }} - run: | - tox -e integration -- -v --tb=native -s \ - tests/integration/${{ matrix.test-module }}.py \ - --github-repository=${{ vars.INTEGRATION_TEST_PATH }} \ - --https-proxy="${{ vars.INTEGRATION_TEST_HTTPS_PROXY }}" \ - --http-proxy="${{ vars.INTEGRATION_TEST_HTTP_PROXY }}" \ - --no-proxy="${{ vars.INTEGRATION_TEST_NO_PROXY }}" \ - --openstack-https-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTPS_PROXY }}" \ - --openstack-http-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_HTTP_PROXY }}" \ - --openstack-no-proxy="${{ vars.INTEGRATION_TEST_OPENSTACK_NO_PROXY }}" \ - --openstack-flavor-name="${{ vars.INTEGRATION_TEST_OPENSTACK_FLAVOR_NAME }}" \ - --openstack-image-id="${{ 
vars.INTEGRATION_TEST_IMAGE_ID }}" \ - --debug-log-dir="/tmp/github-runner-manager-test-logs" - - name: Print debug logs - if: ${{ failure() }} - run: | - echo "Printing debug logs from /tmp/github-runner-manager-test-logs" - ls -l /tmp/github-runner-manager-test-logs - for log_file in /tmp/github-runner-manager-test-logs/*; do - echo "===== Contents of $log_file =====" - cat "$log_file" - echo "=================================" - done diff --git a/.github/workflows/tics.yaml b/.github/workflows/tics.yaml deleted file mode 100644 index 4dcd8081f9..0000000000 --- a/.github/workflows/tics.yaml +++ /dev/null @@ -1,34 +0,0 @@ -on: [pull_request] - -jobs: - TICS: - name: TICS - runs-on: [self-hosted, linux, amd64, tiobe, jammy] - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run Unit Tests (manager app) - run: | - cd github-runner-manager - pip install tox flake8 -r ./requirements.txt - tox -e unit,coverage-report - rm -rf .tox - - name: Run Unit Tests - run: | - pip install tox flake8 -r ./requirements.txt -r ./github-runner-manager/requirements.txt - tox -e unit,coverage-report - rm -rf .tox - - name: TICS GitHub Action - uses: tiobe/tics-github-action@v3 - with: - mode: qserver - project: github-runner-operator - branchdir: ${{ github.workspace }} - viewerUrl: https://canonical.tiobe.com/tiobeweb/TICS/api/cfg?name=default - ticsAuthToken: ${{ secrets.TICSAUTHTOKEN }} - installTics: true - -concurrency: - group: tics - cancel-in-progress: false From d6c344430e91312ffbf3622ad55af192b8090c65 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 20 Mar 2026 13:13:38 +0100 Subject: [PATCH 2/8] test: add diagnosis for prometheus metrics test --- tests/integration/test_prometheus_metrics.py | 82 ++++++++++++++++++-- tox.ini | 1 + 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_prometheus_metrics.py b/tests/integration/test_prometheus_metrics.py index db54795ad2..d85aff6c72 ---
a/tests/integration/test_prometheus_metrics.py +++ b/tests/integration/test_prometheus_metrics.py @@ -4,6 +4,9 @@ """Module for collecting metrics related to the reconciliation process.""" import logging +import subprocess +import threading +import time from typing import Any, Generator, cast import jubilant @@ -26,22 +29,87 @@ logger = logging.getLogger(__name__) +# DIAGNOSTIC: 40-minute timeout so hangs produce a traceback instead of running forever. +pytestmark = [pytest.mark.openstack, pytest.mark.timeout(2400)] + MICROK8S_CONTROLLER_NAME = "microk8s" COS_AGENT_CHARM = "opentelemetry-collector" +def _start_status_logger(model_name: str, interval: int = 30) -> threading.Thread: + """Spawn a daemon thread that polls `juju status` on a model every `interval` seconds.""" + + def _poll(): + """Poll juju status in a loop.""" + while True: + try: + result = subprocess.run( + ["juju", "status", "-m", model_name, "--format", "yaml"], + capture_output=True, + text=True, + timeout=30, + ) + logger.info( + "DIAGNOSTIC juju status -m %s (rc=%d):\n%s", + model_name, + result.returncode, + result.stdout[-2000:] if len(result.stdout) > 2000 else result.stdout, + ) + if result.stderr: + logger.warning("DIAGNOSTIC juju status stderr: %s", result.stderr[-500:]) + except Exception: + logger.exception("DIAGNOSTIC status poll failed for model %s", model_name) + time.sleep(interval) + + t = threading.Thread(target=_poll, daemon=True) + t.start() + return t + + +def _dump_debug_log(model_name: str) -> None: + """Capture the last 200 lines of juju debug-log for the model.""" + try: + result = subprocess.run( + ["juju", "debug-log", "-m", model_name, "--replay", "--limit", "200"], + capture_output=True, + text=True, + timeout=60, + ) + logger.info( + "DIAGNOSTIC debug-log -m %s (rc=%d):\n%s", + model_name, + result.returncode, + result.stdout, + ) + if result.stderr: + logger.warning("DIAGNOSTIC debug-log stderr: %s", result.stderr[-500:]) + except Exception: + 
logger.exception("DIAGNOSTIC debug-log dump failed for model %s", model_name) + + +def _timed_deploy(juju_obj: jubilant.Juju, label: str, *args: Any, **kwargs: Any) -> None: + """Wrap juju.deploy() with timing info.""" + logger.info("DIAGNOSTIC [%s] starting deploy: args=%s kwargs=%s", label, args, kwargs) + t0 = time.monotonic() + juju_obj.deploy(*args, **kwargs) + elapsed = time.monotonic() - t0 + logger.info("DIAGNOSTIC [%s] deploy completed in %.1fs", label, elapsed) + + @pytest_asyncio.fixture(scope="module", name="k8s_juju") def k8s_juju_fixture(request: pytest.FixtureRequest) -> Generator[jubilant.Juju, None, None]: """The machine model for K8s charms.""" keep_models = cast(bool, request.config.getoption("--keep-models")) with jubilant.temp_model(keep=keep_models, controller=MICROK8S_CONTROLLER_NAME) as juju: + _start_status_logger(juju.model) yield juju + _dump_debug_log(juju.model) @pytest.fixture(scope="module", name="prometheus_app") def prometheus_app_fixture(k8s_juju: jubilant.Juju): """Deploy prometheus charm.""" - k8s_juju.deploy("prometheus-k8s", channel="1/stable") + _timed_deploy(k8s_juju, "k8s/prometheus-k8s", "prometheus-k8s", channel="1/stable") k8s_juju.wait(lambda status: jubilant.all_active(status, "prometheus-k8s")) # k8s_juju.model and juju.model already has : prefixed. we must split them since # juju.consume expects only the model name. @@ -57,7 +125,7 @@ def prometheus_app_fixture(k8s_juju: jubilant.Juju): @pytest.fixture(scope="module", name="grafana_app") def grafana_app_fixture(k8s_juju: jubilant.Juju, prometheus_app: AppStatus): """Deploy prometheus charm.""" - k8s_juju.deploy("grafana-k8s", channel="1/stable") + _timed_deploy(k8s_juju, "k8s/grafana-k8s", "grafana-k8s", channel="1/stable") k8s_juju.integrate("grafana-k8s:grafana-source", f"{prometheus_app.charm_name}:grafana-source") k8s_juju.wait(lambda status: jubilant.all_active(status, "grafana-k8s", "prometheus-k8s")) # k8s_juju.model and juju.model already has : prefixed. 
we must split them since @@ -76,7 +144,7 @@ def traefik_ingress_fixture( k8s_juju: jubilant.Juju, prometheus_app: AppStatus, grafana_app: AppStatus ): """Ingress for cross controller communication.""" - k8s_juju.deploy("traefik-k8s", channel="latest/stable") + _timed_deploy(k8s_juju, "k8s/traefik-k8s", "traefik-k8s", channel="latest/stable") k8s_juju.integrate("traefik-k8s", f"{prometheus_app.charm_name}:ingress") k8s_juju.integrate("traefik-k8s", f"{grafana_app.charm_name}:ingress") @@ -92,7 +160,10 @@ def grafana_password_fixture(k8s_juju: jubilant.Juju, grafana_app: AppStatus): @pytest.fixture(scope="module", name="openstack_app_cos_agent") def openstack_app_cos_agent_fixture(juju: jubilant.Juju, app_openstack_runner: Application): """Deploy cos-agent subordinate charm on OpenStack runner application.""" - juju.deploy( + _start_status_logger(juju.model) + _timed_deploy( + juju, + "openstack/cos-agent", COS_AGENT_CHARM, channel="2/candidate", base="ubuntu@22.04", @@ -102,7 +173,8 @@ def openstack_app_cos_agent_fixture(juju: jubilant.Juju, app_openstack_runner: A juju.wait( lambda status: jubilant.all_agents_idle(status, app_openstack_runner.name, COS_AGENT_CHARM) ) - return app_openstack_runner + yield app_openstack_runner + _dump_debug_log(juju.model) @pytest.mark.usefixtures("traefik_ingress") diff --git a/tox.ini b/tox.ini index 4c67d72ed3..d00ba384d4 100644 --- a/tox.ini +++ b/tox.ini @@ -133,6 +133,7 @@ deps = pytest_httpserver websockets<14.0 # https://github.com/juju/python-libjuju/issues/1184 -r{toxinidir}/requirements.txt + pytest-timeout allure-pytest>=2.8.18 git+https://github.com/canonical/data-platform-workflows@v24.0.0\#subdirectory=python/pytest_plugins/allure_pytest_collection_report -r{[vars]tst_path}integration/requirements.txt From 5f8afd569881a0f7fe8e7b095a80803460c35753 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 20 Mar 2026 20:23:23 +0100 Subject: [PATCH 3/8] test: add alternative using subprocess --- 
.github/workflows/integration_test.yaml | 2 +- ...py => test_prometheus_metrics_original.py} | 0 .../test_prometheus_metrics_subprocess.py | 414 ++++++++++++++++++ 3 files changed, 415 insertions(+), 1 deletion(-) rename tests/integration/{test_prometheus_metrics.py => test_prometheus_metrics_original.py} (100%) create mode 100644 tests/integration/test_prometheus_metrics_subprocess.py diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index b2bbc9ed8c..8f11bdb7d8 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -46,7 +46,7 @@ jobs: pre-run-script: tests/integration/setup-integration-tests.sh provider: lxd test-tox-env: integration-juju3.6 - modules: '["test_prometheus_metrics"]' + modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_subprocess"]' # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ # mapping. See CONTRIBUTING.md for more details. extra-arguments: | diff --git a/tests/integration/test_prometheus_metrics.py b/tests/integration/test_prometheus_metrics_original.py similarity index 100% rename from tests/integration/test_prometheus_metrics.py rename to tests/integration/test_prometheus_metrics_original.py diff --git a/tests/integration/test_prometheus_metrics_subprocess.py b/tests/integration/test_prometheus_metrics_subprocess.py new file mode 100644 index 0000000000..350d15c437 --- /dev/null +++ b/tests/integration/test_prometheus_metrics_subprocess.py @@ -0,0 +1,414 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Prometheus metrics integration test — subprocess variant. + +Replaces jubilant CLI calls with direct subprocess calls that have hard timeouts +to work around jubilant#271 (_cli() can hang indefinitely when juju commands stall). +Compare behavior with the original test_prometheus_metrics.py to evaluate this approach. 
+""" + +import json +import logging +import subprocess +import time +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import Any, Generator, cast + +import jubilant +import pytest +import pytest_asyncio +import requests +from github.Branch import Branch +from github.Repository import Repository +from juju.application import Application +from tenacity import retry, stop_after_attempt, wait_exponential + +from charm_state import BASE_VIRTUAL_MACHINES_CONFIG_NAME +from tests.integration.helpers.common import ( + DISPATCH_TEST_WORKFLOW_FILENAME, + dispatch_workflow, + wait_for, +) +from tests.integration.helpers.openstack import OpenStackInstanceHelper + +logger = logging.getLogger(__name__) + +pytestmark = [pytest.mark.openstack, pytest.mark.timeout(2400)] + +MICROK8S_CONTROLLER_NAME = "microk8s" +COS_AGENT_CHARM = "opentelemetry-collector" +CLI_TIMEOUT = 300 +STATUS_TIMEOUT = 30 + + +# --------------------------------------------------------------------------- +# Subprocess helpers — every juju CLI call gets a hard subprocess timeout +# --------------------------------------------------------------------------- + + +def _juju_cli( + *args: str, model: str | None = None, timeout: int = CLI_TIMEOUT +) -> subprocess.CompletedProcess[str]: + """Run a juju CLI command with a hard subprocess timeout. + + Raises subprocess.TimeoutExpired if the command hangs. 
+ """ + cmd = ["juju", *args] + if model: + cmd.extend(["-m", model]) + logger.info("juju-cli: %s", " ".join(cmd)) + t0 = time.monotonic() + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=True) + logger.info("juju-cli: completed in %.1fs", time.monotonic() - t0) + if result.stderr: + logger.debug("juju-cli stderr: %s", result.stderr[:500]) + return result + + +def _juju_status(model: str, timeout: int = STATUS_TIMEOUT) -> dict[str, Any]: + """Fetch and parse juju status as JSON.""" + result = _juju_cli("status", "--format", "json", model=model, timeout=timeout) + return json.loads(result.stdout) + + +def _all_active(status: dict[str, Any], *app_names: str) -> bool: + """Check all named apps and their units have active workload status.""" + apps = status.get("applications", {}) + for name in app_names: + app = apps.get(name) + if not app: + return False + if app.get("application-status", {}).get("current") != "active": + return False + for unit in app.get("units", {}).values(): + if unit.get("workload-status", {}).get("current") != "active": + return False + return True + + +def _all_agents_idle(status: dict[str, Any], *app_names: str) -> bool: + """Check all agents (including subordinates) for named apps are idle.""" + apps = status.get("applications", {}) + for name in app_names: + app = apps.get(name) + if not app: + return False + for unit in app.get("units", {}).values(): + if unit.get("juju-status", {}).get("current") != "idle": + return False + for sub in unit.get("subordinates", {}).values(): + if sub.get("juju-status", {}).get("current") != "idle": + return False + return True + + +def _wait_for_status( + model: str, + check: Callable[[dict[str, Any]], bool], + *, + timeout: int = 600, + delay: int = 5, + successes_needed: int = 3, +) -> dict[str, Any]: + """Poll ``juju status`` until *check* returns True. 
+ + Each individual ``juju status`` call has its own subprocess timeout so a + single hung call cannot block the whole wait. + """ + deadline = time.monotonic() + timeout + consecutive = 0 + last_status: dict[str, Any] = {} + while time.monotonic() < deadline: + try: + last_status = _juju_status(model, timeout=STATUS_TIMEOUT) + if check(last_status): + consecutive += 1 + if consecutive >= successes_needed: + return last_status + else: + consecutive = 0 + except subprocess.TimeoutExpired: + logger.warning("juju status timed out for model %s, retrying", model) + consecutive = 0 + except subprocess.CalledProcessError as exc: + logger.warning( + "juju status failed for model %s (rc=%d): %s", + model, + exc.returncode, + (exc.stderr or "")[:300], + ) + consecutive = 0 + time.sleep(delay) + raise TimeoutError( + f"Condition not met within {timeout}s. " + f"Last status apps: {list(last_status.get('applications', {}).keys())}" + ) + + +# --------------------------------------------------------------------------- +# Lightweight status types (replace jubilant.statustypes) +# --------------------------------------------------------------------------- + + +@dataclass +class UnitInfo: + """Minimal unit info extracted from juju status JSON. + + Attributes: + address: The unit's IP address. + """ + + address: str + + +@dataclass +class AppInfo: + """Minimal app info extracted from juju status JSON. + + Attributes: + name: The application name. + charm_name: The charm name. + address: The application's IP address. + units: Mapping of unit name to UnitInfo. 
+ """ + + name: str + charm_name: str + address: str + units: dict[str, UnitInfo] = field(default_factory=dict) + + +def _app_info_from_status(status: dict[str, Any], app_name: str) -> AppInfo: + """Build an AppInfo from parsed juju status JSON.""" + app = status["applications"][app_name] + units: dict[str, UnitInfo] = {} + first_unit_addr = "" + for unit_name, unit_data in app.get("units", {}).items(): + addr = unit_data.get("address", "") + units[unit_name] = UnitInfo(address=addr) + if not first_unit_addr: + first_unit_addr = addr + return AppInfo( + name=app_name, + charm_name=app.get("charm-name", app_name), + address=app.get("address", first_unit_addr), + units=units, + ) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest_asyncio.fixture(scope="module", name="k8s_juju") +def k8s_juju_fixture(request: pytest.FixtureRequest) -> Generator[jubilant.Juju, None, None]: + """Temporary K8s model — uses jubilant only for model lifecycle.""" + keep_models = cast(bool, request.config.getoption("--keep-models")) + with jubilant.temp_model(keep=keep_models, controller=MICROK8S_CONTROLLER_NAME) as juju: + yield juju + + +@pytest.fixture(scope="module", name="k8s_model") +def k8s_model_fixture(k8s_juju: jubilant.Juju) -> str: + """K8s model name in ``controller:model`` format for subprocess calls.""" + return k8s_juju.model + + +@pytest.fixture(scope="module", name="prometheus_app") +def prometheus_app_fixture(k8s_model: str) -> AppInfo: + """Deploy prometheus-k8s via subprocess.""" + _juju_cli("deploy", "prometheus-k8s", "--channel", "1/stable", model=k8s_model) + status = _wait_for_status(k8s_model, lambda s: _all_active(s, "prometheus-k8s")) + k8s_model_name = k8s_model.split(":", 1)[1] + _juju_cli( + "offer", + f"{k8s_model_name}.prometheus-k8s:receive-remote-write", + "-c", + MICROK8S_CONTROLLER_NAME, + ) + return 
_app_info_from_status(status, "prometheus-k8s") + + +@pytest.fixture(scope="module", name="grafana_app") +def grafana_app_fixture(k8s_model: str, prometheus_app: AppInfo) -> AppInfo: + """Deploy grafana-k8s via subprocess.""" + _juju_cli("deploy", "grafana-k8s", "--channel", "1/stable", model=k8s_model) + _juju_cli( + "integrate", + "grafana-k8s:grafana-source", + f"{prometheus_app.charm_name}:grafana-source", + model=k8s_model, + ) + status = _wait_for_status(k8s_model, lambda s: _all_active(s, "grafana-k8s", "prometheus-k8s")) + k8s_model_name = k8s_model.split(":", 1)[1] + _juju_cli( + "offer", + f"{k8s_model_name}.grafana-k8s:grafana-dashboard", + "-c", + MICROK8S_CONTROLLER_NAME, + ) + return _app_info_from_status(status, "grafana-k8s") + + +@pytest.fixture(scope="module", name="traefik_ingress") +def traefik_ingress_fixture(k8s_model: str, prometheus_app: AppInfo, grafana_app: AppInfo) -> None: + """Deploy traefik ingress via subprocess.""" + _juju_cli("deploy", "traefik-k8s", "--channel", "latest/stable", model=k8s_model) + _juju_cli("integrate", "traefik-k8s", f"{prometheus_app.charm_name}:ingress", model=k8s_model) + _juju_cli("integrate", "traefik-k8s", f"{grafana_app.charm_name}:ingress", model=k8s_model) + + +@pytest.fixture(scope="module", name="grafana_password") +def grafana_password_fixture(k8s_model: str, grafana_app: AppInfo) -> str: + """Get Grafana admin password via juju run.""" + unit = next(iter(grafana_app.units.keys())) + result = _juju_cli("run", unit, "get-admin-password", "--format", "json", model=k8s_model) + data = json.loads(result.stdout) + return data[unit]["results"]["admin-password"] + + +@pytest.fixture(scope="module", name="openstack_app_cos_agent") +def openstack_app_cos_agent_fixture( + juju: jubilant.Juju, app_openstack_runner: Application +) -> Application: + """Deploy cos-agent via subprocess; return libjuju Application for async ops.""" + os_model = juju.model + _juju_cli( + "deploy", + COS_AGENT_CHARM, + "--channel", + 
"2/candidate", + "--base", + "ubuntu@22.04", + "--revision", + "149", + model=os_model, + ) + _juju_cli("integrate", app_openstack_runner.name, COS_AGENT_CHARM, model=os_model) + _wait_for_status( + os_model, + lambda s: _all_agents_idle(s, app_openstack_runner.name, COS_AGENT_CHARM), + ) + return app_openstack_runner + + +# --------------------------------------------------------------------------- +# Test +# --------------------------------------------------------------------------- + + +@pytest.mark.usefixtures("traefik_ingress") +@pytest.mark.openstack +async def test_prometheus_metrics_subprocess( + juju: jubilant.Juju, + k8s_model: str, + openstack_app_cos_agent: Application, + grafana_app: AppInfo, + grafana_password: str, + prometheus_app: AppInfo, + test_github_branch: Branch, + github_repository: Repository, + instance_helper: OpenStackInstanceHelper, +): + """Subprocess variant of test_prometheus_metrics. + + arrange: given a prometheus charm application. + act: when GitHub runner is integrated. + assert: the datasource is registered and basic metrics are available. 
+ """ + os_model = juju.model + k8s_model_name = k8s_model.split(":", 1)[1] + _juju_cli( + "consume", + f"{MICROK8S_CONTROLLER_NAME}:{k8s_model_name}.prometheus-k8s", + "prometheus-k8s", + model=os_model, + ) + _juju_cli( + "consume", + f"{MICROK8S_CONTROLLER_NAME}:{k8s_model_name}.grafana-k8s", + "grafana-k8s", + model=os_model, + ) + + _juju_cli("integrate", COS_AGENT_CHARM, "prometheus-k8s", model=os_model) + _juju_cli("integrate", COS_AGENT_CHARM, "grafana-k8s", model=os_model) + _wait_for_status( + os_model, + lambda s: _all_agents_idle(s, openstack_app_cos_agent.name, COS_AGENT_CHARM), + ) + + grafana_ip = grafana_app.units["grafana-k8s/0"].address + _patiently_wait_for_prometheus_datasource( + grafana_ip=grafana_ip, grafana_password=grafana_password + ) + + await instance_helper.ensure_charm_has_runner(openstack_app_cos_agent) + await dispatch_workflow( + app=openstack_app_cos_agent, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_TEST_WORKFLOW_FILENAME, + ) + # Set the number of virtual machines to 0 to speedup reconciliation + await openstack_app_cos_agent.set_config({BASE_VIRTUAL_MACHINES_CONFIG_NAME: "0"}) + + async def _no_runners() -> bool: + """Check that no runners are active.""" + action = await openstack_app_cos_agent.units[0].run_action("check-runners") + await action.wait() + return ( + action.status == "completed" + and action.results["online"] == "0" + and action.results["offline"] == "0" + and action.results["unknown"] == "0" + ) + + await wait_for(_no_runners, timeout=10 * 60, check_interval=10) + + prometheus_ip = prometheus_app.address + _patiently_wait_for_prometheus_metrics( + prometheus_ip, + "openstack_http_requests_total", + "reconcile_duration_seconds_sum", + "expected_runners_count", + "busy_runners_count", + "idle_runners_count", + "runner_spawn_duration_seconds_bucket", + "runner_idle_duration_seconds_bucket", + "runner_queue_duration_seconds_bucket", + 
"deleted_runners_total", + "delete_runner_duration_seconds_bucket", + "deleted_vms_total", + "delete_vm_duration_seconds_bucket", + "job_duration_seconds_bucket", + "job_status_count", + "job_event_count", + ) + + +@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, max=60), reraise=True) +def _patiently_wait_for_prometheus_datasource(grafana_ip: str, grafana_password: str): + """Wait for prometheus datasource to come up.""" + response = requests.get(f"http://admin:{grafana_password}@{grafana_ip}:3000/api/datasources") + response.raise_for_status() + datasources: list[dict[str, Any]] = response.json() + assert any(datasource["type"] == "prometheus" for datasource in datasources) + + +@retry( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=2, min=10, max=60), reraise=True +) +def _patiently_wait_for_prometheus_metrics(prometheus_ip: str, *metric_names: str): + """Wait for the prometheus metrics to be available.""" + for metric_name in metric_names: + response = requests.get( + f"http://{prometheus_ip}:9090/api/v1/series", params={"match[]": metric_name} + ) + response.raise_for_status() + query_result = response.json()["data"] + assert len(query_result), f"No data found for metric: {metric_name}" From 957b844014498f1b50a30bae4b981f4abd56781f Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 20 Mar 2026 20:28:26 +0100 Subject: [PATCH 4/8] test: add test timeout of 60 minutes --- .github/workflows/integration_test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 8f11bdb7d8..50186a9a77 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -46,6 +46,7 @@ jobs: pre-run-script: tests/integration/setup-integration-tests.sh provider: lxd test-tox-env: integration-juju3.6 + test-timeout: 60 modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_subprocess"]' # 
INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ # mapping. See CONTRIBUTING.md for more details. From d5b482ee072264cc79f8b6ede979a80f1cf78fae Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 20 Mar 2026 22:08:23 +0100 Subject: [PATCH 5/8] test: use full jubilant path --- .github/workflows/integration_test.yaml | 4 +- .../test_prometheus_metrics_full_jubilant.py | 254 ++++++++++++++++++ 2 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 tests/integration/test_prometheus_metrics_full_jubilant.py diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 50186a9a77..d0c63358c9 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -46,8 +46,8 @@ jobs: pre-run-script: tests/integration/setup-integration-tests.sh provider: lxd test-tox-env: integration-juju3.6 - test-timeout: 60 - modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_subprocess"]' + test-timeout: 120 + modules: '["test_prometheus_metrics_full_jubilant"]' # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ # mapping. See CONTRIBUTING.md for more details. extra-arguments: | diff --git a/tests/integration/test_prometheus_metrics_full_jubilant.py b/tests/integration/test_prometheus_metrics_full_jubilant.py new file mode 100644 index 0000000000..5a34b84e46 --- /dev/null +++ b/tests/integration/test_prometheus_metrics_full_jubilant.py @@ -0,0 +1,254 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Prometheus metrics integration test — fully jubilant, no python-libjuju awaits. + +Uses jubilant for ALL juju operations (deploy, integrate, wait, config, run) +to avoid the python-libjuju AllWatcher hang after cross-controller integrations. 
+The conftest ``app_openstack_runner`` fixture still creates a libjuju Model in the +background, but this test never awaits on any libjuju object. +""" + +import logging +from typing import Any, Generator, cast + +import jubilant +import pytest +import pytest_asyncio +import requests +from github.Branch import Branch +from github.Repository import Repository +from jubilant.statustypes import AppStatus +from juju.application import Application +from tenacity import retry, stop_after_attempt, wait_exponential + +from charm_state import BASE_VIRTUAL_MACHINES_CONFIG_NAME +from tests.integration.helpers.common import ( + DISPATCH_TEST_WORKFLOW_FILENAME, + dispatch_workflow, +) + +logger = logging.getLogger(__name__) + +pytestmark = [pytest.mark.openstack, pytest.mark.timeout(2400)] + +MICROK8S_CONTROLLER_NAME = "microk8s" +COS_AGENT_CHARM = "opentelemetry-collector" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest_asyncio.fixture(scope="module", name="k8s_juju") +def k8s_juju_fixture(request: pytest.FixtureRequest) -> Generator[jubilant.Juju, None, None]: + """The machine model for K8s charms.""" + keep_models = cast(bool, request.config.getoption("--keep-models")) + with jubilant.temp_model(keep=keep_models, controller=MICROK8S_CONTROLLER_NAME) as juju: + yield juju + + +@pytest.fixture(scope="module", name="prometheus_app") +def prometheus_app_fixture(k8s_juju: jubilant.Juju) -> AppStatus: + """Deploy prometheus charm.""" + k8s_juju.deploy("prometheus-k8s", channel="1/stable") + k8s_juju.wait(lambda status: jubilant.all_active(status, "prometheus-k8s")) + k8s_juju_model_name = k8s_juju.model.split(":", 1)[1] + k8s_juju.offer( + f"{k8s_juju_model_name}.prometheus-k8s", + endpoint="receive-remote-write", + controller=MICROK8S_CONTROLLER_NAME, + ) + return k8s_juju.status().apps["prometheus-k8s"] + + +@pytest.fixture(scope="module", 
name="grafana_app") +def grafana_app_fixture(k8s_juju: jubilant.Juju, prometheus_app: AppStatus) -> AppStatus: + """Deploy grafana charm.""" + k8s_juju.deploy("grafana-k8s", channel="1/stable") + k8s_juju.integrate("grafana-k8s:grafana-source", f"{prometheus_app.charm_name}:grafana-source") + k8s_juju.wait(lambda status: jubilant.all_active(status, "grafana-k8s", "prometheus-k8s")) + k8s_juju_model_name = k8s_juju.model.split(":", 1)[1] + k8s_juju.offer( + f"{k8s_juju_model_name}.grafana-k8s", + endpoint="grafana-dashboard", + controller=MICROK8S_CONTROLLER_NAME, + ) + return k8s_juju.status().apps["grafana-k8s"] + + +@pytest.fixture(scope="module", name="traefik_ingress") +def traefik_ingress_fixture( + k8s_juju: jubilant.Juju, prometheus_app: AppStatus, grafana_app: AppStatus +) -> None: + """Ingress for cross controller communication.""" + k8s_juju.deploy("traefik-k8s", channel="latest/stable") + k8s_juju.integrate("traefik-k8s", f"{prometheus_app.charm_name}:ingress") + k8s_juju.integrate("traefik-k8s", f"{grafana_app.charm_name}:ingress") + + +@pytest.fixture(scope="module", name="grafana_password") +def grafana_password_fixture(k8s_juju: jubilant.Juju, grafana_app: AppStatus) -> str: + """Get Grafana dashboard password.""" + unit = next(iter(grafana_app.units.keys())) + result = k8s_juju.run(unit, "get-admin-password") + return result.results["admin-password"] + + +@pytest.fixture(scope="module", name="openstack_app_cos_agent") +def openstack_app_cos_agent_fixture(juju: jubilant.Juju, app_openstack_runner: Application) -> str: + """Deploy cos-agent subordinate charm. 
Return the app name as a string.""" + app_name = app_openstack_runner.name + juju.deploy( + COS_AGENT_CHARM, + channel="2/candidate", + base="ubuntu@22.04", + revision=149, + ) + juju.integrate(app_name, COS_AGENT_CHARM) + juju.wait(lambda status: jubilant.all_agents_idle(status, app_name, COS_AGENT_CHARM)) + return app_name + + +# --------------------------------------------------------------------------- +# Test +# --------------------------------------------------------------------------- + + +@pytest.mark.usefixtures("traefik_ingress") +@pytest.mark.openstack +async def test_prometheus_metrics_full_jubilant( + juju: jubilant.Juju, + k8s_juju: jubilant.Juju, + openstack_app_cos_agent: str, + grafana_app: AppStatus, + grafana_password: str, + prometheus_app: AppStatus, + test_github_branch: Branch, + github_repository: Repository, +): + """Fully jubilant variant — no python-libjuju awaits after fixture setup. + + arrange: given a prometheus charm application. + act: when GitHub runner is integrated. + assert: the datasource is registered and basic metrics are available. 
+ """ + app_name = openstack_app_cos_agent + k8s_juju_model_name = k8s_juju.model.split(":", 1)[1] + juju.consume( + f"{k8s_juju_model_name}.prometheus-k8s", + alias="prometheus-k8s", + controller=MICROK8S_CONTROLLER_NAME, + ) + juju.consume( + f"{k8s_juju_model_name}.grafana-k8s", + alias="grafana-k8s", + controller=MICROK8S_CONTROLLER_NAME, + ) + + juju.integrate(COS_AGENT_CHARM, "prometheus-k8s") + juju.integrate(COS_AGENT_CHARM, "grafana-k8s") + juju.wait(lambda status: jubilant.all_agents_idle(status, app_name, COS_AGENT_CHARM)) + + grafana_ip = grafana_app.units["grafana-k8s/0"].address + _patiently_wait_for_prometheus_datasource( + grafana_ip=grafana_ip, grafana_password=grafana_password + ) + + # --- Runner lifecycle via jubilant (no python-libjuju) --- + juju.config(app_name, values={BASE_VIRTUAL_MACHINES_CONFIG_NAME: "1"}) + _wait_for_runner_ready(juju, app_name) + + await dispatch_workflow( + app=None, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": app_name}, + ) + + juju.config(app_name, values={BASE_VIRTUAL_MACHINES_CONFIG_NAME: "0"}) + _wait_for_no_runners(juju, app_name) + + prometheus_ip = prometheus_app.address + _patiently_wait_for_prometheus_metrics( + prometheus_ip, + "openstack_http_requests_total", + "reconcile_duration_seconds_sum", + "expected_runners_count", + "busy_runners_count", + "idle_runners_count", + "runner_spawn_duration_seconds_bucket", + "runner_idle_duration_seconds_bucket", + "runner_queue_duration_seconds_bucket", + "deleted_runners_total", + "delete_runner_duration_seconds_bucket", + "deleted_vms_total", + "delete_vm_duration_seconds_bucket", + "job_duration_seconds_bucket", + "job_status_count", + "job_event_count", + ) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def 
_wait_for_runner_ready(juju: jubilant.Juju, app_name: str) -> None: + """Poll check-runners action until at least one runner is online.""" + unit = f"{app_name}/0" + for attempt in range(20): + result = juju.run(unit, "check-runners") + if result.status == "completed" and int(result.results["online"]) >= 1: + return + logger.info("Waiting for runner (attempt %d): online=%s", attempt, result.results) + import time + + time.sleep(30) + raise TimeoutError(f"Runner on {unit} never came online after 20 attempts") + + +def _wait_for_no_runners(juju: jubilant.Juju, app_name: str) -> None: + """Poll check-runners action until all runners are gone.""" + unit = f"{app_name}/0" + import time + + deadline = time.monotonic() + 10 * 60 + while time.monotonic() < deadline: + result = juju.run(unit, "check-runners") + if ( + result.status == "completed" + and result.results["online"] == "0" + and result.results["offline"] == "0" + and result.results["unknown"] == "0" + ): + return + time.sleep(10) + raise TimeoutError(f"Runners on {unit} still present after 10 minutes") + + +@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, max=60), reraise=True) +def _patiently_wait_for_prometheus_datasource(grafana_ip: str, grafana_password: str): + """Wait for prometheus datasource to come up.""" + response = requests.get(f"http://admin:{grafana_password}@{grafana_ip}:3000/api/datasources") + response.raise_for_status() + datasources: list[dict[str, Any]] = response.json() + assert any(datasource["type"] == "prometheus" for datasource in datasources) + + +@retry( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=2, min=10, max=60), reraise=True +) +def _patiently_wait_for_prometheus_metrics(prometheus_ip: str, *metric_names: str): + """Wait for the prometheus metrics to be available.""" + for metric_name in metric_names: + response = requests.get( + f"http://{prometheus_ip}:9090/api/v1/series", params={"match[]": metric_name} + ) + response.raise_for_status() 
+ query_result = response.json()["data"] + assert len(query_result), f"No data found for metric: {metric_name}" From dbc86b04c8f4874f736bc9a8fb21be9deac81165 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Sat, 21 Mar 2026 08:41:04 +0100 Subject: [PATCH 6/8] test: three variants at same time --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index d0c63358c9..2838681580 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -47,7 +47,7 @@ jobs: provider: lxd test-tox-env: integration-juju3.6 test-timeout: 120 - modules: '["test_prometheus_metrics_full_jubilant"]' + modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_subprocess", "test_prometheus_metrics_full_jubilant"]' # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ # mapping. See CONTRIBUTING.md for more details. extra-arguments: | From d7f6b97cf007d5e2cff4db635d411b8f33df3241 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Sat, 21 Mar 2026 18:37:43 +0100 Subject: [PATCH 7/8] Revert "test: add diagonosis for prometheus metrics test" This reverts commit d6c344430e91312ffbf3622ad55af192b8090c65. 
--- .../test_prometheus_metrics_original.py | 82 ++----------------- tox.ini | 1 - 2 files changed, 5 insertions(+), 78 deletions(-) diff --git a/tests/integration/test_prometheus_metrics_original.py b/tests/integration/test_prometheus_metrics_original.py index d85aff6c72..db54795ad2 100644 --- a/tests/integration/test_prometheus_metrics_original.py +++ b/tests/integration/test_prometheus_metrics_original.py @@ -4,9 +4,6 @@ """Module for collecting metrics related to the reconciliation process.""" import logging -import subprocess -import threading -import time from typing import Any, Generator, cast import jubilant @@ -29,87 +26,22 @@ logger = logging.getLogger(__name__) -# DIAGNOSTIC: 40-minute timeout so hangs produce a traceback instead of running forever. -pytestmark = [pytest.mark.openstack, pytest.mark.timeout(2400)] - MICROK8S_CONTROLLER_NAME = "microk8s" COS_AGENT_CHARM = "opentelemetry-collector" -def _start_status_logger(model_name: str, interval: int = 30) -> threading.Thread: - """Spawn a daemon thread that polls `juju status` on a model every `interval` seconds.""" - - def _poll(): - """Poll juju status in a loop.""" - while True: - try: - result = subprocess.run( - ["juju", "status", "-m", model_name, "--format", "yaml"], - capture_output=True, - text=True, - timeout=30, - ) - logger.info( - "DIAGNOSTIC juju status -m %s (rc=%d):\n%s", - model_name, - result.returncode, - result.stdout[-2000:] if len(result.stdout) > 2000 else result.stdout, - ) - if result.stderr: - logger.warning("DIAGNOSTIC juju status stderr: %s", result.stderr[-500:]) - except Exception: - logger.exception("DIAGNOSTIC status poll failed for model %s", model_name) - time.sleep(interval) - - t = threading.Thread(target=_poll, daemon=True) - t.start() - return t - - -def _dump_debug_log(model_name: str) -> None: - """Capture the last 200 lines of juju debug-log for the model.""" - try: - result = subprocess.run( - ["juju", "debug-log", "-m", model_name, "--replay", "--limit", 
"200"], - capture_output=True, - text=True, - timeout=60, - ) - logger.info( - "DIAGNOSTIC debug-log -m %s (rc=%d):\n%s", - model_name, - result.returncode, - result.stdout, - ) - if result.stderr: - logger.warning("DIAGNOSTIC debug-log stderr: %s", result.stderr[-500:]) - except Exception: - logger.exception("DIAGNOSTIC debug-log dump failed for model %s", model_name) - - -def _timed_deploy(juju_obj: jubilant.Juju, label: str, *args: Any, **kwargs: Any) -> None: - """Wrap juju.deploy() with timing info.""" - logger.info("DIAGNOSTIC [%s] starting deploy: args=%s kwargs=%s", label, args, kwargs) - t0 = time.monotonic() - juju_obj.deploy(*args, **kwargs) - elapsed = time.monotonic() - t0 - logger.info("DIAGNOSTIC [%s] deploy completed in %.1fs", label, elapsed) - - @pytest_asyncio.fixture(scope="module", name="k8s_juju") def k8s_juju_fixture(request: pytest.FixtureRequest) -> Generator[jubilant.Juju, None, None]: """The machine model for K8s charms.""" keep_models = cast(bool, request.config.getoption("--keep-models")) with jubilant.temp_model(keep=keep_models, controller=MICROK8S_CONTROLLER_NAME) as juju: - _start_status_logger(juju.model) yield juju - _dump_debug_log(juju.model) @pytest.fixture(scope="module", name="prometheus_app") def prometheus_app_fixture(k8s_juju: jubilant.Juju): """Deploy prometheus charm.""" - _timed_deploy(k8s_juju, "k8s/prometheus-k8s", "prometheus-k8s", channel="1/stable") + k8s_juju.deploy("prometheus-k8s", channel="1/stable") k8s_juju.wait(lambda status: jubilant.all_active(status, "prometheus-k8s")) # k8s_juju.model and juju.model already has : prefixed. we must split them since # juju.consume expects only the model name. 
@@ -125,7 +57,7 @@ def prometheus_app_fixture(k8s_juju: jubilant.Juju): @pytest.fixture(scope="module", name="grafana_app") def grafana_app_fixture(k8s_juju: jubilant.Juju, prometheus_app: AppStatus): """Deploy prometheus charm.""" - _timed_deploy(k8s_juju, "k8s/grafana-k8s", "grafana-k8s", channel="1/stable") + k8s_juju.deploy("grafana-k8s", channel="1/stable") k8s_juju.integrate("grafana-k8s:grafana-source", f"{prometheus_app.charm_name}:grafana-source") k8s_juju.wait(lambda status: jubilant.all_active(status, "grafana-k8s", "prometheus-k8s")) # k8s_juju.model and juju.model already has : prefixed. we must split them since @@ -144,7 +76,7 @@ def traefik_ingress_fixture( k8s_juju: jubilant.Juju, prometheus_app: AppStatus, grafana_app: AppStatus ): """Ingress for cross controller communication.""" - _timed_deploy(k8s_juju, "k8s/traefik-k8s", "traefik-k8s", channel="latest/stable") + k8s_juju.deploy("traefik-k8s", channel="latest/stable") k8s_juju.integrate("traefik-k8s", f"{prometheus_app.charm_name}:ingress") k8s_juju.integrate("traefik-k8s", f"{grafana_app.charm_name}:ingress") @@ -160,10 +92,7 @@ def grafana_password_fixture(k8s_juju: jubilant.Juju, grafana_app: AppStatus): @pytest.fixture(scope="module", name="openstack_app_cos_agent") def openstack_app_cos_agent_fixture(juju: jubilant.Juju, app_openstack_runner: Application): """Deploy cos-agent subordinate charm on OpenStack runner application.""" - _start_status_logger(juju.model) - _timed_deploy( - juju, - "openstack/cos-agent", + juju.deploy( COS_AGENT_CHARM, channel="2/candidate", base="ubuntu@22.04", @@ -173,8 +102,7 @@ def openstack_app_cos_agent_fixture(juju: jubilant.Juju, app_openstack_runner: A juju.wait( lambda status: jubilant.all_agents_idle(status, app_openstack_runner.name, COS_AGENT_CHARM) ) - yield app_openstack_runner - _dump_debug_log(juju.model) + return app_openstack_runner @pytest.mark.usefixtures("traefik_ingress") diff --git a/tox.ini b/tox.ini index d00ba384d4..4c67d72ed3 100644 --- 
a/tox.ini +++ b/tox.ini @@ -133,7 +133,6 @@ deps = pytest_httpserver websockets<14.0 # https://github.com/juju/python-libjuju/issues/1184 -r{toxinidir}/requirements.txt - pytest-timeout allure-pytest>=2.8.18 git+https://github.com/canonical/data-platform-workflows@v24.0.0\#subdirectory=python/pytest_plugins/allure_pytest_collection_report -r{[vars]tst_path}integration/requirements.txt From f12bb6c1dc3013bba33ed5df9838322ccf20eb09 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Sat, 21 Mar 2026 18:38:12 +0100 Subject: [PATCH 8/8] test: original vs full jubilant --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 2838681580..6350011890 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -47,7 +47,7 @@ jobs: provider: lxd test-tox-env: integration-juju3.6 test-timeout: 120 - modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_subprocess", "test_prometheus_metrics_full_jubilant"]' + modules: '["test_prometheus_metrics_original", "test_prometheus_metrics_full_jubilant"]' # INTEGRATION_TOKEN, INTEGRATION_TOKEN_ALT, OS_* are passed through INTEGRATION_TEST_SECRET_ENV_VALUE_ # mapping. See CONTRIBUTING.md for more details. extra-arguments: |