From 9d4600ed8c695df9f2431aa8b62e76ca9d518eeb Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Wed, 1 Jul 2026 16:53:18 +0200 Subject: [PATCH 01/18] Add ARC CE + SLURM integration test pipeline (GitLab CI) --- .gitlab-ci.yml | 89 +++++++++ README.md | 196 ++++++++++++++----- arc-ce-slurm-ci/.gitlab-ci.yml | 89 +++++++++ arc-ce-slurm-ci/README.md | 144 ++++++++++++++ arc-ce-slurm-ci/docker-compose.yml | 19 ++ arc-ce-slurm-ci/docker/Dockerfile | 78 ++++++++ arc-ce-slurm-ci/docker/arc-bootstrap.service | 13 ++ arc-ce-slurm-ci/docker/arc.conf | 63 ++++++ arc-ce-slurm-ci/docker/bootstrap.sh | 61 ++++++ arc-ce-slurm-ci/docker/cgroup.conf | 3 + arc-ce-slurm-ci/docker/healthcheck.sh | 11 ++ arc-ce-slurm-ci/docker/slurm.conf | 36 ++++ arc-ce-slurm-ci/test/job.xrsl | 11 ++ arc-ce-slurm-ci/test/run.sh | 7 + arc-ce-slurm-ci/test/run_integration_test.sh | 112 +++++++++++ docker-compose.yml | 19 ++ docker/Dockerfile | 78 ++++++++ docker/arc-bootstrap.service | 13 ++ docker/arc.conf | 63 ++++++ docker/bootstrap.sh | 61 ++++++ docker/cgroup.conf | 3 + docker/healthcheck.sh | 11 ++ docker/slurm.conf | 36 ++++ test/job.xrsl | 11 ++ test/run.sh | 7 + test/run_integration_test.sh | 112 +++++++++++ 26 files changed, 1294 insertions(+), 52 deletions(-) create mode 100644 .gitlab-ci.yml create mode 100644 arc-ce-slurm-ci/.gitlab-ci.yml create mode 100644 arc-ce-slurm-ci/README.md create mode 100644 arc-ce-slurm-ci/docker-compose.yml create mode 100644 arc-ce-slurm-ci/docker/Dockerfile create mode 100644 arc-ce-slurm-ci/docker/arc-bootstrap.service create mode 100644 arc-ce-slurm-ci/docker/arc.conf create mode 100644 arc-ce-slurm-ci/docker/bootstrap.sh create mode 100644 arc-ce-slurm-ci/docker/cgroup.conf create mode 100644 arc-ce-slurm-ci/docker/healthcheck.sh create mode 100644 arc-ce-slurm-ci/docker/slurm.conf create mode 100644 arc-ce-slurm-ci/test/job.xrsl create mode 100644 arc-ce-slurm-ci/test/run.sh create mode 100644 arc-ce-slurm-ci/test/run_integration_test.sh create mode 
100644 docker-compose.yml create mode 100644 docker/Dockerfile create mode 100644 docker/arc-bootstrap.service create mode 100644 docker/arc.conf create mode 100644 docker/bootstrap.sh create mode 100644 docker/cgroup.conf create mode 100644 docker/healthcheck.sh create mode 100644 docker/slurm.conf create mode 100644 test/job.xrsl create mode 100644 test/run.sh create mode 100644 test/run_integration_test.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..7b71018 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,89 @@ +stages: + - build + - test + +variables: + # Standard docker-in-docker setup (see explanation in the README) + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" + DOCKER_DRIVER: overlay2 + IMAGE_NAME: "${CI_REGISTRY_IMAGE}/arc-ce-slurm-test" + IMAGE_TAG: "${CI_COMMIT_SHORT_SHA}" + CONTAINER_NAME: arc-ce-slurm-test + +# ----------------------------------------------------------------------- +# Build the ARC CE + SLURM image and hand it to the test job as an artifact +# (avoids needing a registry push/pull round-trip just for CI). +# ----------------------------------------------------------------------- +build_image: + stage: build + image: docker:26 + services: + - docker:26-dind + script: + - docker build -t "${IMAGE_NAME}:${IMAGE_TAG}" -f docker/Dockerfile docker/ + - docker save "${IMAGE_NAME}:${IMAGE_TAG}" -o image.tar + artifacts: + paths: + - image.tar + expire_in: 1 hour + +# ----------------------------------------------------------------------- +# Spin the image up as a real (privileged) container, wait for the +# HEALTHCHECK to go green, then drive submit -> monitor -> retrieve +# against it via `docker exec`. +# +# NOTE: this needs a GitLab Runner whose executor is allowed to run +# privileged containers, i.e. 
in the runner's config.toml: +# +# [runners.docker] +# privileged = true +# +# --privileged is required here because the container runs systemd as +# PID 1 (which arcctl / SLURM's own unit files expect), and systemd +# needs to manage cgroups. +# ----------------------------------------------------------------------- +integration_test: + stage: test + image: docker:26 + services: + - docker:26-dind + needs: + - build_image + before_script: + - docker load -i image.tar + script: + - docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:${IMAGE_TAG}" + - echo "Waiting for ARC CE + SLURM health check..." + - | + status="starting" + for i in $(seq 1 60); do + status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting") + echo " [$i/60] health=${status}" + [ "${status}" = "healthy" ] && break + sleep 5 + done + if [ "${status}" != "healthy" ]; then + echo "Container never became healthy, dumping logs:" + docker logs "${CONTAINER_NAME}" || true + docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true + exit 1 + fi + - docker cp test/. 
"${CONTAINER_NAME}:/opt/arc-test/" + - docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test + - docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh + - docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh + after_script: + - docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true + - docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true + - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true + - docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true + - docker rm -f "${CONTAINER_NAME}" || true + artifacts: + when: always + paths: + - container-console.log + - arex.log + - arc-bootstrap.log + - slurmctld.log + expire_in: 1 week diff --git a/README.md b/README.md index d6c977f..5983cc6 100644 --- a/README.md +++ b/README.md @@ -1,52 +1,144 @@ -

- interCEde -

- -

- Unified interfaces to Computing Elements and batch systems for DIRAC / DiracX and beyond —
- submit, monitor, retrieve — validated against containerized backends.
-

- -

- CI - PyPI - Python versions - License -

- ---- - -## What is interCEde? - -**interCEde** sits between Workload Management Systems, such as [DiracX](https://github.com/DIRACGrid/diracx) and the -many resources where jobs actually run. It provides a single, consistent interface for -talking to **Computing Elements (CEs)** and **batch systems** — submitting jobs, querying -their status, and retrieving their outputs — regardless of which backend is on the other -end. - -The name is the job description: the library *intercedes* on WMS' behalf, acting between -two parties so the rest of the stack never has to know whether it is talking to ARC, -HTCondor, Slurm over SSH, or a process on the local machine. - -Every interface ships with **integration tests that run against containerized instances** -of the real backends, so a given CE type is verified against multiple versions and -configurations rather than against a mock that drifts from reality. - -## Why it exists - -- **One contract, many backends.** Calling code submits a job the same way everywhere; the - backend-specific quirks live behind the interface. -- **Composable resources.** Backends combine — `SSH + Slurm`, `SSH + HTCondor`, - `ARC + HTCondor`, or a plain `local` runner — and each combination is just another - implementation of the same interface. -- **Tested against the real thing.** Containerized Slurm, HTCondor, and ARC instances are - spun up in CI so behavior is checked against actual schedulers, not stubs. -- **Version coverage.** The same test suite runs across a matrix of backend versions to - catch incompatibilities before they reach production. - -## Relationship to DIRAC / DiracX - -interCEde is part of the [DIRACGrid](https://github.com/DIRACGrid) ecosystem and is designed -to back the Computing Element layer used by [DiracX](https://github.com/DIRACGrid/diracx). -It can also be used standalone wherever a uniform interface to heterogeneous CEs and batch -systems is useful. 
+# ARC CE + SLURM integration test (GitLab CI) + +Spins up a single Docker container running a NorduGrid **ARC Compute +Element (ARC7)** wired to a single-node **SLURM** batch system, then +drives `arcsub` / `arcstat` / `arcget` against it to prove the whole +submit → monitor → retrieve path works end to end. Designed to run as +a GitLab CI pipeline (build stage + test stage), but also runnable +locally with `docker-compose`. + +## Layout + +``` +docker/ + Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd + slurm.conf single-node SLURM cluster config + cgroup.conf cgroups disabled (see note below) + arc.conf ARC CE config, LRMS=slurm, REST interface on :443 + bootstrap.sh one-shot startup script (systemd unit runs this) + arc-bootstrap.service systemd unit that runs bootstrap.sh at boot + healthcheck.sh Docker HEALTHCHECK / CI readiness probe +test/ + job.xrsl the test job description (xRSL) + run.sh payload script executed on the SLURM worker + run_integration_test.sh submit -> monitor -> retrieve driver script +.gitlab-ci.yml build_image + integration_test pipeline +docker-compose.yml local equivalent of the CI run +``` + +## How it fits together + +1. **Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, + and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, + `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables + `systemd` as PID 1 — this matters because ARC's own tooling + (`arcctl`) and the SLURM/munge packages ship real systemd unit + files, and re-using those is far more reliable than hand-rolling a + supervisor script. + +2. 
**Container start** (`arc-bootstrap.service`, ordered after + `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: + - waits until `munge` and `sinfo` actually work, + - (re)generates the ARC **Test-CA** and a **host certificate** bound + to the container's *runtime* hostname (`arcctl test-ca hostcert -n + $(hostname) -f`) — this can't be baked into the image at build + time because the build-time hostname is a random ID, not `arc-ce`, + - starts `arc-arex` / `arc-arex-ws` (`arcctl service start + --as-configured`), + - mints a Test-CA **client certificate** for `griduser01` + (`arcctl test-ca usercert --install-user griduser01 -f`), which + `arcctl` automatically whitelists in + `/etc/grid-security/testCA.allowed-subjects` — this is what makes + the CE's default "closed by default" `[authgroup: zero]` accept + that user, + - waits for the REST endpoint to answer and writes `/run/arc-ready`. + +3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` + once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint + responds — the CI job polls this instead of guessing a fixed sleep. + +4. **The test itself** (`test/run_integration_test.sh`, run as + `griduser01` inside the container via `docker exec`): + - `arcproxy` — generate a short-lived proxy from the Test-CA user cert + - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable + - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** + - poll `arcstat <jobid>` until `Finished` (or fail fast on + `Failed`/`Killed`) — **monitor** + - `arcget <jobid>` — **retrieve** `stdout.log` and `result.txt`, + then assert their contents + - `arcclean <jobid>` to tidy up + +## Why systemd + `--privileged` + +SLURM's daemons and ARC's `arcctl` assume a normal init system +(starting/stopping via `systemctl`, log rotation, etc). Running +`systemd` as PID 1 inside the container needs elevated privileges to +manage cgroups, so both the GitLab job and local `docker-compose` run +the container with `--privileged`. 
+ +**In GitLab, this means your Runner's `config.toml` must allow +privileged containers for the `docker:dind` service:** + +```toml +[[runners]] + executor = "docker" + [runners.docker] + privileged = true +``` + +If you can't get a privileged runner, the alternative is to drop +systemd entirely and hand-roll process supervision (e.g. `supervisord` +calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon +binary directly) — more portable, but you lose the packaged unit files +and have to reproduce their startup ordering/flags yourself. + +## Why `cgroup.conf` disables cgroups + +`TaskPlugin=task/none` and `ProctrackType=proctrack/linuxproc` in +`slurm.conf` avoid SLURM's cgroup-based process tracking, which +typically isn't usable inside a CI container even with `--privileged` +unless you also bind-mount the host's cgroup hierarchy. Fine for an +integration test that just proves the plumbing works; not +representative of production resource enforcement. + +## Running locally + +```bash +docker compose up --build -d +# watch it come up +docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test +# once "healthy": +docker cp test/. arc-ce-slurm-test:/opt/arc-test/ +docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test +docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh +docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +``` + +## Running in GitLab CI + +Just push this repo (or merge these files into yours) with +`.gitlab-ci.yml` at the root. The `build_image` stage builds and saves +the image as a job artifact; `integration_test` loads it, runs it +privileged, waits for the health check, executes the test script +inside the container, and archives ARC/SLURM logs as artifacts +regardless of pass/fail. + +## Things you'll likely want to change for a real environment + +- **Package versions**: this pins nothing beyond "ARC7 from EPEL on + EL9". 
For reproducible CI, pin `nordugrid-arc7-arex-<version>` etc. + explicitly, or build from the upstream NorduGrid repo instead of + EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). +- **Multi-container topology**: this is deliberately an all-in-one + container (CE + SLURM + client in one box) to keep the CI pipeline + simple. For something closer to production, split into an `arc-ce` + service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM + cluster), and a separate `client` container talking to the CE over + the Docker network, sharing a `munge.key` via a named volume. +- **Certificates**: this uses ARC's built-in Test-CA, which is exactly + what it's for (throwaway integration testing). Never use it for + anything reachable from outside your CI network. +- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal + smoke test. Extend them to cover whatever your real batch workloads + look like (multi-core requests, input/output staging from object + storage, RunTime Environments, etc). diff --git a/arc-ce-slurm-ci/.gitlab-ci.yml b/arc-ce-slurm-ci/.gitlab-ci.yml new file mode 100644 index 0000000..7b71018 --- /dev/null +++ b/arc-ce-slurm-ci/.gitlab-ci.yml @@ -0,0 +1,89 @@ +stages: + - build + - test + +variables: + # Standard docker-in-docker setup (see explanation in the README) + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" + DOCKER_DRIVER: overlay2 + IMAGE_NAME: "${CI_REGISTRY_IMAGE}/arc-ce-slurm-test" + IMAGE_TAG: "${CI_COMMIT_SHORT_SHA}" + CONTAINER_NAME: arc-ce-slurm-test + +# ----------------------------------------------------------------------- +# Build the ARC CE + SLURM image and hand it to the test job as an artifact +# (avoids needing a registry push/pull round-trip just for CI). 
+# ----------------------------------------------------------------------- +build_image: + stage: build + image: docker:26 + services: + - docker:26-dind + script: + - docker build -t "${IMAGE_NAME}:${IMAGE_TAG}" -f docker/Dockerfile docker/ + - docker save "${IMAGE_NAME}:${IMAGE_TAG}" -o image.tar + artifacts: + paths: + - image.tar + expire_in: 1 hour + +# ----------------------------------------------------------------------- +# Spin the image up as a real (privileged) container, wait for the +# HEALTHCHECK to go green, then drive submit -> monitor -> retrieve +# against it via `docker exec`. +# +# NOTE: this needs a GitLab Runner whose executor is allowed to run +# privileged containers, i.e. in the runner's config.toml: +# +# [runners.docker] +# privileged = true +# +# --privileged is required here because the container runs systemd as +# PID 1 (which arcctl / SLURM's own unit files expect), and systemd +# needs to manage cgroups. +# ----------------------------------------------------------------------- +integration_test: + stage: test + image: docker:26 + services: + - docker:26-dind + needs: + - build_image + before_script: + - docker load -i image.tar + script: + - docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:${IMAGE_TAG}" + - echo "Waiting for ARC CE + SLURM health check..." + - | + status="starting" + for i in $(seq 1 60); do + status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting") + echo " [$i/60] health=${status}" + [ "${status}" = "healthy" ] && break + sleep 5 + done + if [ "${status}" != "healthy" ]; then + echo "Container never became healthy, dumping logs:" + docker logs "${CONTAINER_NAME}" || true + docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true + exit 1 + fi + - docker cp test/. 
"${CONTAINER_NAME}:/opt/arc-test/" + - docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test + - docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh + - docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh + after_script: + - docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true + - docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true + - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true + - docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true + - docker rm -f "${CONTAINER_NAME}" || true + artifacts: + when: always + paths: + - container-console.log + - arex.log + - arc-bootstrap.log + - slurmctld.log + expire_in: 1 week diff --git a/arc-ce-slurm-ci/README.md b/arc-ce-slurm-ci/README.md new file mode 100644 index 0000000..5983cc6 --- /dev/null +++ b/arc-ce-slurm-ci/README.md @@ -0,0 +1,144 @@ +# ARC CE + SLURM integration test (GitLab CI) + +Spins up a single Docker container running a NorduGrid **ARC Compute +Element (ARC7)** wired to a single-node **SLURM** batch system, then +drives `arcsub` / `arcstat` / `arcget` against it to prove the whole +submit → monitor → retrieve path works end to end. Designed to run as +a GitLab CI pipeline (build stage + test stage), but also runnable +locally with `docker-compose`. 
+ +## Layout + +``` +docker/ + Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd + slurm.conf single-node SLURM cluster config + cgroup.conf cgroups disabled (see note below) + arc.conf ARC CE config, LRMS=slurm, REST interface on :443 + bootstrap.sh one-shot startup script (systemd unit runs this) + arc-bootstrap.service systemd unit that runs bootstrap.sh at boot + healthcheck.sh Docker HEALTHCHECK / CI readiness probe +test/ + job.xrsl the test job description (xRSL) + run.sh payload script executed on the SLURM worker + run_integration_test.sh submit -> monitor -> retrieve driver script +.gitlab-ci.yml build_image + integration_test pipeline +docker-compose.yml local equivalent of the CI run +``` + +## How it fits together + +1. **Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, + and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, + `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables + `systemd` as PID 1 — this matters because ARC's own tooling + (`arcctl`) and the SLURM/munge packages ship real systemd unit + files, and re-using those is far more reliable than hand-rolling a + supervisor script. + +2. 
**Container start** (`arc-bootstrap.service`, ordered after + `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: + - waits until `munge` and `sinfo` actually work, + - (re)generates the ARC **Test-CA** and a **host certificate** bound + to the container's *runtime* hostname (`arcctl test-ca hostcert -n + $(hostname) -f`) — this can't be baked into the image at build + time because the build-time hostname is a random ID, not `arc-ce`, + - starts `arc-arex` / `arc-arex-ws` (`arcctl service start + --as-configured`), + - mints a Test-CA **client certificate** for `griduser01` + (`arcctl test-ca usercert --install-user griduser01 -f`), which + `arcctl` automatically whitelists in + `/etc/grid-security/testCA.allowed-subjects` — this is what makes + the CE's default "closed by default" `[authgroup: zero]` accept + that user, + - waits for the REST endpoint to answer and writes `/run/arc-ready`. + +3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` + once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint + responds — the CI job polls this instead of guessing a fixed sleep. + +4. **The test itself** (`test/run_integration_test.sh`, run as + `griduser01` inside the container via `docker exec`): + - `arcproxy` — generate a short-lived proxy from the Test-CA user cert + - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable + - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** + - poll `arcstat <jobid>` until `Finished` (or fail fast on + `Failed`/`Killed`) — **monitor** + - `arcget <jobid>` — **retrieve** `stdout.log` and `result.txt`, + then assert their contents + - `arcclean <jobid>` to tidy up + +## Why systemd + `--privileged` + +SLURM's daemons and ARC's `arcctl` assume a normal init system +(starting/stopping via `systemctl`, log rotation, etc). Running +`systemd` as PID 1 inside the container needs elevated privileges to +manage cgroups, so both the GitLab job and local `docker-compose` run +the container with `--privileged`. 
+ +**In GitLab, this means your Runner's `config.toml` must allow +privileged containers for the `docker:dind` service:** + +```toml +[[runners]] + executor = "docker" + [runners.docker] + privileged = true +``` + +If you can't get a privileged runner, the alternative is to drop +systemd entirely and hand-roll process supervision (e.g. `supervisord` +calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon +binary directly) — more portable, but you lose the packaged unit files +and have to reproduce their startup ordering/flags yourself. + +## Why `cgroup.conf` disables cgroups + +`TaskPlugin=task/none` and `ProctrackType=proctrack/linuxproc` in +`slurm.conf` avoid SLURM's cgroup-based process tracking, which +typically isn't usable inside a CI container even with `--privileged` +unless you also bind-mount the host's cgroup hierarchy. Fine for an +integration test that just proves the plumbing works; not +representative of production resource enforcement. + +## Running locally + +```bash +docker compose up --build -d +# watch it come up +docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test +# once "healthy": +docker cp test/. arc-ce-slurm-test:/opt/arc-test/ +docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test +docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh +docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +``` + +## Running in GitLab CI + +Just push this repo (or merge these files into yours) with +`.gitlab-ci.yml` at the root. The `build_image` stage builds and saves +the image as a job artifact; `integration_test` loads it, runs it +privileged, waits for the health check, executes the test script +inside the container, and archives ARC/SLURM logs as artifacts +regardless of pass/fail. + +## Things you'll likely want to change for a real environment + +- **Package versions**: this pins nothing beyond "ARC7 from EPEL on + EL9". 
For reproducible CI, pin `nordugrid-arc7-arex-<version>` etc. + explicitly, or build from the upstream NorduGrid repo instead of + EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). +- **Multi-container topology**: this is deliberately an all-in-one + container (CE + SLURM + client in one box) to keep the CI pipeline + simple. For something closer to production, split into an `arc-ce` + service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM + cluster), and a separate `client` container talking to the CE over + the Docker network, sharing a `munge.key` via a named volume. +- **Certificates**: this uses ARC's built-in Test-CA, which is exactly + what it's for (throwaway integration testing). Never use it for + anything reachable from outside your CI network. +- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal + smoke test. Extend them to cover whatever your real batch workloads + look like (multi-core requests, input/output staging from object + storage, RunTime Environments, etc). 
diff --git a/arc-ce-slurm-ci/docker-compose.yml b/arc-ce-slurm-ci/docker-compose.yml new file mode 100644 index 0000000..3f6091f --- /dev/null +++ b/arc-ce-slurm-ci/docker-compose.yml @@ -0,0 +1,19 @@ +version: "3.8" + +services: + arc-ce: + build: + context: ./docker + dockerfile: Dockerfile + image: arc-ce-slurm-test:local + container_name: arc-ce-slurm-test + hostname: arc-ce + privileged: true # needed for systemd-as-PID1 (see README) + ports: + - "8443:443" + healthcheck: + test: ["CMD", "/usr/local/bin/healthcheck.sh"] + interval: 5s + timeout: 5s + start_period: 90s + retries: 30 diff --git a/arc-ce-slurm-ci/docker/Dockerfile b/arc-ce-slurm-ci/docker/Dockerfile new file mode 100644 index 0000000..1e97186 --- /dev/null +++ b/arc-ce-slurm-ci/docker/Dockerfile @@ -0,0 +1,78 @@ +# ============================================================================= +# ARC CE + SLURM "all-in-one" image for integration testing +# +# Runs, as real systemd-managed services (this is what arcctl/ARC packaging +# expects and is far more robust than re-implementing service supervision): +# - munge (SLURM auth) +# - slurmctld + slurmd (single-node SLURM cluster, one fake node) +# - arc-arex + arc-arex-ws (NorduGrid ARC CE, LRMS backend = slurm) +# +# Base: AlmaLinux 9. On EL9, ARC7 ships in EPEL as nordugrid-arc7-*. +# +# IMPORTANT: this container runs systemd as PID 1, which needs either +# docker run --privileged +# or (rootless-friendlier) +# docker run --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw +# See the .gitlab-ci.yml in this repo for the CI-side flags. 
+# =============================================================================
+FROM almalinux:9
+
+ENV container=docker
+
+# ---------------------------------------------------------------------------
+# systemd + EPEL + SLURM + munge + ARC CE + ARC client tools
+# ---------------------------------------------------------------------------
+RUN dnf -y install epel-release && \
+    dnf -y update && \
+    dnf -y install systemd crypto-policies-scripts && \
+    update-crypto-policies --set LEGACY && \
+    dnf -y install \
+        munge munge-libs \
+        slurm slurm-slurmctld slurm-slurmd \
+        openssl ca-certificates \
+        procps-ng iproute net-tools which curl jq \
+    && \
+    dnf -y install \
+        nordugrid-arc7-arex \
+        nordugrid-arc7-client \
+        nordugrid-arc7-arcctl \
+    && \
+    dnf clean all
+# standard "systemd in docker" cleanup: mask units that don't apply / fail in containers. Kept in its own RUN so the best-effort "; ... || true" chain cannot hide a failed dnf step above.
+RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \
+    rm -f /lib/systemd/system/multi-user.target.wants/* ; \
+    rm -f /etc/systemd/system/*.wants/* ; \
+    rm -f /lib/systemd/system/local-fs.target.wants/* ; \
+    rm -f /lib/systemd/system/sockets.target.wants/*udev* ; \
+    rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; \
+    rm -f /lib/systemd/system/basic.target.wants/* ; \
+    rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true
+
+# ---------------------------------------------------------------------------
+# Unprivileged pool account that grid jobs get mapped to + matching SLURM node
+# ---------------------------------------------------------------------------
+RUN useradd -m -s /bin/bash griduser01
+
+# ---------------------------------------------------------------------------
+# Config files
+# ---------------------------------------------------------------------------
+COPY slurm.conf /etc/slurm/slurm.conf
+COPY cgroup.conf /etc/slurm/cgroup.conf
+COPY arc.conf /etc/arc.conf
+
+COPY bootstrap.sh 
/usr/local/bin/bootstrap.sh +COPY healthcheck.sh /usr/local/bin/healthcheck.sh +COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service + +RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ + chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ + chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \ + systemctl enable munge slurmctld slurmd arc-bootstrap.service + +EXPOSE 443 + +HEALTHCHECK --interval=5s --timeout=5s --start-period=90s --retries=30 \ + CMD /usr/local/bin/healthcheck.sh + +STOPSIGNAL SIGRTMIN+3 +CMD ["/usr/sbin/init"] diff --git a/arc-ce-slurm-ci/docker/arc-bootstrap.service b/arc-ce-slurm-ci/docker/arc-bootstrap.service new file mode 100644 index 0000000..457b56f --- /dev/null +++ b/arc-ce-slurm-ci/docker/arc-bootstrap.service @@ -0,0 +1,13 @@ +[Unit] +Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert +After=network.target munge.service slurmctld.service slurmd.service +Wants=munge.service slurmctld.service slurmd.service +ConditionPathExists=!/run/arc-ready + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/local/bin/bootstrap.sh + +[Install] +WantedBy=multi-user.target diff --git a/arc-ce-slurm-ci/docker/arc.conf b/arc-ce-slurm-ci/docker/arc.conf new file mode 100644 index 0000000..8fc09aa --- /dev/null +++ b/arc-ce-slurm-ci/docker/arc.conf @@ -0,0 +1,63 @@ +# ============================================================================= +# /etc/arc.conf - ARC7 CE configuration for the integration-test container +# +# This intentionally overrides the packaged "zero configuration" so we have +# an explicit, reviewable setup. 
Cross-check option names against your
+# installed ARC version's reference doc if you bump versions:
+# /usr/share/doc/nordugrid-arc7-arex/arc.conf.reference
+# https://www.nordugrid.org/arc/arc7/admins/reference.html
+# =============================================================================
+
+[common]
+hostname = arc-ce
+x509_host_key = /etc/grid-security/hostkey.pem
+x509_host_cert = /etc/grid-security/hostcert.pem
+x509_cert_dir = /etc/grid-security/certificates
+
+# -----------------------------------------------------------------------
+# LRMS: point A-REX at the local single-node SLURM cluster
+# -----------------------------------------------------------------------
+[lrms]
+lrms = slurm
+slurm_use_sacct = yes
+slurm_wakeupperiod = 5
+
+[queue: main]
+comment = CI integration-test queue backed by local SLURM partition "main"
+
+# -----------------------------------------------------------------------
+# Authorization: accept anyone holding a cert signed by the ARC Test-CA
+# that arcctl generates at install time. The test client cert we mint in
+# bootstrap.sh (arcctl test-ca usercert) is auto-appended to
+# testCA.allowed-subjects, which is what the "zero" authgroup checks.
+# -----------------------------------------------------------------------
+[authgroup: zero]
+file = /etc/grid-security/testCA.allowed-subjects
+[mapping]
+# every request authorized via the "zero" authgroup runs as griduser01
+map_to_user = zero griduser01
+
+# -----------------------------------------------------------------------
+# A-REX core: where jobs' control/session data live
+# -----------------------------------------------------------------------
+[arex]
+user = root
+controldir = /var/spool/arc/jobstatus
+sessiondir = /var/spool/arc/sessiondir
+runtimedir = /usr/share/arc/rte
+delegationdb = sqlite
+
+[arex/ws]
+wsurl = https://arc-ce/arex
+allowaccess = zero
+
+[arex/ws/jobs]
+allowaccess = zero
+
+# Data staging can be minimal for a CI job that just echoes something,
+# but we enable it so xRSL inputfiles/outputfiles work if you extend the
+# test job later.
+[arex/data-staging]
+
+[arex/cache]
+cachedir = /var/spool/arc/cache
diff --git a/arc-ce-slurm-ci/docker/bootstrap.sh b/arc-ce-slurm-ci/docker/bootstrap.sh
new file mode 100644
index 0000000..02fc606
--- /dev/null
+++ b/arc-ce-slurm-ci/docker/bootstrap.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# =============================================================================
+# Runs once at container start (via arc-bootstrap.service, after munge,
+# slurmctld and slurmd units). Responsibilities:
+#   1. Wait for munge + SLURM to be actually usable
+#   2. (Re)generate the ARC Test-CA and a host certificate bound to this
+#      container's *runtime* hostname (image build time hostname is random,
+#      so we can't bake a valid host cert into the image itself)
+#   3. Start arc-arex / arc-arex-ws as configured in /etc/arc.conf
+#   4. Mint a Test-CA user certificate for griduser01, which arcctl
+#      automatically whitelists in /etc/grid-security/testCA.allowed-subjects
+#   5. 
Wait for the REST endpoint to answer, then signal readiness +# ============================================================================= +set -euo pipefail +LOG=/var/log/arc-bootstrap.log +exec > >(tee -a "$LOG") 2>&1 + +echo "== ARC CE / SLURM bootstrap starting: $(date -u) ==" + +HOSTNAME_FQDN="$(hostname)" +echo "Using hostname: ${HOSTNAME_FQDN}" + +wait_for() { + local desc="$1"; shift + local tries=0 + until "$@" >/dev/null 2>&1; do + tries=$((tries + 1)) + if [ "$tries" -gt 90 ]; then + echo "TIMED OUT waiting for: ${desc}" + return 1 + fi + sleep 2 + done + echo "${desc}: ready (after ${tries} tries)" +} + +# --- 1. munge, then SLURM control daemon ------------------------------------ +wait_for "munge" bash -c 'echo bootstrap-check | munge | unmunge' +wait_for "slurmctld (sinfo)" sinfo -h + +# --- 2. Test-CA + host certificate for the real runtime hostname ------------ +arcctl test-ca init -f +arcctl test-ca hostcert -n "${HOSTNAME_FQDN}" -f + +# --- 3. Start ARC CE services ------------------------------------------------- +arcctl service start --as-configured + +# --- 4. Test client certificate for griduser01 ------------------------------- +arcctl test-ca usercert --install-user griduser01 -f + +# Also export a portable tarball, useful if the GitLab job wants to drive +# arcsub/arcstat/arcget from *outside* this container (e.g. from the +# job's own shell talking to the CE over the docker network). +arcctl test-ca usercert -n griduser01 --export-tar -f || true +mv -f testcert-*.tar.gz /root/arc-test-client.tar.gz 2>/dev/null || true + +# --- 5. 
Wait until the REST endpoint actually answers ------------------------ +wait_for "arex REST endpoint" curl -sk "https://${HOSTNAME_FQDN}/arex/rest/1.0/info" + +touch /run/arc-ready +echo "== ARC CE / SLURM bootstrap complete: $(date -u) ==" diff --git a/arc-ce-slurm-ci/docker/cgroup.conf b/arc-ce-slurm-ci/docker/cgroup.conf new file mode 100644 index 0000000..4ca269d --- /dev/null +++ b/arc-ce-slurm-ci/docker/cgroup.conf @@ -0,0 +1,3 @@ +# cgroup constraints are disabled: see slurm.conf comment. +# Kept here only because slurmd/slurmctld expect the file to exist. +CgroupPlugin=disabled diff --git a/arc-ce-slurm-ci/docker/healthcheck.sh b/arc-ce-slurm-ci/docker/healthcheck.sh new file mode 100644 index 0000000..bc47b09 --- /dev/null +++ b/arc-ce-slurm-ci/docker/healthcheck.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Used by Dockerfile HEALTHCHECK and by the GitLab CI "wait for CE" step. +set -o pipefail + +[ -f /run/arc-ready ] || exit 1 + +sinfo -h >/dev/null 2>&1 || exit 1 + +curl -sk --max-time 3 -o /dev/null "https://$(hostname)/arex/rest/1.0/info" || exit 1 + +exit 0 diff --git a/arc-ce-slurm-ci/docker/slurm.conf b/arc-ce-slurm-ci/docker/slurm.conf new file mode 100644 index 0000000..23099c7 --- /dev/null +++ b/arc-ce-slurm-ci/docker/slurm.conf @@ -0,0 +1,36 @@ +# ============================================================================= +# slurm.conf - single-node SLURM cluster for CI/integration testing +# +# NOTE: TaskPlugin=task/none and ProctrackType=proctrack/linuxproc are used +# instead of the cgroup-based plugins because GitLab CI docker executors +# usually do NOT grant access to the host cgroup hierarchy needed by +# proctrack/cgroup or task/cgroup. If your runner is privileged and mounts +# /sys/fs/cgroup read-write, you can switch to the cgroup plugins for more +# realistic resource accounting. 
+# ============================================================================= +ClusterName=citest +SlurmctldHost=localhost + +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +SchedulerType=sched/backfill +ReturnToService=2 +MpiDefault=none + +# --- Node & partition: one fake compute node backing the whole cluster --- +NodeName=cnode01 NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN +PartitionName=main Nodes=cnode01 Default=YES MaxTime=INFINITE State=UP diff --git a/arc-ce-slurm-ci/test/job.xrsl b/arc-ce-slurm-ci/test/job.xrsl new file mode 100644 index 0000000..11d34c7 --- /dev/null +++ b/arc-ce-slurm-ci/test/job.xrsl @@ -0,0 +1,11 @@ +& +(executable = "/bin/sh") +(arguments = "run.sh") +(inputFiles = ("run.sh" "run.sh")) +(jobname = "ci-integration-test") +(stdout = "stdout.log") +(stderr = "stderr.log") +(outputFiles = ("result.txt" "")) +(queue = "main") +(walltime = "5") +(memory = "256") diff --git a/arc-ce-slurm-ci/test/run.sh b/arc-ce-slurm-ci/test/run.sh new file mode 100644 index 0000000..66f9bea --- /dev/null +++ b/arc-ce-slurm-ci/test/run.sh @@ -0,0 +1,7 @@ +#!/bin/sh +echo "Running on host: $(hostname)" +echo "Running as user: $(id -un)" +echo "SLURM_JOB_ID=${SLURM_JOB_ID:-unset}" +date +sleep 2 +echo "ok $(date -u +%FT%TZ)" > result.txt diff --git a/arc-ce-slurm-ci/test/run_integration_test.sh b/arc-ce-slurm-ci/test/run_integration_test.sh new file mode 100644 index 0000000..e4c5272 --- /dev/null +++ b/arc-ce-slurm-ci/test/run_integration_test.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# ============================================================================= 
+# Integration test: submit / monitor / retrieve a job through ARC CE, backed +# by SLURM. Meant to run *inside* the arc-ce-slurm container as griduser01 +# (that's who the test client cert + queue mapping point to), e.g.: +# +# docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +# +# Exit code 0 = pass, non-zero = fail (so GitLab CI can key off it directly). +# ============================================================================= +set -euo pipefail + +CE_HOST="$(hostname)" +CE_ENDPOINT="https://${CE_HOST}/arex" +JOB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKDIR="$(mktemp -d)" +OUTDIR="${WORKDIR}/output" +POLL_INTERVAL=3 +POLL_TIMEOUT=180 + +log() { echo "[$(date -u +%T)] $*"; } + +fail() { log "FAIL: $*"; exit 1; } + +trap 'log "cleaning up ${WORKDIR}"; rm -rf "${WORKDIR}"' EXIT + +cd "${WORKDIR}" +cp "${JOB_DIR}/job.xrsl" . +cp "${JOB_DIR}/run.sh" . + +# ----------------------------------------------------------------------- +# 0. Sanity: we need a proxy. arcproxy reads cert/key from ~/.globus by +# default, which is exactly where `arcctl test-ca usercert --install-user` +# put them during bootstrap. +# ----------------------------------------------------------------------- +log "Generating proxy certificate for $(id -un)" +arcproxy || fail "arcproxy failed - is ~/.globus/usercert.pem present?" + +log "Querying CE info endpoint: ${CE_ENDPOINT}" +arcinfo -C "${CE_ENDPOINT}" || fail "arcinfo could not reach ${CE_ENDPOINT}" + +# ----------------------------------------------------------------------- +# 1. 
SUBMIT +# ----------------------------------------------------------------------- +log "Submitting job.xrsl to ${CE_ENDPOINT}" +SUBMIT_OUTPUT="$(arcsub -C "${CE_ENDPOINT}" job.xrsl 2>&1)" || { + echo "${SUBMIT_OUTPUT}" + fail "arcsub did not succeed" +} +echo "${SUBMIT_OUTPUT}" + +JOB_ID="$(echo "${SUBMIT_OUTPUT}" | grep -oE 'https://[^ ]+/jobs/[A-Za-z0-9]+' | head -n1)" +[ -n "${JOB_ID}" ] || fail "could not parse job id out of arcsub output" +log "Job submitted: ${JOB_ID}" + +# ----------------------------------------------------------------------- +# 2. MONITOR +# ----------------------------------------------------------------------- +log "Polling job state (timeout ${POLL_TIMEOUT}s)" +elapsed=0 +STATE="" +while [ "${elapsed}" -lt "${POLL_TIMEOUT}" ]; do + STAT_OUTPUT="$(arcstat "${JOB_ID}" 2>&1 || true)" + STATE="$(echo "${STAT_OUTPUT}" | awk -F': ' '/State:/{print $2; exit}')" + log "state=${STATE:-unknown}" + case "${STATE}" in + Finished|FINISHED) + break + ;; + Failed|FAILED|Killed|KILLED|Deleted) + echo "${STAT_OUTPUT}" + fail "job entered terminal failure state: ${STATE}" + ;; + esac + sleep "${POLL_INTERVAL}" + elapsed=$((elapsed + POLL_INTERVAL)) +done + +[ "${STATE}" = "Finished" ] || [ "${STATE}" = "FINISHED" ] || { + arcstat "${JOB_ID}" || true + arcctl job log "$(basename "${JOB_ID}")" --service || true + fail "job did not reach Finished state within ${POLL_TIMEOUT}s (last state: ${STATE:-unknown})" +} +log "Job reached Finished state" + +# ----------------------------------------------------------------------- +# 3. 
RETRIEVE +# ----------------------------------------------------------------------- +mkdir -p "${OUTDIR}" +log "Retrieving output with arcget into ${OUTDIR}" +( cd "${OUTDIR}" && arcget "${JOB_ID}" ) || fail "arcget failed" + +RESULT_FILE="$(find "${OUTDIR}" -name result.txt | head -n1)" +STDOUT_FILE="$(find "${OUTDIR}" -name stdout.log | head -n1)" + +[ -n "${RESULT_FILE}" ] || fail "result.txt was not retrieved" +[ -n "${STDOUT_FILE}" ] || fail "stdout.log was not retrieved" + +grep -q '^ok ' "${RESULT_FILE}" || fail "result.txt did not contain expected content: $(cat "${RESULT_FILE}")" +grep -q 'Running on host' "${STDOUT_FILE}" || fail "stdout.log missing expected marker" + +log "Output content:" +cat "${STDOUT_FILE}" +cat "${RESULT_FILE}" + +# ----------------------------------------------------------------------- +# 4. Cleanup the job from A-REX bookkeeping (not strictly required, but +# keeps repeated CI runs tidy) +# ----------------------------------------------------------------------- +arcclean "${JOB_ID}" || log "warning: arcclean failed (non-fatal)" + +log "PASS: submit -> monitor -> retrieve integration test succeeded" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3f6091f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,19 @@ +version: "3.8" + +services: + arc-ce: + build: + context: ./docker + dockerfile: Dockerfile + image: arc-ce-slurm-test:local + container_name: arc-ce-slurm-test + hostname: arc-ce + privileged: true # needed for systemd-as-PID1 (see README) + ports: + - "8443:443" + healthcheck: + test: ["CMD", "/usr/local/bin/healthcheck.sh"] + interval: 5s + timeout: 5s + start_period: 90s + retries: 30 diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..1e97186 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,78 @@ +# ============================================================================= +# ARC CE + SLURM "all-in-one" image for integration testing +# +# 
Runs, as real systemd-managed services (this is what arcctl/ARC packaging +# expects and is far more robust than re-implementing service supervision): +# - munge (SLURM auth) +# - slurmctld + slurmd (single-node SLURM cluster, one fake node) +# - arc-arex + arc-arex-ws (NorduGrid ARC CE, LRMS backend = slurm) +# +# Base: AlmaLinux 9. On EL9, ARC7 ships in EPEL as nordugrid-arc7-*. +# +# IMPORTANT: this container runs systemd as PID 1, which needs either +# docker run --privileged +# or (rootless-friendlier) +# docker run --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw +# See the .gitlab-ci.yml in this repo for the CI-side flags. +# ============================================================================= +FROM almalinux:9 + +ENV container=docker + +# --------------------------------------------------------------------------- +# systemd + EPEL + SLURM + munge + ARC CE + ARC client tools +# --------------------------------------------------------------------------- +RUN dnf -y install epel-release && \ + dnf -y update && \ + dnf -y install systemd crypto-policies-scripts && \ + update-crypto-policies --set LEGACY && \ + dnf -y install \ + munge munge-libs \ + slurm slurm-slurmctld slurm-slurmd \ + openssl ca-certificates \ + procps-ng iproute net-tools which curl jq \ + && \ + dnf -y install \ + nordugrid-arc7-arex \ + nordugrid-arc7-client \ + nordugrid-arc7-arcctl \ + && \ + dnf clean all && \ + # standard "systemd in docker" cleanup: mask units that don't apply / fail in containers + (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ + rm -f /lib/systemd/system/multi-user.target.wants/* ; \ + rm -f /etc/systemd/system/*.wants/* ; \ + rm -f /lib/systemd/system/local-fs.target.wants/* ; \ + rm -f /lib/systemd/system/sockets.target.wants/*udev* ; \ + rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; \ + rm -f /lib/systemd/system/basic.target.wants/* ; \ + rm -f 
/lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true + +# --------------------------------------------------------------------------- +# Unprivileged pool account that grid jobs get mapped to + matching SLURM node +# --------------------------------------------------------------------------- +RUN useradd -m -s /bin/bash griduser01 + +# --------------------------------------------------------------------------- +# Config files +# --------------------------------------------------------------------------- +COPY slurm.conf /etc/slurm/slurm.conf +COPY cgroup.conf /etc/slurm/cgroup.conf +COPY arc.conf /etc/arc.conf + +COPY bootstrap.sh /usr/local/bin/bootstrap.sh +COPY healthcheck.sh /usr/local/bin/healthcheck.sh +COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service + +RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ + chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ + chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \ + systemctl enable munge slurmctld slurmd arc-bootstrap.service + +EXPOSE 443 + +HEALTHCHECK --interval=5s --timeout=5s --start-period=90s --retries=30 \ + CMD /usr/local/bin/healthcheck.sh + +STOPSIGNAL SIGRTMIN+3 +CMD ["/usr/sbin/init"] diff --git a/docker/arc-bootstrap.service b/docker/arc-bootstrap.service new file mode 100644 index 0000000..457b56f --- /dev/null +++ b/docker/arc-bootstrap.service @@ -0,0 +1,13 @@ +[Unit] +Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert +After=network.target munge.service slurmctld.service slurmd.service +Wants=munge.service slurmctld.service slurmd.service +ConditionPathExists=!/run/arc-ready + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/local/bin/bootstrap.sh + +[Install] +WantedBy=multi-user.target diff --git a/docker/arc.conf b/docker/arc.conf new file mode 100644 index 0000000..8fc09aa --- /dev/null +++ b/docker/arc.conf @@ -0,0 +1,63 @@ +# 
============================================================================= +# /etc/arc.conf - ARC7 CE configuration for the integration-test container +# +# This intentionally overrides the packaged "zero configuration" so we have +# an explicit, reviewable setup. Cross-check option names against your +# installed ARC version's reference doc if you bump versions: +# /usr/share/doc/nordugrid-arc7-arex/arc.conf.reference +# https://www.nordugrid.org/arc/arc7/admins/reference.html +# ============================================================================= + +[common] +hostname = arc-ce +x509_user_key = /etc/grid-security/hostkey.pem +x509_user_cert = /etc/grid-security/hostcert.pem +x509_cert_dir = /etc/grid-security/certificates + +# ----------------------------------------------------------------------- +# LRMS: point A-REX at the local single-node SLURM cluster +# ----------------------------------------------------------------------- +[lrms] +lrms = slurm +slurm_use_sacct = yes +slurm_wakeupperiod = 5 + +[queue: main] +comment = CI integration-test queue backed by local SLURM partition "main" + +# ----------------------------------------------------------------------- +# Authorization: accept anyone holding a cert signed by the ARC Test-CA +# that arcctl generates at install time. The test client cert we mint in +# bootstrap.sh (arcctl test-ca usercert) is auto-appended to +# testCA.allowed-subjects, which is what the "zero" authgroup checks.
+# ----------------------------------------------------------------------- +[authgroup: zero] + +[mapping] +# every request authorized via the "zero" authgroup runs as griduser01 +map_to_user = zero griduser01 + +# ----------------------------------------------------------------------- +# A-REX core: where jobs' control/session data live +# ----------------------------------------------------------------------- +[arex] +user = root +controldir = /var/spool/arc/jobstatus +sessiondir = /var/spool/arc/sessiondir +runtimedir = /usr/share/arc/rte +delegationdb = sqlite + +[arex/ws] +wsurl = https://arc-ce/arex +allowaccess = zero + +[arex/ws/jobs] +allowaccess = zero + +# Data staging can be minimal for a CI job that just echoes something, +# but we enable it so xRSL inputfiles/outputfiles work if you extend the +# test job later. +[arex/data-staging] + +[arex/cache] +cachedir = /var/spool/arc/cache diff --git a/docker/bootstrap.sh b/docker/bootstrap.sh new file mode 100644 index 0000000..02fc606 --- /dev/null +++ b/docker/bootstrap.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# ============================================================================= +# Runs once at container start (via arc-bootstrap.service, after munge, +# slurmctld and slurmd units). Responsibilities: +# 1. Wait for munge + SLURM to be actually usable +# 2. (Re)generate the ARC Test-CA and a host certificate bound to this +# container's *runtime* hostname (image build time hostname is random, +# so we can't bake a valid host cert into the image itself) +# 3. Start arc-arex / arc-arex-ws as configured in /etc/arc.conf +# 4. Mint a Test-CA user certificate for griduser01, which arcctl +# automatically whitelists in /etc/grid-security/testCA.allowed-subjects +# 5. 
Wait for the REST endpoint to answer, then signal readiness +# ============================================================================= +set -euo pipefail +LOG=/var/log/arc-bootstrap.log +exec > >(tee -a "$LOG") 2>&1 + +echo "== ARC CE / SLURM bootstrap starting: $(date -u) ==" + +HOSTNAME_FQDN="$(hostname)" +echo "Using hostname: ${HOSTNAME_FQDN}" + +wait_for() { + local desc="$1"; shift + local tries=0 + until "$@" >/dev/null 2>&1; do + tries=$((tries + 1)) + if [ "$tries" -gt 90 ]; then + echo "TIMED OUT waiting for: ${desc}" + return 1 + fi + sleep 2 + done + echo "${desc}: ready (after ${tries} tries)" +} + +# --- 1. munge, then SLURM control daemon ------------------------------------ +wait_for "munge" bash -c 'echo bootstrap-check | munge | unmunge' +wait_for "slurmctld (sinfo)" sinfo -h + +# --- 2. Test-CA + host certificate for the real runtime hostname ------------ +arcctl test-ca init -f +arcctl test-ca hostcert -n "${HOSTNAME_FQDN}" -f + +# --- 3. Start ARC CE services ------------------------------------------------- +arcctl service start --as-configured + +# --- 4. Test client certificate for griduser01 ------------------------------- +arcctl test-ca usercert --install-user griduser01 -f + +# Also export a portable tarball, useful if the GitLab job wants to drive +# arcsub/arcstat/arcget from *outside* this container (e.g. from the +# job's own shell talking to the CE over the docker network). +arcctl test-ca usercert -n griduser01 --export-tar -f || true +mv -f testcert-*.tar.gz /root/arc-test-client.tar.gz 2>/dev/null || true + +# --- 5. 
Wait until the REST endpoint actually answers ------------------------ +wait_for "arex REST endpoint" curl -sk "https://${HOSTNAME_FQDN}/arex/rest/1.0/info" + +touch /run/arc-ready +echo "== ARC CE / SLURM bootstrap complete: $(date -u) ==" diff --git a/docker/cgroup.conf b/docker/cgroup.conf new file mode 100644 index 0000000..4ca269d --- /dev/null +++ b/docker/cgroup.conf @@ -0,0 +1,3 @@ +# cgroup constraints are disabled: see slurm.conf comment. +# Kept here only because slurmd/slurmctld expect the file to exist. +CgroupPlugin=disabled diff --git a/docker/healthcheck.sh b/docker/healthcheck.sh new file mode 100644 index 0000000..bc47b09 --- /dev/null +++ b/docker/healthcheck.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Used by Dockerfile HEALTHCHECK and by the GitLab CI "wait for CE" step. +set -o pipefail + +[ -f /run/arc-ready ] || exit 1 + +sinfo -h >/dev/null 2>&1 || exit 1 + +curl -sk --max-time 3 -o /dev/null "https://$(hostname)/arex/rest/1.0/info" || exit 1 + +exit 0 diff --git a/docker/slurm.conf b/docker/slurm.conf new file mode 100644 index 0000000..23099c7 --- /dev/null +++ b/docker/slurm.conf @@ -0,0 +1,36 @@ +# ============================================================================= +# slurm.conf - single-node SLURM cluster for CI/integration testing +# +# NOTE: TaskPlugin=task/none and ProctrackType=proctrack/linuxproc are used +# instead of the cgroup-based plugins because GitLab CI docker executors +# usually do NOT grant access to the host cgroup hierarchy needed by +# proctrack/cgroup or task/cgroup. If your runner is privileged and mounts +# /sys/fs/cgroup read-write, you can switch to the cgroup plugins for more +# realistic resource accounting. 
+# ============================================================================= +ClusterName=citest +SlurmctldHost=localhost + +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +SchedulerType=sched/backfill +ReturnToService=2 +MpiDefault=none + +# --- Node & partition: one fake compute node backing the whole cluster --- +NodeName=cnode01 NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN +PartitionName=main Nodes=cnode01 Default=YES MaxTime=INFINITE State=UP diff --git a/test/job.xrsl b/test/job.xrsl new file mode 100644 index 0000000..11d34c7 --- /dev/null +++ b/test/job.xrsl @@ -0,0 +1,11 @@ +& +(executable = "/bin/sh") +(arguments = "run.sh") +(inputFiles = ("run.sh" "run.sh")) +(jobname = "ci-integration-test") +(stdout = "stdout.log") +(stderr = "stderr.log") +(outputFiles = ("result.txt" "")) +(queue = "main") +(walltime = "5") +(memory = "256") diff --git a/test/run.sh b/test/run.sh new file mode 100644 index 0000000..66f9bea --- /dev/null +++ b/test/run.sh @@ -0,0 +1,7 @@ +#!/bin/sh +echo "Running on host: $(hostname)" +echo "Running as user: $(id -un)" +echo "SLURM_JOB_ID=${SLURM_JOB_ID:-unset}" +date +sleep 2 +echo "ok $(date -u +%FT%TZ)" > result.txt diff --git a/test/run_integration_test.sh b/test/run_integration_test.sh new file mode 100644 index 0000000..e4c5272 --- /dev/null +++ b/test/run_integration_test.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# ============================================================================= +# Integration test: submit / monitor / retrieve a job through ARC CE, backed +# by SLURM. 
Meant to run *inside* the arc-ce-slurm container as griduser01 +# (that's who the test client cert + queue mapping point to), e.g.: +# +# docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +# +# Exit code 0 = pass, non-zero = fail (so GitLab CI can key off it directly). +# ============================================================================= +set -euo pipefail + +CE_HOST="$(hostname)" +CE_ENDPOINT="https://${CE_HOST}/arex" +JOB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKDIR="$(mktemp -d)" +OUTDIR="${WORKDIR}/output" +POLL_INTERVAL=3 +POLL_TIMEOUT=180 + +log() { echo "[$(date -u +%T)] $*"; } + +fail() { log "FAIL: $*"; exit 1; } + +trap 'log "cleaning up ${WORKDIR}"; rm -rf "${WORKDIR}"' EXIT + +cd "${WORKDIR}" +cp "${JOB_DIR}/job.xrsl" . +cp "${JOB_DIR}/run.sh" . + +# ----------------------------------------------------------------------- +# 0. Sanity: we need a proxy. arcproxy reads cert/key from ~/.globus by +# default, which is exactly where `arcctl test-ca usercert --install-user` +# put them during bootstrap. +# ----------------------------------------------------------------------- +log "Generating proxy certificate for $(id -un)" +arcproxy || fail "arcproxy failed - is ~/.globus/usercert.pem present?" + +log "Querying CE info endpoint: ${CE_ENDPOINT}" +arcinfo -C "${CE_ENDPOINT}" || fail "arcinfo could not reach ${CE_ENDPOINT}" + +# ----------------------------------------------------------------------- +# 1. 
SUBMIT +# ----------------------------------------------------------------------- +log "Submitting job.xrsl to ${CE_ENDPOINT}" +SUBMIT_OUTPUT="$(arcsub -C "${CE_ENDPOINT}" job.xrsl 2>&1)" || { + echo "${SUBMIT_OUTPUT}" + fail "arcsub did not succeed" +} +echo "${SUBMIT_OUTPUT}" + +JOB_ID="$(echo "${SUBMIT_OUTPUT}" | grep -oE 'https://[^ ]+/jobs/[A-Za-z0-9]+' | head -n1)" +[ -n "${JOB_ID}" ] || fail "could not parse job id out of arcsub output" +log "Job submitted: ${JOB_ID}" + +# ----------------------------------------------------------------------- +# 2. MONITOR +# ----------------------------------------------------------------------- +log "Polling job state (timeout ${POLL_TIMEOUT}s)" +elapsed=0 +STATE="" +while [ "${elapsed}" -lt "${POLL_TIMEOUT}" ]; do + STAT_OUTPUT="$(arcstat "${JOB_ID}" 2>&1 || true)" + STATE="$(echo "${STAT_OUTPUT}" | awk -F': ' '/State:/{print $2; exit}')" + log "state=${STATE:-unknown}" + case "${STATE}" in + Finished|FINISHED) + break + ;; + Failed|FAILED|Killed|KILLED|Deleted) + echo "${STAT_OUTPUT}" + fail "job entered terminal failure state: ${STATE}" + ;; + esac + sleep "${POLL_INTERVAL}" + elapsed=$((elapsed + POLL_INTERVAL)) +done + +[ "${STATE}" = "Finished" ] || [ "${STATE}" = "FINISHED" ] || { + arcstat "${JOB_ID}" || true + arcctl job log "$(basename "${JOB_ID}")" --service || true + fail "job did not reach Finished state within ${POLL_TIMEOUT}s (last state: ${STATE:-unknown})" +} +log "Job reached Finished state" + +# ----------------------------------------------------------------------- +# 3. 
RETRIEVE +# ----------------------------------------------------------------------- +mkdir -p "${OUTDIR}" +log "Retrieving output with arcget into ${OUTDIR}" +( cd "${OUTDIR}" && arcget "${JOB_ID}" ) || fail "arcget failed" + +RESULT_FILE="$(find "${OUTDIR}" -name result.txt | head -n1)" +STDOUT_FILE="$(find "${OUTDIR}" -name stdout.log | head -n1)" + +[ -n "${RESULT_FILE}" ] || fail "result.txt was not retrieved" +[ -n "${STDOUT_FILE}" ] || fail "stdout.log was not retrieved" + +grep -q '^ok ' "${RESULT_FILE}" || fail "result.txt did not contain expected content: $(cat "${RESULT_FILE}")" +grep -q 'Running on host' "${STDOUT_FILE}" || fail "stdout.log missing expected marker" + +log "Output content:" +cat "${STDOUT_FILE}" +cat "${RESULT_FILE}" + +# ----------------------------------------------------------------------- +# 4. Cleanup the job from A-REX bookkeeping (not strictly required, but +# keeps repeated CI runs tidy) +# ----------------------------------------------------------------------- +arcclean "${JOB_ID}" || log "warning: arcclean failed (non-fatal)" + +log "PASS: submit -> monitor -> retrieve integration test succeeded" From f94fde802a30024b1c09e442a5a5d0e1fadc8415 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 09:35:55 +0200 Subject: [PATCH 02/18] Fix: create slurm system user before chown (EPEL slurm package doesn't create it) --- docker/Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1e97186..70305e6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -48,6 +48,14 @@ RUN dnf -y install epel-release && \ rm -f /lib/systemd/system/basic.target.wants/* ; \ rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true +# --------------------------------------------------------------------------- +# slurm service account (EPEL's slurm packages do NOT create this user +# automatically, unlike some other distros' packaging - has to be done here) +# 
--------------------------------------------------------------------------- +RUN groupadd -r slurm --gid=990 && \ + useradd -r -c "SLURM workload manager" -d /var/lib/slurm -u 990 -g slurm -s /sbin/nologin slurm && \ + mkdir -p /var/lib/slurm && chown slurm:slurm /var/lib/slurm + # --------------------------------------------------------------------------- # Unprivileged pool account that grid jobs get mapped to + matching SLURM node # --------------------------------------------------------------------------- From 13c91f62108ca88c41f260988090a94fd621afca Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 09:42:08 +0200 Subject: [PATCH 03/18] Fix: don't systemctl-enable munge/slurmctld/slurmd directly; rely on arc-bootstrap.service dependencies --- docker/Dockerfile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 70305e6..286ead1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -72,10 +72,21 @@ COPY bootstrap.sh /usr/local/bin/bootstrap.sh COPY healthcheck.sh /usr/local/bin/healthcheck.sh COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service +# Debug aid: prints exactly which systemd unit names the munge/slurm +# packages actually shipped. Check this in the build log if bootstrap.sh +# ever times out waiting for munge/slurmctld/slurmd at runtime. +RUN echo "== systemd units shipped by munge / slurm packages ==" && \ + rpm -ql munge slurm slurm-slurmctld slurm-slurmd | grep systemd + RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \ - systemctl enable munge slurmctld slurmd arc-bootstrap.service + systemctl enable arc-bootstrap.service +# NOTE: munge.service / slurmctld.service / slurmd.service are deliberately +# NOT enabled here. 
arc-bootstrap.service declares Wants=/After= on all +# three, so systemd pulls them in automatically as dependencies when +# arc-bootstrap.service starts at boot - no need to double-enable them, +# and it avoids brittle unit-name lookups at build time. EXPOSE 443 From 80334e90134cc89c92dd55f9d316bf6c160485f2 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 09:47:07 +0200 Subject: [PATCH 04/18] Make package debug step non-fatal and unfiltered --- docker/Dockerfile | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 286ead1..71e819c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -75,8 +75,19 @@ COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service # Debug aid: prints exactly which systemd unit names the munge/slurm # packages actually shipped. Check this in the build log if bootstrap.sh # ever times out waiting for munge/slurmctld/slurmd at runtime. -RUN echo "== systemd units shipped by munge / slurm packages ==" && \ - rpm -ql munge slurm slurm-slurmctld slurm-slurmd | grep systemd +# Debug aid (non-fatal): shows exactly what's installed for munge/slurm +# and whether they shipped systemd units. Read this in the build log if +# bootstrap.sh ever times out waiting for munge/slurmctld/slurmd later. +RUN set +e; \ + echo "== rpm -q (are the packages even installed?) 
=="; \ + rpm -q munge slurm slurm-slurmctld slurm-slurmd; \ + echo "== munge file list =="; rpm -ql munge 2>&1; \ + echo "== slurm file list =="; rpm -ql slurm 2>&1; \ + echo "== slurm-slurmctld file list =="; rpm -ql slurm-slurmctld 2>&1; \ + echo "== slurm-slurmd file list =="; rpm -ql slurm-slurmd 2>&1; \ + echo "== anything under /usr/lib/systemd/system matching munge or slurm =="; \ + ls -la /usr/lib/systemd/system/ | grep -iE 'munge|slurm'; \ + true RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ From 44d6e53b5a02a644a55e30a30f4e4b28c3464ccd Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 10:04:28 +0200 Subject: [PATCH 05/18] enable crb --- docker/Dockerfile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 71e819c..dfd1579 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -23,21 +23,27 @@ ENV container=docker # systemd + EPEL + SLURM + munge + ARC CE + ARC client tools # --------------------------------------------------------------------------- RUN dnf -y install epel-release && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled crb && \ dnf -y update && \ dnf -y install systemd crypto-policies-scripts && \ update-crypto-policies --set LEGACY && \ - dnf -y install \ + dnf -y install --setopt=strict=1 \ munge munge-libs \ slurm slurm-slurmctld slurm-slurmd \ openssl ca-certificates \ procps-ng iproute net-tools which curl jq \ && \ - dnf -y install \ + dnf -y install --setopt=strict=1 \ nordugrid-arc7-arex \ nordugrid-arc7-client \ nordugrid-arc7-arcctl \ && \ dnf clean all && \ + # Hard assertion: fail the build loudly right here if anything above + # was silently skipped, instead of finding out at container runtime. 
+ rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \ + nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && \ # standard "systemd in docker" cleanup: mask units that don't apply / fail in containers (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ rm -f /lib/systemd/system/multi-user.target.wants/* ; \ From 980b49dc7a21cc0b86aa2a782d322ae19fff140b Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 10:25:37 +0200 Subject: [PATCH 06/18] separate to fix --- docker/Dockerfile | 53 +++++--- docker/build.log | 323 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 357 insertions(+), 19 deletions(-) create mode 100644 docker/build.log diff --git a/docker/Dockerfile b/docker/Dockerfile index dfd1579..1aaa8e4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -22,30 +22,45 @@ ENV container=docker # --------------------------------------------------------------------------- # systemd + EPEL + SLURM + munge + ARC CE + ARC client tools # --------------------------------------------------------------------------- -RUN dnf -y install epel-release && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled crb && \ - dnf -y update && \ - dnf -y install systemd crypto-policies-scripts && \ - update-crypto-policies --set LEGACY && \ - dnf -y install --setopt=strict=1 \ +# --------------------------------------------------------------------------- +# systemd + EPEL + SLURM + munge + ARC CE + ARC client tools +# Split into separate RUN layers on purpose: easier to see exactly which +# step fails, and a failure in one can't be silently absorbed by a long +# && chain (which is what happened with the curl/curl-minimal conflict +# below before this was split out). 
+# --------------------------------------------------------------------------- +RUN dnf -y install epel-release dnf-plugins-core && \ + dnf config-manager --set-enabled crb + +RUN dnf -y update + +RUN dnf -y install systemd crypto-policies-scripts && \ + update-crypto-policies --set LEGACY + +# --allowerasing: AlmaLinux's base image ships curl-minimal, which +# conflicts with the full curl package. Let dnf swap it out rather than +# aborting the whole transaction (this was silently dropping munge/slurm +# from the install set entirely). +RUN dnf -y install --allowerasing --setopt=strict=1 \ munge munge-libs \ slurm slurm-slurmctld slurm-slurmd \ openssl ca-certificates \ - procps-ng iproute net-tools which curl jq \ - && \ - dnf -y install --setopt=strict=1 \ + procps-ng iproute net-tools which curl jq + +RUN dnf -y install --allowerasing --setopt=strict=1 \ nordugrid-arc7-arex \ nordugrid-arc7-client \ - nordugrid-arc7-arcctl \ - && \ - dnf clean all && \ - # Hard assertion: fail the build loudly right here if anything above - # was silently skipped, instead of finding out at container runtime. - rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \ - nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && \ - # standard "systemd in docker" cleanup: mask units that don't apply / fail in containers - (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ + nordugrid-arc7-arcctl + +RUN dnf clean all + +# Hard assertion, isolated in its own layer: this MUST fail the build +# (visibly, as its own numbered step) if any package didn't actually land. 
+RUN rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \ + nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl + +# standard "systemd in docker" cleanup: mask units that don't apply / fail in containers +RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ rm -f /lib/systemd/system/multi-user.target.wants/* ; \ rm -f /etc/systemd/system/*.wants/* ; \ rm -f /lib/systemd/system/local-fs.target.wants/* ; \ diff --git a/docker/build.log b/docker/build.log new file mode 100644 index 0000000..d4b302e --- /dev/null +++ b/docker/build.log @@ -0,0 +1,323 @@ +--progress is a global compose flag, better use `docker compose --progress xx build ... +time="2026-07-02T10:17:34+02:00" level=warning msg="/home/vijay/Downloads/intercede/docker-compose.yml: the attribute `version` is obsolete, it will be ignored, please remove it to avoid potential confusion" + Image arc-ce-slurm-test:local Building +#1 [internal] load local bake definitions +#1 reading from stdin 563B done +#1 DONE 0.0s + +#2 [internal] load build definition from Dockerfile +#2 transferring dockerfile: 5.76kB done +#2 DONE 0.0s + +#3 [internal] load metadata for docker.io/library/almalinux:9 +#3 DONE 0.9s + +#4 [internal] load .dockerignore +#4 transferring context: 2B done +#4 DONE 0.0s + +#5 [ 1/12] FROM docker.io/library/almalinux:9@sha256:d2515c769e7b73f95c4fde38c0a505336ff38f14990c0b7253b77060a049a743 +#5 resolve docker.io/library/almalinux:9@sha256:d2515c769e7b73f95c4fde38c0a505336ff38f14990c0b7253b77060a049a743 0.0s done +#5 CACHED + +#6 [internal] load build context +#6 transferring context: 198B done +#6 DONE 0.0s + +#7 [ 2/12] RUN dnf -y install epel-release && dnf -y install dnf-plugins-core && dnf config-manager --set-enabled crb && dnf -y update && dnf -y install systemd crypto-policies-scripts && update-crypto-policies --set LEGACY && dnf -y install --setopt=strict=1 munge munge-libs slurm 
slurm-slurmctld slurm-slurmd openssl ca-certificates procps-ng iproute net-tools which curl jq && dnf -y install --setopt=strict=1 nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && dnf clean all && rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; rm -f /lib/systemd/system/multi-user.target.wants/* ; rm -f /etc/systemd/system/*.wants/* ; rm -f /lib/systemd/system/local-fs.target.wants/* ; rm -f /lib/systemd/system/sockets.target.wants/*udev* ; rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; rm -f /lib/systemd/system/basic.target.wants/* ; rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true +#7 1.573 AlmaLinux 9 - AppStream 13 MB/s | 15 MB 00:01 +#7 5.018 AlmaLinux 9 - BaseOS 11 MB/s | 14 MB 00:01 +#7 7.019 AlmaLinux 9 - Extras 33 kB/s | 22 kB 00:00 +#7 7.571 Dependencies resolved. 
+#7 7.572 ================================================================================ +#7 7.572 Package Architecture Version Repository Size +#7 7.572 ================================================================================ +#7 7.572 Installing: +#7 7.572 epel-release noarch 9-9.el9 extras 18 k +#7 7.572 Installing weak dependencies: +#7 7.572 dnf-plugins-core noarch 4.3.0-26.el9 baseos 35 k +#7 7.572 +#7 7.572 Transaction Summary +#7 7.572 ================================================================================ +#7 7.572 Install 2 Packages +#7 7.572 +#7 7.573 Total download size: 53 k +#7 7.573 Installed size: 48 k +#7 7.573 Downloading Packages: +#7 8.870 (1/2): dnf-plugins-core-4.3.0-26.el9.noarch.rpm 612 kB/s | 35 kB 00:00 +#7 8.903 (2/2): epel-release-9-9.el9.noarch.rpm 204 kB/s | 18 kB 00:00 +#7 8.904 -------------------------------------------------------------------------------- +#7 8.904 Total 40 kB/s | 53 kB 00:01 +#7 8.943 Running transaction check +#7 8.952 Transaction check succeeded. +#7 8.952 Running transaction test +#7 8.959 Transaction test succeeded. +#7 8.959 Running transaction +#7 8.998 Preparing : 1/1 +#7 9.034 Installing : dnf-plugins-core-4.3.0-26.el9.noarch 1/2 +#7 9.072 Installing : epel-release-9-9.el9.noarch 2/2 +#7 9.076 Running scriptlet: epel-release-9-9.el9.noarch 2/2 +#7 9.086 Many EPEL packages require the CodeReady Builder (CRB) repository. +#7 9.086 It is recommended that you run /usr/bin/crb enable to enable the CRB repository. +#7 9.086 +#7 9.167 Verifying : dnf-plugins-core-4.3.0-26.el9.noarch 1/2 +#7 9.167 Verifying : epel-release-9-9.el9.noarch 2/2 +#7 9.223 +#7 9.223 Installed: +#7 9.223 dnf-plugins-core-4.3.0-26.el9.noarch epel-release-9-9.el9.noarch +#7 9.223 +#7 9.223 Complete! 
+#7 11.69 Extra Packages for Enterprise Linux 9 - x86_64 9.6 MB/s | 21 MB 00:02 +#7 18.74 Extra Packages for Enterprise Linux 9 openh264 3.8 kB/s | 2.5 kB 00:00 +#7 19.94 Package dnf-plugins-core-4.3.0-26.el9.noarch is already installed. +#7 19.96 Dependencies resolved. +#7 19.97 Nothing to do. +#7 19.97 Complete! +#7 21.36 AlmaLinux 9 - CRB 4.3 MB/s | 3.9 MB 00:00 +#7 22.21 Last metadata expiration check: 0:00:01 ago on Thu Jul 2 08:17:56 2026. +#7 23.45 Dependencies resolved. +#7 23.45 ================================================================================ +#7 23.45 Package Arch Version Repo Size +#7 23.45 ================================================================================ +#7 23.45 Upgrading: +#7 23.45 coreutils-single x86_64 8.32-41.el9_8 baseos 598 k +#7 23.45 epel-release noarch 9-11.el9 epel 19 k +#7 23.45 expat x86_64 2.5.0-6.el9_8.1 baseos 117 k +#7 23.45 glibc x86_64 2.34-272.el9_8 baseos 2.0 M +#7 23.45 glibc-common x86_64 2.34-272.el9_8 baseos 299 k +#7 23.45 glibc-minimal-langpack x86_64 2.34-272.el9_8 baseos 27 k +#7 23.45 libeconf x86_64 0.4.1-7.el9_8 baseos 26 k +#7 23.45 libsolv x86_64 0.7.24-5.el9_8 baseos 402 k +#7 23.45 libtasn1 x86_64 4.16.0-10.el9_8 baseos 73 k +#7 23.45 libxml2 x86_64 2.9.13-14.el9_8.1 baseos 746 k +#7 23.45 openssl x86_64 1:3.5.5-4.el9_8 baseos 1.4 M +#7 23.45 openssl-fips-provider x86_64 1:3.5.5-4.el9_8 baseos 816 k +#7 23.45 openssl-libs x86_64 1:3.5.5-4.el9_8 baseos 2.3 M +#7 23.45 systemd x86_64 252-67.el9_8.4.alma.1 baseos 4.0 M +#7 23.45 systemd-libs x86_64 252-67.el9_8.4.alma.1 baseos 651 k +#7 23.45 systemd-pam x86_64 252-67.el9_8.4.alma.1 baseos 258 k +#7 23.45 systemd-rpm-macros noarch 252-67.el9_8.4.alma.1 baseos 46 k +#7 23.45 vim-minimal x86_64 2:8.2.2637-26.el9_8.6 baseos 672 k +#7 23.45 +#7 23.45 Transaction Summary +#7 23.45 ================================================================================ +#7 23.45 Upgrade 18 Packages +#7 23.45 +#7 23.45 Total download size: 14 M +#7 
23.45 Downloading Packages: +#7 24.28 (1/18): expat-2.5.0-6.el9_8.1.x86_64.rpm 1.8 MB/s | 117 kB 00:00 +#7 24.33 (2/18): coreutils-single-8.32-41.el9_8.x86_64.r 5.1 MB/s | 598 kB 00:00 +#7 24.35 (3/18): glibc-minimal-langpack-2.34-272.el9_8.x 1.6 MB/s | 27 kB 00:00 +#7 24.35 (4/18): glibc-common-2.34-272.el9_8.x86_64.rpm 3.8 MB/s | 299 kB 00:00 +#7 24.36 (5/18): libeconf-0.4.1-7.el9_8.x86_64.rpm 1.7 MB/s | 26 kB 00:00 +#7 24.39 (6/18): libtasn1-4.16.0-10.el9_8.x86_64.rpm 2.5 MB/s | 73 kB 00:00 +#7 24.42 (7/18): glibc-2.34-272.el9_8.x86_64.rpm 9.5 MB/s | 2.0 MB 00:00 +#7 24.43 (8/18): libsolv-0.7.24-5.el9_8.x86_64.rpm 5.4 MB/s | 402 kB 00:00 +#7 24.53 (9/18): openssl-fips-provider-3.5.5-4.el9_8.x86 7.9 MB/s | 816 kB 00:00 +#7 24.56 (10/18): openssl-3.5.5-4.el9_8.x86_64.rpm 10 MB/s | 1.4 MB 00:00 +#7 24.82 (11/18): systemd-252-67.el9_8.4.alma.1.x86_64.r 15 MB/s | 4.0 MB 00:00 +#7 24.85 (12/18): openssl-libs-3.5.5-4.el9_8.x86_64.rpm 7.2 MB/s | 2.3 MB 00:00 +#7 24.86 (13/18): libxml2-2.9.13-14.el9_8.1.x86_64.rpm 1.6 MB/s | 746 kB 00:00 +#7 24.87 (14/18): systemd-pam-252-67.el9_8.4.alma.1.x86_ 12 MB/s | 258 kB 00:00 +#7 24.88 (15/18): systemd-libs-252-67.el9_8.4.alma.1.x86 10 MB/s | 651 kB 00:00 +#7 24.89 (16/18): systemd-rpm-macros-252-67.el9_8.4.alma 2.0 MB/s | 46 kB 00:00 +#7 24.92 (17/18): vim-minimal-8.2.2637-26.el9_8.6.x86_64 15 MB/s | 672 kB 00:00 +#7 25.00 (18/18): epel-release-9-11.el9.noarch.rpm 162 kB/s | 19 kB 00:00 +#7 25.00 -------------------------------------------------------------------------------- +#7 25.00 Total 9.2 MB/s | 14 MB 00:01 +#7 25.47 Extra Packages for Enterprise Linux 9 - x86_64 1.6 MB/s | 1.6 kB 00:00 +#7 25.59 Importing GPG key 0x3228467C: +#7 25.59 Userid : "Fedora (epel9) " +#7 25.59 Fingerprint: FF8A D134 4597 106E CE81 3B91 8A38 72BF 3228 467C +#7 25.59 From : /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-9 +#7 25.60 Key imported successfully +#7 25.62 Running transaction check +#7 25.71 Transaction check succeeded. 
+#7 25.71 Running transaction test +#7 25.95 Transaction test succeeded. +#7 25.95 Running transaction +#7 26.33 Preparing : 1/1 +#7 26.44 Upgrading : glibc-common-2.34-272.el9_8.x86_64 1/36 +#7 26.45 Upgrading : glibc-minimal-langpack-2.34-272.el9_8.x86_64 2/36 +#7 26.45 Running scriptlet: glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.68 Upgrading : glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.69 Running scriptlet: glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.78 Upgrading : coreutils-single-8.32-41.el9_8.x86_64 4/36 +#7 26.83 Upgrading : libxml2-2.9.13-14.el9_8.1.x86_64 5/36 +#7 26.87 Upgrading : openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 6/36 +#7 26.97 Upgrading : openssl-libs-1:3.5.5-4.el9_8.x86_64 7/36 +#7 27.01 Upgrading : systemd-libs-252-67.el9_8.4.alma.1.x86_64 8/36 +#7 27.02 Running scriptlet: systemd-libs-252-67.el9_8.4.alma.1.x86_64 8/36 +#7 27.05 Upgrading : systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch 9/36 +#7 27.07 Upgrading : systemd-pam-252-67.el9_8.4.alma.1.x86_64 10/36 +#7 27.08 Running scriptlet: systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.83 Upgrading : systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.85 Running scriptlet: systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.90 Upgrading : libsolv-0.7.24-5.el9_8.x86_64 12/36 +#7 27.96 Upgrading : openssl-1:3.5.5-4.el9_8.x86_64 13/36 +#7 27.98 Upgrading : expat-2.5.0-6.el9_8.1.x86_64 14/36 +#7 28.00 Upgrading : libeconf-0.4.1-7.el9_8.x86_64 15/36 +#7 28.02 Upgrading : libtasn1-4.16.0-10.el9_8.x86_64 16/36 +#7 28.06 Upgrading : vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 17/36 +#7 28.07 Upgrading : epel-release-9-11.el9.noarch 18/36 +#7 28.07 Running scriptlet: epel-release-9-11.el9.noarch 18/36 +#7 28.22 Cleanup : systemd-252-67.el9_8.2.alma.1.x86_64 19/36 +#7 28.22 Running scriptlet: systemd-252-67.el9_8.2.alma.1.x86_64 19/36 +#7 28.35 Cleanup : systemd-libs-252-67.el9_8.2.alma.1.x86_64 20/36 +#7 28.36 Cleanup : systemd-pam-252-67.el9_8.2.alma.1.x86_64 21/36 +#7 28.37 Cleanup : 
openssl-1:3.5.5-2.el9_8.x86_64 22/36 +#7 28.38 Cleanup : coreutils-single-8.32-40.el9.x86_64 23/36 +#7 28.39 Cleanup : libsolv-0.7.24-4.el9.x86_64 24/36 +#7 28.40 Cleanup : libxml2-2.9.13-14.el9_7.x86_64 25/36 +#7 28.41 Cleanup : vim-minimal-2:8.2.2637-26.el9_8.4.x86_64 26/36 +#7 28.42 Cleanup : expat-2.5.0-6.el9.x86_64 27/36 +#7 28.42 Cleanup : openssl-fips-provider-1:3.5.5-2.el9_8.x86_64 28/36 +#7 28.44 Cleanup : openssl-libs-1:3.5.5-2.el9_8.x86_64 29/36 +#7 28.45 Cleanup : libeconf-0.4.1-5.el9.x86_64 30/36 +#7 28.46 Cleanup : libtasn1-4.16.0-9.el9.x86_64 31/36 +#7 28.46 Cleanup : systemd-rpm-macros-252-67.el9_8.2.alma.1.noarch 32/36 +#7 28.47 Cleanup : epel-release-9-9.el9.noarch 33/36 +#7 28.48 Cleanup : glibc-2.34-270.el9_8.x86_64 34/36 +#7 28.49 Cleanup : glibc-minimal-langpack-2.34-270.el9_8.x86_64 35/36 +#7 28.49 Cleanup : glibc-common-2.34-270.el9_8.x86_64 36/36 +#7 28.51 Running scriptlet: glibc-common-2.34-270.el9_8.x86_64 36/36 +#7 28.70 Verifying : coreutils-single-8.32-41.el9_8.x86_64 1/36 +#7 28.70 Verifying : coreutils-single-8.32-40.el9.x86_64 2/36 +#7 28.70 Verifying : expat-2.5.0-6.el9_8.1.x86_64 3/36 +#7 28.70 Verifying : expat-2.5.0-6.el9.x86_64 4/36 +#7 28.70 Verifying : glibc-2.34-272.el9_8.x86_64 5/36 +#7 28.70 Verifying : glibc-2.34-270.el9_8.x86_64 6/36 +#7 28.70 Verifying : glibc-common-2.34-272.el9_8.x86_64 7/36 +#7 28.70 Verifying : glibc-common-2.34-270.el9_8.x86_64 8/36 +#7 28.70 Verifying : glibc-minimal-langpack-2.34-272.el9_8.x86_64 9/36 +#7 28.70 Verifying : glibc-minimal-langpack-2.34-270.el9_8.x86_64 10/36 +#7 28.70 Verifying : libeconf-0.4.1-7.el9_8.x86_64 11/36 +#7 28.70 Verifying : libeconf-0.4.1-5.el9.x86_64 12/36 +#7 28.70 Verifying : libsolv-0.7.24-5.el9_8.x86_64 13/36 +#7 28.70 Verifying : libsolv-0.7.24-4.el9.x86_64 14/36 +#7 28.70 Verifying : libtasn1-4.16.0-10.el9_8.x86_64 15/36 +#7 28.70 Verifying : libtasn1-4.16.0-9.el9.x86_64 16/36 +#7 28.70 Verifying : libxml2-2.9.13-14.el9_8.1.x86_64 17/36 +#7 28.70 Verifying : 
libxml2-2.9.13-14.el9_7.x86_64 18/36 +#7 28.70 Verifying : openssl-1:3.5.5-4.el9_8.x86_64 19/36 +#7 28.70 Verifying : openssl-1:3.5.5-2.el9_8.x86_64 20/36 +#7 28.70 Verifying : openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 21/36 +#7 28.70 Verifying : openssl-fips-provider-1:3.5.5-2.el9_8.x86_64 22/36 +#7 28.70 Verifying : openssl-libs-1:3.5.5-4.el9_8.x86_64 23/36 +#7 28.70 Verifying : openssl-libs-1:3.5.5-2.el9_8.x86_64 24/36 +#7 28.70 Verifying : systemd-252-67.el9_8.4.alma.1.x86_64 25/36 +#7 28.70 Verifying : systemd-252-67.el9_8.2.alma.1.x86_64 26/36 +#7 28.70 Verifying : systemd-libs-252-67.el9_8.4.alma.1.x86_64 27/36 +#7 28.70 Verifying : systemd-libs-252-67.el9_8.2.alma.1.x86_64 28/36 +#7 28.70 Verifying : systemd-pam-252-67.el9_8.4.alma.1.x86_64 29/36 +#7 28.70 Verifying : systemd-pam-252-67.el9_8.2.alma.1.x86_64 30/36 +#7 28.70 Verifying : systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch 31/36 +#7 28.70 Verifying : systemd-rpm-macros-252-67.el9_8.2.alma.1.noarch 32/36 +#7 28.70 Verifying : vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 33/36 +#7 28.70 Verifying : vim-minimal-2:8.2.2637-26.el9_8.4.x86_64 34/36 +#7 28.70 Verifying : epel-release-9-11.el9.noarch 35/36 +#7 28.70 Verifying : epel-release-9-9.el9.noarch 36/36 +#7 28.80 +#7 28.80 Upgraded: +#7 28.80 coreutils-single-8.32-41.el9_8.x86_64 +#7 28.80 epel-release-9-11.el9.noarch +#7 28.80 expat-2.5.0-6.el9_8.1.x86_64 +#7 28.80 glibc-2.34-272.el9_8.x86_64 +#7 28.80 glibc-common-2.34-272.el9_8.x86_64 +#7 28.80 glibc-minimal-langpack-2.34-272.el9_8.x86_64 +#7 28.80 libeconf-0.4.1-7.el9_8.x86_64 +#7 28.80 libsolv-0.7.24-5.el9_8.x86_64 +#7 28.80 libtasn1-4.16.0-10.el9_8.x86_64 +#7 28.80 libxml2-2.9.13-14.el9_8.1.x86_64 +#7 28.80 openssl-1:3.5.5-4.el9_8.x86_64 +#7 28.80 openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 +#7 28.80 openssl-libs-1:3.5.5-4.el9_8.x86_64 +#7 28.80 systemd-252-67.el9_8.4.alma.1.x86_64 +#7 28.80 systemd-libs-252-67.el9_8.4.alma.1.x86_64 +#7 28.80 systemd-pam-252-67.el9_8.4.alma.1.x86_64 +#7 
28.80 systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch +#7 28.80 vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 +#7 28.80 +#7 28.80 Complete! +#7 29.38 Last metadata expiration check: 0:00:08 ago on Thu Jul 2 08:17:56 2026. +#7 29.50 Package systemd-252-67.el9_8.4.alma.1.x86_64 is already installed. +#7 29.50 Package crypto-policies-scripts-20260224-1.gitea0f072.el9_8.noarch is already installed. +#7 29.52 Dependencies resolved. +#7 29.52 Nothing to do. +#7 29.52 Complete! +#7 29.71 Setting system policy to LEGACY +#7 29.71 Note: System-wide crypto policies are applied on application start-up. +#7 29.71 It is recommended to restart the system for the change of policies +#7 29.71 to fully take place. +#7 30.18 Last metadata expiration check: 0:00:09 ago on Thu Jul 2 08:17:56 2026. +#7 30.30 Package openssl-1:3.5.5-4.el9_8.x86_64 is already installed. +#7 30.30 Package ca-certificates-2025.2.80_v9.0.305-91.el9.noarch is already installed. +#7 30.33 Error: +#7 30.33 Problem: problem with installed package curl-minimal-7.76.1-40.el9.x86_64 +#7 30.33 - package curl-minimal-7.76.1-40.el9.x86_64 from @System conflicts with curl provided by curl-7.76.1-40.el9.x86_64 from baseos +#7 30.33 - package curl-minimal-7.76.1-40.el9.x86_64 from baseos conflicts with curl provided by curl-7.76.1-40.el9.x86_64 from baseos +#7 30.33 - conflicting requests +#7 30.33 (try to add '--allowerasing' to command line to replace conflicting packages or '--skip-broken' to skip uninstallable packages or '--nobest' to use not only best candidate packages) +#7 DONE 30.4s + +#8 [ 3/12] RUN groupadd -r slurm --gid=990 && useradd -r -c "SLURM workload manager" -d /var/lib/slurm -u 990 -g slurm -s /sbin/nologin slurm && mkdir -p /var/lib/slurm && chown slurm:slurm /var/lib/slurm +#8 DONE 0.4s + +#9 [ 4/12] RUN useradd -m -s /bin/bash griduser01 +#9 DONE 0.4s + +#10 [ 5/12] COPY slurm.conf /etc/slurm/slurm.conf +#10 DONE 0.1s + +#11 [ 6/12] COPY cgroup.conf /etc/slurm/cgroup.conf +#11 DONE 0.1s + +#12 [ 7/12] 
COPY arc.conf /etc/arc.conf +#12 DONE 0.0s + +#13 [ 8/12] COPY bootstrap.sh /usr/local/bin/bootstrap.sh +#13 DONE 0.1s + +#14 [ 9/12] COPY healthcheck.sh /usr/local/bin/healthcheck.sh +#14 DONE 0.1s + +#15 [10/12] COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service +#15 DONE 0.1s + +#16 [11/12] RUN set +e; echo "== rpm -q (are the packages even installed?) =="; rpm -q munge slurm slurm-slurmctld slurm-slurmd; echo "== munge file list =="; rpm -ql munge 2>&1; echo "== slurm file list =="; rpm -ql slurm 2>&1; echo "== slurm-slurmctld file list =="; rpm -ql slurm-slurmctld 2>&1; echo "== slurm-slurmd file list =="; rpm -ql slurm-slurmd 2>&1; echo "== anything under /usr/lib/systemd/system matching munge or slurm =="; ls -la /usr/lib/systemd/system/ | grep -iE 'munge|slurm'; true +#16 0.222 == rpm -q (are the packages even installed?) == +#16 0.248 package munge is not installed +#16 0.248 package slurm is not installed +#16 0.248 package slurm-slurmctld is not installed +#16 0.248 package slurm-slurmd is not installed +#16 0.248 == munge file list == +#16 0.256 package munge is not installed +#16 0.257 == slurm file list == +#16 0.266 package slurm is not installed +#16 0.266 == slurm-slurmctld file list == +#16 0.275 package slurm-slurmctld is not installed +#16 0.276 == slurm-slurmd file list == +#16 0.285 package slurm-slurmd is not installed +#16 0.286 == anything under /usr/lib/systemd/system matching munge or slurm == +#16 DONE 0.3s + +#17 [12/12] RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && systemctl enable arc-bootstrap.service +#17 0.311 Created symlink /etc/systemd/system/multi-user.target.wants/arc-bootstrap.service → /etc/systemd/system/arc-bootstrap.service. 
+#17 DONE 0.3s + +#18 exporting to image +#18 exporting layers +#18 exporting layers 6.1s done +#18 exporting manifest sha256:226d793415d4629cbb1ee8fec1dbff3e3b3c1ce058acbc368514171513b45a22 0.0s done +#18 exporting config sha256:c54e2a223ca5d80ee52cd7013c35a56c197c04af5b0cd5696d87186b6565690a 0.0s done +#18 exporting attestation manifest sha256:34d3aa14e734c1e12b9d839a3c1f6b956b0bfa84239bac21a6c2f6e56ce9ccb7 0.0s done +#18 exporting manifest list sha256:b6e6a0cae27d441603e8d8f478fa9c8bbe094dbbac2e8d481adf0635323a057c +#18 exporting manifest list sha256:b6e6a0cae27d441603e8d8f478fa9c8bbe094dbbac2e8d481adf0635323a057c 0.0s done +#18 naming to docker.io/library/arc-ce-slurm-test:local done +#18 unpacking to docker.io/library/arc-ce-slurm-test:local +#18 unpacking to docker.io/library/arc-ce-slurm-test:local 1.2s done +#18 DONE 7.6s + +#19 resolving provenance for metadata file +#19 DONE 0.0s + Image arc-ce-slurm-test:local Built From 829de991576efaabf8422c195b8cc1b724964c32 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 10:41:50 +0200 Subject: [PATCH 07/18] fix: node name and etc location --- docker/Dockerfile | 14 ++++++++++++++ docker/slurm.conf | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1aaa8e4..1332069 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -59,6 +59,20 @@ RUN dnf clean all RUN rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \ nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl +# munge's package should auto-generate /etc/munge/munge.key and create the +# "munge" system user via its post-install scriptlet, but neither reliably +# happens in this build environment - so do both explicitly. +RUN if ! getent group munge >/dev/null; then groupadd -r munge; fi && \ + if ! 
getent passwd munge >/dev/null; then \ + useradd -r -g munge -d /etc/munge -s /sbin/nologin -c "MUNGE Uid 'N' Gid Emporium" munge; \ + fi && \ + mkdir -p /etc/munge /var/lib/munge /var/log/munge /run/munge && \ + chown munge:munge /etc/munge /var/lib/munge /var/log/munge /run/munge && \ + chmod 0700 /etc/munge && \ + /usr/sbin/create-munge-key -f && \ + chown munge:munge /etc/munge/munge.key && \ + chmod 0400 /etc/munge/munge.key + # standard "systemd in docker" cleanup: mask units that don't apply / fail in containers RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ rm -f /lib/systemd/system/multi-user.target.wants/* ; \ diff --git a/docker/slurm.conf b/docker/slurm.conf index 23099c7..1dd2e9b 100644 --- a/docker/slurm.conf +++ b/docker/slurm.conf @@ -32,5 +32,5 @@ ReturnToService=2 MpiDefault=none # --- Node & partition: one fake compute node backing the whole cluster --- -NodeName=cnode01 NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN -PartitionName=main Nodes=cnode01 Default=YES MaxTime=INFINITE State=UP +NodeName=arc-ce NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN +PartitionName=main Nodes=arc-ce Default=YES MaxTime=INFINITE State=UP From fb9836b4f4e93347ec0b0da769eec3a0688e2408 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 10:58:36 +0200 Subject: [PATCH 08/18] fix arc conf --- docker/arc.conf | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docker/arc.conf b/docker/arc.conf index 8fc09aa..75cac28 100644 --- a/docker/arc.conf +++ b/docker/arc.conf @@ -10,9 +10,6 @@ [common] hostname = arc-ce -x509_user_key = /etc/grid-security/hostkey.pem -x509_user_cert = /etc/grid-security/hostcert.pem -x509_cert_dir = /etc/grid-security/certificates # ----------------------------------------------------------------------- # LRMS: point A-REX at the local single-node SLURM cluster @@ -49,7 +46,7 @@ delegationdb = 
sqlite [arex/ws] wsurl = https://arc-ce/arex -allowaccess = zero + [arex/ws/jobs] allowaccess = zero @@ -61,3 +58,11 @@ allowaccess = zero [arex/cache] cachedir = /var/spool/arc/cache + +[infosys] + +[infosys/glue2] + +[infosys/cluster] +cluster_alias = "ARC CE + SLURM CI integration test cluster" +comment = "Single-node, all-in-one container used for CI testing only" From bd1631466794be1c7fe79a8365a14bb640094c76 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 11:12:20 +0200 Subject: [PATCH 09/18] remove c group --- docker/Dockerfile | 1 - docker/cgroup.conf | 3 --- 2 files changed, 4 deletions(-) delete mode 100644 docker/cgroup.conf diff --git a/docker/Dockerfile b/docker/Dockerfile index 1332069..09b8b4c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -100,7 +100,6 @@ RUN useradd -m -s /bin/bash griduser01 # Config files # --------------------------------------------------------------------------- COPY slurm.conf /etc/slurm/slurm.conf -COPY cgroup.conf /etc/slurm/cgroup.conf COPY arc.conf /etc/arc.conf COPY bootstrap.sh /usr/local/bin/bootstrap.sh diff --git a/docker/cgroup.conf b/docker/cgroup.conf deleted file mode 100644 index 4ca269d..0000000 --- a/docker/cgroup.conf +++ /dev/null @@ -1,3 +0,0 @@ -# cgroup constraints are disabled: see slurm.conf comment. -# Kept here only because slurmd/slurmctld expect the file to exist. 
-CgroupPlugin=disabled From b244e2fa05dad3ef0c65bddad07806ac6b5c2b23 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 11:22:01 +0200 Subject: [PATCH 10/18] fix cgroup 2 --- docker/Dockerfile | 1 + docker/cgroup.conf | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 docker/cgroup.conf diff --git a/docker/Dockerfile b/docker/Dockerfile index 09b8b4c..1332069 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -100,6 +100,7 @@ RUN useradd -m -s /bin/bash griduser01 # Config files # --------------------------------------------------------------------------- COPY slurm.conf /etc/slurm/slurm.conf +COPY cgroup.conf /etc/slurm/cgroup.conf COPY arc.conf /etc/arc.conf COPY bootstrap.sh /usr/local/bin/bootstrap.sh diff --git a/docker/cgroup.conf b/docker/cgroup.conf new file mode 100644 index 0000000..d833173 --- /dev/null +++ b/docker/cgroup.conf @@ -0,0 +1,5 @@ +CgroupAutomount=no +ConstrainCores=no +ConstrainRAMSpace=no +ConstrainSwapSpace=no +ConstrainDevices=no From 63960d779b31320fef052eaae93e50ee2ec55b28 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 11:33:24 +0200 Subject: [PATCH 11/18] add dbus --- docker/Dockerfile | 3 ++- docker/arc-bootstrap.service | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1332069..d4f0ab3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -34,7 +34,7 @@ RUN dnf -y install epel-release dnf-plugins-core && \ RUN dnf -y update -RUN dnf -y install systemd crypto-policies-scripts && \ +RUN dnf -y install systemd crypto-policies-scripts dbus-broker && \ update-crypto-policies --set LEGACY # --allowerasing: AlmaLinux's base image ships curl-minimal, which @@ -127,6 +127,7 @@ RUN set +e; \ RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \ + 
systemctl enable dbus-broker.service && \ systemctl enable arc-bootstrap.service # NOTE: munge.service / slurmctld.service / slurmd.service are deliberately # NOT enabled here. arc-bootstrap.service declares Wants=/After= on all diff --git a/docker/arc-bootstrap.service b/docker/arc-bootstrap.service index 457b56f..111f9a8 100644 --- a/docker/arc-bootstrap.service +++ b/docker/arc-bootstrap.service @@ -1,7 +1,7 @@ [Unit] Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert -After=network.target munge.service slurmctld.service slurmd.service -Wants=munge.service slurmctld.service slurmd.service +After=network.target dbus-broker.service munge.service slurmctld.service slurmd.service +Wants=dbus-broker.service munge.service slurmctld.service slurmd.service ConditionPathExists=!/run/arc-ready [Service] From 14fcc19897a291d28bbf93496e7db1ea3bdd7439 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 12:15:48 +0200 Subject: [PATCH 12/18] turn off sacct --- docker/arc.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/arc.conf b/docker/arc.conf index 75cac28..c9bbe91 100644 --- a/docker/arc.conf +++ b/docker/arc.conf @@ -16,7 +16,7 @@ hostname = arc-ce # ----------------------------------------------------------------------- [lrms] lrms = slurm -slurm_use_sacct = yes +slurm_use_sacct = no slurm_wakeupperiod = 5 [queue: main] From 638ea567f2ce66ca615ce77e3facd64a1f7513c4 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 12:33:21 +0200 Subject: [PATCH 13/18] wait before build --- .gitlab-ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7b71018..77a8d18 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,6 +21,14 @@ build_image: services: - docker:26-dind script: + - | + echo "Waiting for Docker daemon..." + for i in $(seq 1 30); do + docker info >/dev/null 2>&1 && break + echo " [$i/30] not ready yet..." 
+ sleep 2 + done + docker info - docker build -t "${IMAGE_NAME}:${IMAGE_TAG}" -f docker/Dockerfile docker/ - docker save "${IMAGE_NAME}:${IMAGE_TAG}" -o image.tar artifacts: From f8b3a839d531b435ab0fd29c4fd1dd125fcf92aa Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 13:54:48 +0200 Subject: [PATCH 14/18] add integration test workflow --- .github/workflows/integration-test.yml | 80 ++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/workflows/integration-test.yml diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml new file mode 100644 index 0000000..08e6382 --- /dev/null +++ b/.github/workflows/integration-test.yml @@ -0,0 +1,80 @@ +name: arc-ce-slurm-integration-test + +on: + push: + branches: [main, add-arc-ce-slurm-ci] + pull_request: + workflow_dispatch: + +env: + IMAGE_NAME: arc-ce-slurm-test + CONTAINER_NAME: arc-ce-slurm-test + +jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v4 + + - name: Build image + run: docker build -t "${IMAGE_NAME}:local" -f docker/Dockerfile docker/ + + # --privileged: this container runs systemd as PID 1 (needed by + # arcctl / SLURM's own unit files, and by slurmd's cgroup-scope + # setup via dbus-broker), which needs real cgroup access. + # GitHub-hosted runners allow this directly - no runner config + # or cluster admin approval needed, unlike a locked-down + # Kubernetes-executor GitLab runner. 
+ - name: Run container + run: | + docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:local" + + - name: Wait for health check + run: | + status="starting" + for i in $(seq 1 60); do + status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting") + echo " [$i/60] health=${status}" + [ "${status}" = "healthy" ] && break + sleep 5 + done + if [ "${status}" != "healthy" ]; then + echo "::error::Container never became healthy" + docker logs "${CONTAINER_NAME}" || true + docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true + exit 1 + fi + + - name: Run integration test (submit -> monitor -> retrieve) + run: | + docker cp test/. "${CONTAINER_NAME}:/opt/arc-test/" + docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test + docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh + docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh + + - name: Collect logs + if: always() + run: | + docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true + docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true + docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true + docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true + docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmd.log > slurmd.log 2>/dev/null || true + + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: arc-ce-slurm-logs + path: | + container-console.log + arex.log + arc-bootstrap.log + slurmctld.log + slurmd.log + retention-days: 7 + + - name: Clean up + if: always() + run: docker rm -f "${CONTAINER_NAME}" || true From 04927d2a0ab96b80aaada2ea5779f3a1f2f752ea Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 14:16:20 +0200 Subject: 
[PATCH 15/18] Create readme_arcce_slurm_int_test.md --- test/readme_arcce_slurm_int_test.md | 118 ++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 test/readme_arcce_slurm_int_test.md diff --git a/test/readme_arcce_slurm_int_test.md b/test/readme_arcce_slurm_int_test.md new file mode 100644 index 0000000..e7aff9e --- /dev/null +++ b/test/readme_arcce_slurm_int_test.md @@ -0,0 +1,118 @@ +# ARC CE + SLURM integration test + +Spins up a single Docker container running a NorduGrid **ARC Compute +Element (ARC7)** wired to a single-node **SLURM** batch system, then +drives `arcsub` / `arcstat` / `arcget` against it to prove the whole +submit → monitor → retrieve path works end to end. Designed to run as +a GitHub Actions workflow (build and test), but also runnable +locally with `docker compose`. + +## Layout + +``` +docker/ + Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd + slurm.conf single-node SLURM cluster config + cgroup.conf cgroup config (although dbus is preferred) + arc.conf ARC CE config, LRMS=slurm, REST interface on :443 + bootstrap.sh one-shot startup script (systemd unit runs this) + arc-bootstrap.service systemd unit that runs bootstrap.sh at boot + healthcheck.sh Docker HEALTHCHECK / CI readiness probe +test/ + job.xrsl the test job description (xRSL) + run.sh payload script executed on the SLURM worker + run_integration_test.sh submit -> monitor -> retrieve driver script +.github/workflows/ + integration-test.yml GitHub Actions build + integration-test workflow +docker-compose.yml local equivalent of the CI run +``` + +## How it fits together + +1. 
**Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, + and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, + `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables + `systemd` as PID 1 — this matters because ARC's own tooling + (`arcctl`) and the SLURM/munge packages ship real systemd unit + files, and re-using those is far more reliable than hand-rolling a + supervisor script. + +2. **Container start** (`arc-bootstrap.service`, ordered after + `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: + - waits until `munge` and `sinfo` actually work, + - (re)generates the ARC **Test-CA** and a **host certificate** bound + to the container's *runtime* hostname (`arcctl test-ca hostcert -n + $(hostname) -f`) — this can't be baked into the image at build + time because the build-time hostname is a random ID, not `arc-ce`, + - starts `arc-arex` / `arc-arex-ws` (`arcctl service start + --as-configured`), + - mints a Test-CA **client certificate** for `griduser01` + (`arcctl test-ca usercert --install-user griduser01 -f`), which + `arcctl` automatically whitelists in + `/etc/grid-security/testCA.allowed-subjects` — this is what makes + the CE's default "closed by default" `[authgroup: zero]` accept + that user, + - waits for the REST endpoint to answer and writes `/run/arc-ready`. + +3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` + once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint + responds — the CI job polls this instead of guessing a fixed sleep. + +4. 
**The test itself** (`test/run_integration_test.sh`, run as + `griduser01` inside the container via `docker exec`): + - `arcproxy` — generate a short-lived proxy from the Test-CA user cert + - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable + - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** + - poll `arcstat <jobid>` until `Finished` (or fail fast on + `Failed`/`Killed`) — **monitor** + - `arcget <jobid>` — **retrieve** `stdout.log` and `result.txt`, + then assert their contents + - `arcclean <jobid>` to tidy up + +## Why systemd + `--privileged` + +SLURM's daemons and ARC's `arcctl` assume a normal init system +(starting/stopping via `systemctl`, log rotation, etc). Running +`systemd` as PID 1 inside the container needs elevated privileges to +manage cgroups, so both the GitHub Actions job and local `docker compose` run +the container with `--privileged`. + + +If you can't get a privileged runner, the alternative is to drop +systemd entirely and hand-roll process supervision (e.g. `supervisord` +calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon +binary directly) — more portable, but you lose the packaged unit files +and have to reproduce their startup ordering/flags yourself. + +## Running locally + +```bash +docker compose up --build -d +# watch it come up +docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test +# once "healthy": +docker cp test/. arc-ce-slurm-test:/opt/arc-test/ +docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test +docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh +docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +``` + +## Things you'll likely want to change for a real environment + +- **Package versions**: this pins nothing beyond "ARC7 from EPEL on + EL9". For reproducible CI, pin `nordugrid-arc7-arex-<version>` etc. 
+ explicitly, or build from the upstream NorduGrid repo instead of + EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). +- **Multi-container topology**: this is deliberately an all-in-one + container (CE + SLURM + client in one box) to keep the CI pipeline + simple. For something closer to production, split into an `arc-ce` + service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM + cluster), and a separate `client` container talking to the CE over + the Docker network, sharing a `munge.key` via a named volume. +- **Certificates**: this uses ARC's built-in Test-CA, which is exactly + what it's for (throwaway integration testing). Never use it for + anything reachable from outside your CI network. +- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal + smoke test. Extend them to cover whatever your real batch workloads + look like (multi-core requests, input/output staging from object + storage, RunTime Environments, etc). From c546bbffa3ac771dabb48b00d91e7117429eadb4 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 14:19:53 +0200 Subject: [PATCH 16/18] delete unnecessary files --- .gitlab-ci.yml | 97 ------------- arc-ce-slurm-ci/.gitlab-ci.yml | 89 ------------ arc-ce-slurm-ci/README.md | 144 ------------------- arc-ce-slurm-ci/docker-compose.yml | 19 --- arc-ce-slurm-ci/docker/Dockerfile | 78 ---------- arc-ce-slurm-ci/docker/arc-bootstrap.service | 13 -- arc-ce-slurm-ci/docker/arc.conf | 63 -------- arc-ce-slurm-ci/docker/bootstrap.sh | 61 -------- arc-ce-slurm-ci/docker/cgroup.conf | 3 - arc-ce-slurm-ci/docker/healthcheck.sh | 11 -- arc-ce-slurm-ci/docker/slurm.conf | 36 ----- arc-ce-slurm-ci/test/job.xrsl | 11 -- arc-ce-slurm-ci/test/run.sh | 7 - arc-ce-slurm-ci/test/run_integration_test.sh | 112 --------------- 14 files changed, 744 deletions(-) delete mode 100644 .gitlab-ci.yml delete mode 100644 arc-ce-slurm-ci/.gitlab-ci.yml delete mode 100644 arc-ce-slurm-ci/README.md delete mode 
100644 arc-ce-slurm-ci/docker-compose.yml delete mode 100644 arc-ce-slurm-ci/docker/Dockerfile delete mode 100644 arc-ce-slurm-ci/docker/arc-bootstrap.service delete mode 100644 arc-ce-slurm-ci/docker/arc.conf delete mode 100644 arc-ce-slurm-ci/docker/bootstrap.sh delete mode 100644 arc-ce-slurm-ci/docker/cgroup.conf delete mode 100644 arc-ce-slurm-ci/docker/healthcheck.sh delete mode 100644 arc-ce-slurm-ci/docker/slurm.conf delete mode 100644 arc-ce-slurm-ci/test/job.xrsl delete mode 100644 arc-ce-slurm-ci/test/run.sh delete mode 100644 arc-ce-slurm-ci/test/run_integration_test.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 77a8d18..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,97 +0,0 @@ -stages: - - build - - test - -variables: - # Standard docker-in-docker setup (see explanation in the README) - DOCKER_HOST: tcp://docker:2375 - DOCKER_TLS_CERTDIR: "" - DOCKER_DRIVER: overlay2 - IMAGE_NAME: "${CI_REGISTRY_IMAGE}/arc-ce-slurm-test" - IMAGE_TAG: "${CI_COMMIT_SHORT_SHA}" - CONTAINER_NAME: arc-ce-slurm-test - -# ----------------------------------------------------------------------- -# Build the ARC CE + SLURM image and hand it to the test job as an artifact -# (avoids needing a registry push/pull round-trip just for CI). -# ----------------------------------------------------------------------- -build_image: - stage: build - image: docker:26 - services: - - docker:26-dind - script: - - | - echo "Waiting for Docker daemon..." - for i in $(seq 1 30); do - docker info >/dev/null 2>&1 && break - echo " [$i/30] not ready yet..." 
- sleep 2 - done - docker info - - docker build -t "${IMAGE_NAME}:${IMAGE_TAG}" -f docker/Dockerfile docker/ - - docker save "${IMAGE_NAME}:${IMAGE_TAG}" -o image.tar - artifacts: - paths: - - image.tar - expire_in: 1 hour - -# ----------------------------------------------------------------------- -# Spin the image up as a real (privileged) container, wait for the -# HEALTHCHECK to go green, then drive submit -> monitor -> retrieve -# against it via `docker exec`. -# -# NOTE: this needs a GitLab Runner whose executor is allowed to run -# privileged containers, i.e. in the runner's config.toml: -# -# [runners.docker] -# privileged = true -# -# --privileged is required here because the container runs systemd as -# PID 1 (which arcctl / SLURM's own unit files expect), and systemd -# needs to manage cgroups. -# ----------------------------------------------------------------------- -integration_test: - stage: test - image: docker:26 - services: - - docker:26-dind - needs: - - build_image - before_script: - - docker load -i image.tar - script: - - docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:${IMAGE_TAG}" - - echo "Waiting for ARC CE + SLURM health check..." - - | - status="starting" - for i in $(seq 1 60); do - status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting") - echo " [$i/60] health=${status}" - [ "${status}" = "healthy" ] && break - sleep 5 - done - if [ "${status}" != "healthy" ]; then - echo "Container never became healthy, dumping logs:" - docker logs "${CONTAINER_NAME}" || true - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true - exit 1 - fi - - docker cp test/. 
"${CONTAINER_NAME}:/opt/arc-test/" - - docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test - - docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh - - docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh - after_script: - - docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true - - docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true - - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true - - docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true - - docker rm -f "${CONTAINER_NAME}" || true - artifacts: - when: always - paths: - - container-console.log - - arex.log - - arc-bootstrap.log - - slurmctld.log - expire_in: 1 week diff --git a/arc-ce-slurm-ci/.gitlab-ci.yml b/arc-ce-slurm-ci/.gitlab-ci.yml deleted file mode 100644 index 7b71018..0000000 --- a/arc-ce-slurm-ci/.gitlab-ci.yml +++ /dev/null @@ -1,89 +0,0 @@ -stages: - - build - - test - -variables: - # Standard docker-in-docker setup (see explanation in the README) - DOCKER_HOST: tcp://docker:2375 - DOCKER_TLS_CERTDIR: "" - DOCKER_DRIVER: overlay2 - IMAGE_NAME: "${CI_REGISTRY_IMAGE}/arc-ce-slurm-test" - IMAGE_TAG: "${CI_COMMIT_SHORT_SHA}" - CONTAINER_NAME: arc-ce-slurm-test - -# ----------------------------------------------------------------------- -# Build the ARC CE + SLURM image and hand it to the test job as an artifact -# (avoids needing a registry push/pull round-trip just for CI). 
-# ----------------------------------------------------------------------- -build_image: - stage: build - image: docker:26 - services: - - docker:26-dind - script: - - docker build -t "${IMAGE_NAME}:${IMAGE_TAG}" -f docker/Dockerfile docker/ - - docker save "${IMAGE_NAME}:${IMAGE_TAG}" -o image.tar - artifacts: - paths: - - image.tar - expire_in: 1 hour - -# ----------------------------------------------------------------------- -# Spin the image up as a real (privileged) container, wait for the -# HEALTHCHECK to go green, then drive submit -> monitor -> retrieve -# against it via `docker exec`. -# -# NOTE: this needs a GitLab Runner whose executor is allowed to run -# privileged containers, i.e. in the runner's config.toml: -# -# [runners.docker] -# privileged = true -# -# --privileged is required here because the container runs systemd as -# PID 1 (which arcctl / SLURM's own unit files expect), and systemd -# needs to manage cgroups. -# ----------------------------------------------------------------------- -integration_test: - stage: test - image: docker:26 - services: - - docker:26-dind - needs: - - build_image - before_script: - - docker load -i image.tar - script: - - docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:${IMAGE_TAG}" - - echo "Waiting for ARC CE + SLURM health check..." - - | - status="starting" - for i in $(seq 1 60); do - status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting") - echo " [$i/60] health=${status}" - [ "${status}" = "healthy" ] && break - sleep 5 - done - if [ "${status}" != "healthy" ]; then - echo "Container never became healthy, dumping logs:" - docker logs "${CONTAINER_NAME}" || true - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true - exit 1 - fi - - docker cp test/. 
"${CONTAINER_NAME}:/opt/arc-test/" - - docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test - - docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh - - docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh - after_script: - - docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true - - docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true - - docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true - - docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true - - docker rm -f "${CONTAINER_NAME}" || true - artifacts: - when: always - paths: - - container-console.log - - arex.log - - arc-bootstrap.log - - slurmctld.log - expire_in: 1 week diff --git a/arc-ce-slurm-ci/README.md b/arc-ce-slurm-ci/README.md deleted file mode 100644 index 5983cc6..0000000 --- a/arc-ce-slurm-ci/README.md +++ /dev/null @@ -1,144 +0,0 @@ -# ARC CE + SLURM integration test (GitLab CI) - -Spins up a single Docker container running a NorduGrid **ARC Compute -Element (ARC7)** wired to a single-node **SLURM** batch system, then -drives `arcsub` / `arcstat` / `arcget` against it to prove the whole -submit → monitor → retrieve path works end to end. Designed to run as -a GitLab CI pipeline (build stage + test stage), but also runnable -locally with `docker-compose`. 
- -## Layout - -``` -docker/ - Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd - slurm.conf single-node SLURM cluster config - cgroup.conf cgroups disabled (see note below) - arc.conf ARC CE config, LRMS=slurm, REST interface on :443 - bootstrap.sh one-shot startup script (systemd unit runs this) - arc-bootstrap.service systemd unit that runs bootstrap.sh at boot - healthcheck.sh Docker HEALTHCHECK / CI readiness probe -test/ - job.xrsl the test job description (xRSL) - run.sh payload script executed on the SLURM worker - run_integration_test.sh submit -> monitor -> retrieve driver script -.gitlab-ci.yml build_image + integration_test pipeline -docker-compose.yml local equivalent of the CI run -``` - -## How it fits together - -1. **Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, - and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, - `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables - `systemd` as PID 1 — this matters because ARC's own tooling - (`arcctl`) and the SLURM/munge packages ship real systemd unit - files, and re-using those is far more reliable than hand-rolling a - supervisor script. - -2. 
**Container start** (`arc-bootstrap.service`, ordered after - `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: - - waits until `munge` and `sinfo` actually work, - - (re)generates the ARC **Test-CA** and a **host certificate** bound - to the container's *runtime* hostname (`arcctl test-ca hostcert -n - $(hostname) -f`) — this can't be baked into the image at build - time because the build-time hostname is a random ID, not `arc-ce`, - - starts `arc-arex` / `arc-arex-ws` (`arcctl service start - --as-configured`), - - mints a Test-CA **client certificate** for `griduser01` - (`arcctl test-ca usercert --install-user griduser01 -f`), which - `arcctl` automatically whitelists in - `/etc/grid-security/testCA.allowed-subjects` — this is what makes - the CE's default "closed by default" `[authgroup: zero]` accept - that user, - - waits for the REST endpoint to answer and writes `/run/arc-ready`. - -3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` - once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint - responds — the CI job polls this instead of guessing a fixed sleep. - -4. **The test itself** (`test/run_integration_test.sh`, run as - `griduser01` inside the container via `docker exec`): - - `arcproxy` — generate a short-lived proxy from the Test-CA user cert - - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable - - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** - - poll `arcstat ` until `Finished` (or fail fast on - `Failed`/`Killed`) — **monitor** - - `arcget ` — **retrieve** `stdout.log` and `result.txt`, - then assert their contents - - `arcclean ` to tidy up - -## Why systemd + `--privileged` - -SLURM's daemons and ARC's `arcctl` assume a normal init system -(starting/stopping via `systemctl`, log rotation, etc). Running -`systemd` as PID 1 inside the container needs elevated privileges to -manage cgroups, so both the GitLab job and local `docker-compose` run -the container with `--privileged`. 
- -**In GitLab, this means your Runner's `config.toml` must allow -privileged containers for the `docker:dind` service:** - -```toml -[[runners]] - executor = "docker" - [runners.docker] - privileged = true -``` - -If you can't get a privileged runner, the alternative is to drop -systemd entirely and hand-roll process supervision (e.g. `supervisord` -calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon -binary directly) — more portable, but you lose the packaged unit files -and have to reproduce their startup ordering/flags yourself. - -## Why `cgroup.conf` disables cgroups - -`TaskPlugin=task/none` and `ProctrackType=proctrack/linuxproc` in -`slurm.conf` avoid SLURM's cgroup-based process tracking, which -typically isn't usable inside a CI container even with `--privileged` -unless you also bind-mount the host's cgroup hierarchy. Fine for an -integration test that just proves the plumbing works; not -representative of production resource enforcement. - -## Running locally - -```bash -docker compose up --build -d -# watch it come up -docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test -# once "healthy": -docker cp test/. arc-ce-slurm-test:/opt/arc-test/ -docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test -docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh -docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh -``` - -## Running in GitLab CI - -Just push this repo (or merge these files into yours) with -`.gitlab-ci.yml` at the root. The `build_image` stage builds and saves -the image as a job artifact; `integration_test` loads it, runs it -privileged, waits for the health check, executes the test script -inside the container, and archives ARC/SLURM logs as artifacts -regardless of pass/fail. - -## Things you'll likely want to change for a real environment - -- **Package versions**: this pins nothing beyond "ARC7 from EPEL on - EL9". 
For reproducible CI, pin `nordugrid-arc7-arex-` etc. - explicitly, or build from the upstream NorduGrid repo instead of - EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). -- **Multi-container topology**: this is deliberately an all-in-one - container (CE + SLURM + client in one box) to keep the CI pipeline - simple. For something closer to production, split into an `arc-ce` - service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM - cluster), and a separate `client` container talking to the CE over - the Docker network, sharing a `munge.key` via a named volume. -- **Certificates**: this uses ARC's built-in Test-CA, which is exactly - what it's for (throwaway integration testing). Never use it for - anything reachable from outside your CI network. -- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal - smoke test. Extend them to cover whatever your real batch workloads - look like (multi-core requests, input/output staging from object - storage, RunTime Environments, etc). 
diff --git a/arc-ce-slurm-ci/docker-compose.yml b/arc-ce-slurm-ci/docker-compose.yml deleted file mode 100644 index 3f6091f..0000000 --- a/arc-ce-slurm-ci/docker-compose.yml +++ /dev/null @@ -1,19 +0,0 @@ -version: "3.8" - -services: - arc-ce: - build: - context: ./docker - dockerfile: Dockerfile - image: arc-ce-slurm-test:local - container_name: arc-ce-slurm-test - hostname: arc-ce - privileged: true # needed for systemd-as-PID1 (see README) - ports: - - "8443:443" - healthcheck: - test: ["CMD", "/usr/local/bin/healthcheck.sh"] - interval: 5s - timeout: 5s - start_period: 90s - retries: 30 diff --git a/arc-ce-slurm-ci/docker/Dockerfile b/arc-ce-slurm-ci/docker/Dockerfile deleted file mode 100644 index 1e97186..0000000 --- a/arc-ce-slurm-ci/docker/Dockerfile +++ /dev/null @@ -1,78 +0,0 @@ -# ============================================================================= -# ARC CE + SLURM "all-in-one" image for integration testing -# -# Runs, as real systemd-managed services (this is what arcctl/ARC packaging -# expects and is far more robust than re-implementing service supervision): -# - munge (SLURM auth) -# - slurmctld + slurmd (single-node SLURM cluster, one fake node) -# - arc-arex + arc-arex-ws (NorduGrid ARC CE, LRMS backend = slurm) -# -# Base: AlmaLinux 9. On EL9, ARC7 ships in EPEL as nordugrid-arc7-*. -# -# IMPORTANT: this container runs systemd as PID 1, which needs either -# docker run --privileged -# or (rootless-friendlier) -# docker run --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw -# See the .gitlab-ci.yml in this repo for the CI-side flags. 
-# ============================================================================= -FROM almalinux:9 - -ENV container=docker - -# --------------------------------------------------------------------------- -# systemd + EPEL + SLURM + munge + ARC CE + ARC client tools -# --------------------------------------------------------------------------- -RUN dnf -y install epel-release && \ - dnf -y update && \ - dnf -y install systemd crypto-policies-scripts && \ - update-crypto-policies --set LEGACY && \ - dnf -y install \ - munge munge-libs \ - slurm slurm-slurmctld slurm-slurmd \ - openssl ca-certificates \ - procps-ng iproute net-tools which curl jq \ - && \ - dnf -y install \ - nordugrid-arc7-arex \ - nordugrid-arc7-client \ - nordugrid-arc7-arcctl \ - && \ - dnf clean all && \ - # standard "systemd in docker" cleanup: mask units that don't apply / fail in containers - (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \ - rm -f /lib/systemd/system/multi-user.target.wants/* ; \ - rm -f /etc/systemd/system/*.wants/* ; \ - rm -f /lib/systemd/system/local-fs.target.wants/* ; \ - rm -f /lib/systemd/system/sockets.target.wants/*udev* ; \ - rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; \ - rm -f /lib/systemd/system/basic.target.wants/* ; \ - rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true - -# --------------------------------------------------------------------------- -# Unprivileged pool account that grid jobs get mapped to + matching SLURM node -# --------------------------------------------------------------------------- -RUN useradd -m -s /bin/bash griduser01 - -# --------------------------------------------------------------------------- -# Config files -# --------------------------------------------------------------------------- -COPY slurm.conf /etc/slurm/slurm.conf -COPY cgroup.conf /etc/slurm/cgroup.conf -COPY arc.conf /etc/arc.conf - -COPY bootstrap.sh 
/usr/local/bin/bootstrap.sh -COPY healthcheck.sh /usr/local/bin/healthcheck.sh -COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service - -RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ - chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ - chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \ - systemctl enable munge slurmctld slurmd arc-bootstrap.service - -EXPOSE 443 - -HEALTHCHECK --interval=5s --timeout=5s --start-period=90s --retries=30 \ - CMD /usr/local/bin/healthcheck.sh - -STOPSIGNAL SIGRTMIN+3 -CMD ["/usr/sbin/init"] diff --git a/arc-ce-slurm-ci/docker/arc-bootstrap.service b/arc-ce-slurm-ci/docker/arc-bootstrap.service deleted file mode 100644 index 457b56f..0000000 --- a/arc-ce-slurm-ci/docker/arc-bootstrap.service +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert -After=network.target munge.service slurmctld.service slurmd.service -Wants=munge.service slurmctld.service slurmd.service -ConditionPathExists=!/run/arc-ready - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/usr/local/bin/bootstrap.sh - -[Install] -WantedBy=multi-user.target diff --git a/arc-ce-slurm-ci/docker/arc.conf b/arc-ce-slurm-ci/docker/arc.conf deleted file mode 100644 index 8fc09aa..0000000 --- a/arc-ce-slurm-ci/docker/arc.conf +++ /dev/null @@ -1,63 +0,0 @@ -# ============================================================================= -# /etc/arc.conf - ARC7 CE configuration for the integration-test container -# -# This intentionally overrides the packaged "zero configuration" so we have -# an explicit, reviewable setup. 
Cross-check option names against your -# installed ARC version's reference doc if you bump versions: -# /usr/share/doc/nordugrid-arc7-arex/arc.conf.reference -# https://www.nordugrid.org/arc/arc7/admins/reference.html -# ============================================================================= - -[common] -hostname = arc-ce -x509_user_key = /etc/grid-security/hostkey.pem -x509_user_cert = /etc/grid-security/hostcert.pem -x509_cert_dir = /etc/grid-security/certificates - -# ----------------------------------------------------------------------- -# LRMS: point A-REX at the local single-node SLURM cluster -# ----------------------------------------------------------------------- -[lrms] -lrms = slurm -slurm_use_sacct = yes -slurm_wakeupperiod = 5 - -[queue: main] -comment = CI integration-test queue backed by local SLURM partition "main" - -# ----------------------------------------------------------------------- -# Authorization: accept anyone holding a cert signed by the ARC Test-CA -# that arcctl generates at install time. The test client cert we mint in -# entrypoint.sh (arcctl test-ca usercert) is auto-appended to -# testCA.allowed-subjects, which is what the "zero" authgroup checks. 
-# ----------------------------------------------------------------------- -[authgroup: zero] - -[mapping] -# every request authorized via the "zero" authgroup runs as griduser01 -map_to_user = zero griduser01 - -# ----------------------------------------------------------------------- -# A-REX core: where jobs' control/session data live -# ----------------------------------------------------------------------- -[arex] -user = root -controldir = /var/spool/arc/jobstatus -sessiondir = /var/spool/arc/sessiondir -runtimedir = /usr/share/arc/rte -delegationdb = sqlite - -[arex/ws] -wsurl = https://arc-ce/arex -allowaccess = zero - -[arex/ws/jobs] -allowaccess = zero - -# Data staging can be minimal for a CI job that just echoes something, -# but we enable it so xRSL inputfiles/outputfiles work if you extend the -# test job later. -[arex/data-staging] - -[arex/cache] -cachedir = /var/spool/arc/cache diff --git a/arc-ce-slurm-ci/docker/bootstrap.sh b/arc-ce-slurm-ci/docker/bootstrap.sh deleted file mode 100644 index 02fc606..0000000 --- a/arc-ce-slurm-ci/docker/bootstrap.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# ============================================================================= -# Runs once at container start (via arc-bootstrap.service, after munge, -# slurmctld and slurmd units). Responsibilities: -# 1. Wait for munge + SLURM to be actually usable -# 2. (Re)generate the ARC Test-CA and a host certificate bound to this -# container's *runtime* hostname (image build time hostname is random, -# so we can't bake a valid host cert into the image itself) -# 3. Start arc-arex / arc-arex-ws as configured in /etc/arc.conf -# 4. Mint a Test-CA user certificate for griduser01, which arcctl -# automatically whitelists in /etc/grid-security/testCA.allowed-subjects -# 5. 
Wait for the REST endpoint to answer, then signal readiness -# ============================================================================= -set -euo pipefail -LOG=/var/log/arc-bootstrap.log -exec > >(tee -a "$LOG") 2>&1 - -echo "== ARC CE / SLURM bootstrap starting: $(date -u) ==" - -HOSTNAME_FQDN="$(hostname)" -echo "Using hostname: ${HOSTNAME_FQDN}" - -wait_for() { - local desc="$1"; shift - local tries=0 - until "$@" >/dev/null 2>&1; do - tries=$((tries + 1)) - if [ "$tries" -gt 90 ]; then - echo "TIMED OUT waiting for: ${desc}" - return 1 - fi - sleep 2 - done - echo "${desc}: ready (after ${tries} tries)" -} - -# --- 1. munge, then SLURM control daemon ------------------------------------ -wait_for "munge" bash -c 'echo bootstrap-check | munge | unmunge' -wait_for "slurmctld (sinfo)" sinfo -h - -# --- 2. Test-CA + host certificate for the real runtime hostname ------------ -arcctl test-ca init -f -arcctl test-ca hostcert -n "${HOSTNAME_FQDN}" -f - -# --- 3. Start ARC CE services ------------------------------------------------- -arcctl service start --as-configured - -# --- 4. Test client certificate for griduser01 ------------------------------- -arcctl test-ca usercert --install-user griduser01 -f - -# Also export a portable tarball, useful if the GitLab job wants to drive -# arcsub/arcstat/arcget from *outside* this container (e.g. from the -# job's own shell talking to the CE over the docker network). -arcctl test-ca usercert -n griduser01 --export-tar -f || true -mv -f testcert-*.tar.gz /root/arc-test-client.tar.gz 2>/dev/null || true - -# --- 5. 
Wait until the REST endpoint actually answers ------------------------ -wait_for "arex REST endpoint" curl -sk "https://${HOSTNAME_FQDN}/arex/rest/1.0/info" - -touch /run/arc-ready -echo "== ARC CE / SLURM bootstrap complete: $(date -u) ==" diff --git a/arc-ce-slurm-ci/docker/cgroup.conf b/arc-ce-slurm-ci/docker/cgroup.conf deleted file mode 100644 index 4ca269d..0000000 --- a/arc-ce-slurm-ci/docker/cgroup.conf +++ /dev/null @@ -1,3 +0,0 @@ -# cgroup constraints are disabled: see slurm.conf comment. -# Kept here only because slurmd/slurmctld expect the file to exist. -CgroupPlugin=disabled diff --git a/arc-ce-slurm-ci/docker/healthcheck.sh b/arc-ce-slurm-ci/docker/healthcheck.sh deleted file mode 100644 index bc47b09..0000000 --- a/arc-ce-slurm-ci/docker/healthcheck.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Used by Dockerfile HEALTHCHECK and by the GitLab CI "wait for CE" step. -set -o pipefail - -[ -f /run/arc-ready ] || exit 1 - -sinfo -h >/dev/null 2>&1 || exit 1 - -curl -sk --max-time 3 -o /dev/null "https://$(hostname)/arex/rest/1.0/info" || exit 1 - -exit 0 diff --git a/arc-ce-slurm-ci/docker/slurm.conf b/arc-ce-slurm-ci/docker/slurm.conf deleted file mode 100644 index 23099c7..0000000 --- a/arc-ce-slurm-ci/docker/slurm.conf +++ /dev/null @@ -1,36 +0,0 @@ -# ============================================================================= -# slurm.conf - single-node SLURM cluster for CI/integration testing -# -# NOTE: TaskPlugin=task/none and ProctrackType=proctrack/linuxproc are used -# instead of the cgroup-based plugins because GitLab CI docker executors -# usually do NOT grant access to the host cgroup hierarchy needed by -# proctrack/cgroup or task/cgroup. If your runner is privileged and mounts -# /sys/fs/cgroup read-write, you can switch to the cgroup plugins for more -# realistic resource accounting. 
-# ============================================================================= -ClusterName=citest -SlurmctldHost=localhost - -SlurmUser=slurm -SlurmctldPort=6817 -SlurmdPort=6818 -AuthType=auth/munge -StateSaveLocation=/var/spool/slurmctld -SlurmdSpoolDir=/var/spool/slurmd -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmdPidFile=/var/run/slurmd.pid -SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdLogFile=/var/log/slurm/slurmd.log - -ProctrackType=proctrack/linuxproc -TaskPlugin=task/none -SelectType=select/cons_tres -SelectTypeParameters=CR_Core_Memory - -SchedulerType=sched/backfill -ReturnToService=2 -MpiDefault=none - -# --- Node & partition: one fake compute node backing the whole cluster --- -NodeName=cnode01 NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN -PartitionName=main Nodes=cnode01 Default=YES MaxTime=INFINITE State=UP diff --git a/arc-ce-slurm-ci/test/job.xrsl b/arc-ce-slurm-ci/test/job.xrsl deleted file mode 100644 index 11d34c7..0000000 --- a/arc-ce-slurm-ci/test/job.xrsl +++ /dev/null @@ -1,11 +0,0 @@ -& -(executable = "/bin/sh") -(arguments = "run.sh") -(inputFiles = ("run.sh" "run.sh")) -(jobname = "ci-integration-test") -(stdout = "stdout.log") -(stderr = "stderr.log") -(outputFiles = ("result.txt" "")) -(queue = "main") -(walltime = "5") -(memory = "256") diff --git a/arc-ce-slurm-ci/test/run.sh b/arc-ce-slurm-ci/test/run.sh deleted file mode 100644 index 66f9bea..0000000 --- a/arc-ce-slurm-ci/test/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -echo "Running on host: $(hostname)" -echo "Running as user: $(id -un)" -echo "SLURM_JOB_ID=${SLURM_JOB_ID:-unset}" -date -sleep 2 -echo "ok $(date -u +%FT%TZ)" > result.txt diff --git a/arc-ce-slurm-ci/test/run_integration_test.sh b/arc-ce-slurm-ci/test/run_integration_test.sh deleted file mode 100644 index e4c5272..0000000 --- a/arc-ce-slurm-ci/test/run_integration_test.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -# 
============================================================================= -# Integration test: submit / monitor / retrieve a job through ARC CE, backed -# by SLURM. Meant to run *inside* the arc-ce-slurm container as griduser01 -# (that's who the test client cert + queue mapping point to), e.g.: -# -# docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh -# -# Exit code 0 = pass, non-zero = fail (so GitLab CI can key off it directly). -# ============================================================================= -set -euo pipefail - -CE_HOST="$(hostname)" -CE_ENDPOINT="https://${CE_HOST}/arex" -JOB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -WORKDIR="$(mktemp -d)" -OUTDIR="${WORKDIR}/output" -POLL_INTERVAL=3 -POLL_TIMEOUT=180 - -log() { echo "[$(date -u +%T)] $*"; } - -fail() { log "FAIL: $*"; exit 1; } - -trap 'log "cleaning up ${WORKDIR}"; rm -rf "${WORKDIR}"' EXIT - -cd "${WORKDIR}" -cp "${JOB_DIR}/job.xrsl" . -cp "${JOB_DIR}/run.sh" . - -# ----------------------------------------------------------------------- -# 0. Sanity: we need a proxy. arcproxy reads cert/key from ~/.globus by -# default, which is exactly where `arcctl test-ca usercert --install-user` -# put them during bootstrap. -# ----------------------------------------------------------------------- -log "Generating proxy certificate for $(id -un)" -arcproxy || fail "arcproxy failed - is ~/.globus/usercert.pem present?" - -log "Querying CE info endpoint: ${CE_ENDPOINT}" -arcinfo -C "${CE_ENDPOINT}" || fail "arcinfo could not reach ${CE_ENDPOINT}" - -# ----------------------------------------------------------------------- -# 1. 
SUBMIT -# ----------------------------------------------------------------------- -log "Submitting job.xrsl to ${CE_ENDPOINT}" -SUBMIT_OUTPUT="$(arcsub -C "${CE_ENDPOINT}" job.xrsl 2>&1)" || { - echo "${SUBMIT_OUTPUT}" - fail "arcsub did not succeed" -} -echo "${SUBMIT_OUTPUT}" - -JOB_ID="$(echo "${SUBMIT_OUTPUT}" | grep -oE 'https://[^ ]+/jobs/[A-Za-z0-9]+' | head -n1)" -[ -n "${JOB_ID}" ] || fail "could not parse job id out of arcsub output" -log "Job submitted: ${JOB_ID}" - -# ----------------------------------------------------------------------- -# 2. MONITOR -# ----------------------------------------------------------------------- -log "Polling job state (timeout ${POLL_TIMEOUT}s)" -elapsed=0 -STATE="" -while [ "${elapsed}" -lt "${POLL_TIMEOUT}" ]; do - STAT_OUTPUT="$(arcstat "${JOB_ID}" 2>&1 || true)" - STATE="$(echo "${STAT_OUTPUT}" | awk -F': ' '/State:/{print $2; exit}')" - log "state=${STATE:-unknown}" - case "${STATE}" in - Finished|FINISHED) - break - ;; - Failed|FAILED|Killed|KILLED|Deleted) - echo "${STAT_OUTPUT}" - fail "job entered terminal failure state: ${STATE}" - ;; - esac - sleep "${POLL_INTERVAL}" - elapsed=$((elapsed + POLL_INTERVAL)) -done - -[ "${STATE}" = "Finished" ] || [ "${STATE}" = "FINISHED" ] || { - arcstat "${JOB_ID}" || true - arcctl job log "$(basename "${JOB_ID}")" --service || true - fail "job did not reach Finished state within ${POLL_TIMEOUT}s (last state: ${STATE:-unknown})" -} -log "Job reached Finished state" - -# ----------------------------------------------------------------------- -# 3. 
RETRIEVE -# ----------------------------------------------------------------------- -mkdir -p "${OUTDIR}" -log "Retrieving output with arcget into ${OUTDIR}" -( cd "${OUTDIR}" && arcget "${JOB_ID}" ) || fail "arcget failed" - -RESULT_FILE="$(find "${OUTDIR}" -name result.txt | head -n1)" -STDOUT_FILE="$(find "${OUTDIR}" -name stdout.log | head -n1)" - -[ -n "${RESULT_FILE}" ] || fail "result.txt was not retrieved" -[ -n "${STDOUT_FILE}" ] || fail "stdout.log was not retrieved" - -grep -q '^ok ' "${RESULT_FILE}" || fail "result.txt did not contain expected content: $(cat "${RESULT_FILE}")" -grep -q 'Running on host' "${STDOUT_FILE}" || fail "stdout.log missing expected marker" - -log "Output content:" -cat "${STDOUT_FILE}" -cat "${RESULT_FILE}" - -# ----------------------------------------------------------------------- -# 4. Cleanup the job from A-REX bookkeeping (not strictly required, but -# keeps repeated CI runs tidy) -# ----------------------------------------------------------------------- -arcclean "${JOB_ID}" || log "warning: arcclean failed (non-fatal)" - -log "PASS: submit -> monitor -> retrieve integration test succeeded" From e8f457d90a6fe8c58c3c82ab13dc2e522da78d22 Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 14:24:04 +0200 Subject: [PATCH 17/18] Update README.md --- README.md | 196 +++++++++++++++--------------------------------------- 1 file changed, 52 insertions(+), 144 deletions(-) diff --git a/README.md b/README.md index 5983cc6..d6c977f 100644 --- a/README.md +++ b/README.md @@ -1,144 +1,52 @@ -# ARC CE + SLURM integration test (GitLab CI) - -Spins up a single Docker container running a NorduGrid **ARC Compute -Element (ARC7)** wired to a single-node **SLURM** batch system, then -drives `arcsub` / `arcstat` / `arcget` against it to prove the whole -submit → monitor → retrieve path works end to end. Designed to run as -a GitLab CI pipeline (build stage + test stage), but also runnable -locally with `docker-compose`. 
- -## Layout - -``` -docker/ - Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd - slurm.conf single-node SLURM cluster config - cgroup.conf cgroups disabled (see note below) - arc.conf ARC CE config, LRMS=slurm, REST interface on :443 - bootstrap.sh one-shot startup script (systemd unit runs this) - arc-bootstrap.service systemd unit that runs bootstrap.sh at boot - healthcheck.sh Docker HEALTHCHECK / CI readiness probe -test/ - job.xrsl the test job description (xRSL) - run.sh payload script executed on the SLURM worker - run_integration_test.sh submit -> monitor -> retrieve driver script -.gitlab-ci.yml build_image + integration_test pipeline -docker-compose.yml local equivalent of the CI run -``` - -## How it fits together - -1. **Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, - and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, - `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables - `systemd` as PID 1 — this matters because ARC's own tooling - (`arcctl`) and the SLURM/munge packages ship real systemd unit - files, and re-using those is far more reliable than hand-rolling a - supervisor script. - -2. 
**Container start** (`arc-bootstrap.service`, ordered after - `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: - - waits until `munge` and `sinfo` actually work, - - (re)generates the ARC **Test-CA** and a **host certificate** bound - to the container's *runtime* hostname (`arcctl test-ca hostcert -n - $(hostname) -f`) — this can't be baked into the image at build - time because the build-time hostname is a random ID, not `arc-ce`, - - starts `arc-arex` / `arc-arex-ws` (`arcctl service start - --as-configured`), - - mints a Test-CA **client certificate** for `griduser01` - (`arcctl test-ca usercert --install-user griduser01 -f`), which - `arcctl` automatically whitelists in - `/etc/grid-security/testCA.allowed-subjects` — this is what makes - the CE's default "closed by default" `[authgroup: zero]` accept - that user, - - waits for the REST endpoint to answer and writes `/run/arc-ready`. - -3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` - once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint - responds — the CI job polls this instead of guessing a fixed sleep. - -4. **The test itself** (`test/run_integration_test.sh`, run as - `griduser01` inside the container via `docker exec`): - - `arcproxy` — generate a short-lived proxy from the Test-CA user cert - - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable - - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** - - poll `arcstat ` until `Finished` (or fail fast on - `Failed`/`Killed`) — **monitor** - - `arcget ` — **retrieve** `stdout.log` and `result.txt`, - then assert their contents - - `arcclean ` to tidy up - -## Why systemd + `--privileged` - -SLURM's daemons and ARC's `arcctl` assume a normal init system -(starting/stopping via `systemctl`, log rotation, etc). Running -`systemd` as PID 1 inside the container needs elevated privileges to -manage cgroups, so both the GitLab job and local `docker-compose` run -the container with `--privileged`. 
- -**In GitLab, this means your Runner's `config.toml` must allow -privileged containers for the `docker:dind` service:** - -```toml -[[runners]] - executor = "docker" - [runners.docker] - privileged = true -``` - -If you can't get a privileged runner, the alternative is to drop -systemd entirely and hand-roll process supervision (e.g. `supervisord` -calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon -binary directly) — more portable, but you lose the packaged unit files -and have to reproduce their startup ordering/flags yourself. - -## Why `cgroup.conf` disables cgroups - -`TaskPlugin=task/none` and `ProctrackType=proctrack/linuxproc` in -`slurm.conf` avoid SLURM's cgroup-based process tracking, which -typically isn't usable inside a CI container even with `--privileged` -unless you also bind-mount the host's cgroup hierarchy. Fine for an -integration test that just proves the plumbing works; not -representative of production resource enforcement. - -## Running locally - -```bash -docker compose up --build -d -# watch it come up -docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test -# once "healthy": -docker cp test/. arc-ce-slurm-test:/opt/arc-test/ -docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test -docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh -docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh -``` - -## Running in GitLab CI - -Just push this repo (or merge these files into yours) with -`.gitlab-ci.yml` at the root. The `build_image` stage builds and saves -the image as a job artifact; `integration_test` loads it, runs it -privileged, waits for the health check, executes the test script -inside the container, and archives ARC/SLURM logs as artifacts -regardless of pass/fail. - -## Things you'll likely want to change for a real environment - -- **Package versions**: this pins nothing beyond "ARC7 from EPEL on - EL9". 
For reproducible CI, pin `nordugrid-arc7-arex-` etc. - explicitly, or build from the upstream NorduGrid repo instead of - EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). -- **Multi-container topology**: this is deliberately an all-in-one - container (CE + SLURM + client in one box) to keep the CI pipeline - simple. For something closer to production, split into an `arc-ce` - service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM - cluster), and a separate `client` container talking to the CE over - the Docker network, sharing a `munge.key` via a named volume. -- **Certificates**: this uses ARC's built-in Test-CA, which is exactly - what it's for (throwaway integration testing). Never use it for - anything reachable from outside your CI network. -- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal - smoke test. Extend them to cover whatever your real batch workloads - look like (multi-core requests, input/output staging from object - storage, RunTime Environments, etc). +

+ interCEde +

+ +

+ Unified interfaces to Computing Elements and batch systems for DIRAC / DiracX and beyond —
+ submit, monitor, retrieve — validated against containerized backends.
+

+ +

+ CI + PyPI + Python versions + License +

+ +--- + +## What is interCEde? + +**interCEde** sits between Workload Management Systems, such as [DiracX](https://github.com/DIRACGrid/diracx), and the +many resources where jobs actually run. It provides a single, consistent interface for +talking to **Computing Elements (CEs)** and **batch systems** — submitting jobs, querying +their status, and retrieving their outputs — regardless of which backend is on the other +end. + +The name is the job description: the library *intercedes* on the WMS's behalf, acting between +two parties so the rest of the stack never has to know whether it is talking to ARC, +HTCondor, Slurm over SSH, or a process on the local machine. + +Every interface ships with **integration tests that run against containerized instances** +of the real backends, so a given CE type is verified against multiple versions and +configurations rather than against a mock that drifts from reality. + +## Why it exists + +- **One contract, many backends.** Calling code submits a job the same way everywhere; the + backend-specific quirks live behind the interface. +- **Composable resources.** Backends combine — `SSH + Slurm`, `SSH + HTCondor`, + `ARC + HTCondor`, or a plain `local` runner — and each combination is just another + implementation of the same interface. +- **Tested against the real thing.** Containerized Slurm, HTCondor, and ARC instances are + spun up in CI so behavior is checked against actual schedulers, not stubs. +- **Version coverage.** The same test suite runs across a matrix of backend versions to + catch incompatibilities before they reach production. + +## Relationship to DIRAC / DiracX + +interCEde is part of the [DIRACGrid](https://github.com/DIRACGrid) ecosystem and is designed +to back the Computing Element layer used by [DiracX](https://github.com/DIRACGrid/diracx). +It can also be used standalone wherever a uniform interface to heterogeneous CEs and batch +systems is useful.
From 78a11b3b1f6321ea36adbcb35a72735185d89dab Mon Sep 17 00:00:00 2001 From: Vijay Chakravarty Date: Thu, 2 Jul 2026 16:36:26 +0200 Subject: [PATCH 18/18] remove quotes from infosys --- docker/arc.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/arc.conf b/docker/arc.conf index c9bbe91..dfd9886 100644 --- a/docker/arc.conf +++ b/docker/arc.conf @@ -64,5 +64,5 @@ cachedir = /var/spool/arc/cache [infosys/glue2] [infosys/cluster] -cluster_alias = "ARC CE + SLURM CI integration test cluster" -comment = "Single-node, all-in-one container used for CI testing only" +alias =ARC CE and SLURM CI integration test cluster +comment =Single-node all-in-one container used for CI testing only