diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
new file mode 100644
index 0000000..08e6382
--- /dev/null
+++ b/.github/workflows/integration-test.yml
@@ -0,0 +1,97 @@
+name: arc-ce-slurm-integration-test
+
+on:
+  push:
+    branches: [main, add-arc-ce-slurm-ci]
+  pull_request:
+  workflow_dispatch:
+
+# Least privilege: the job only needs to read the repository contents.
+permissions:
+  contents: read
+
+env:
+  IMAGE_NAME: arc-ce-slurm-test
+  CONTAINER_NAME: arc-ce-slurm-test
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    # Bound the whole job so a wedged build or container cannot occupy a
+    # runner until GitHub's default job timeout.
+    timeout-minutes: 45
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+
+      - name: Build image
+        run: docker build -t "${IMAGE_NAME}:local" -f docker/Dockerfile docker/
+
+      # --privileged: this container runs systemd as PID 1 (needed by
+      # arcctl / SLURM's own unit files, and by slurmd's cgroup-scope
+      # setup via dbus-broker), which needs real cgroup access.
+      # GitHub-hosted runners allow this directly - no runner config
+      # or cluster admin approval needed, unlike a locked-down
+      # Kubernetes-executor GitLab runner.
+      - name: Run container
+        run: |
+          docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:local"
+
+      - name: Wait for health check
+        run: |
+          status="starting"
+          for i in $(seq 1 60); do
+            status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting")
+            echo " [$i/60] health=${status}"
+            [ "${status}" = "healthy" ] && break
+            # Stop polling as soon as the container itself has exited: it can
+            # no longer turn healthy, so waiting out the loop is wasted time.
+            running=$(docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null || echo "false")
+            if [ "${running}" != "true" ]; then
+              echo "::error::Container is no longer running"
+              break
+            fi
+            sleep 5
+          done
+          if [ "${status}" != "healthy" ]; then
+            echo "::error::Container never became healthy"
+            docker logs "${CONTAINER_NAME}" || true
+            docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true
+            exit 1
+          fi
+
+      - name: Run integration test (submit -> monitor -> retrieve)
+        run: |
+          # Create the destination first so `docker cp test/.` always copies
+          # into an existing directory, whatever the image happens to ship.
+          docker exec "${CONTAINER_NAME}" mkdir -p /opt/arc-test
+          docker cp test/. "${CONTAINER_NAME}:/opt/arc-test/"
+          docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test
+          docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh
+          docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh
+
+      - name: Collect logs
+        if: always()
+        run: |
+          docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true
+          docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true
+          docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true
+          docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true
+          docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmd.log > slurmd.log 2>/dev/null || true
+
+      - name: Upload logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: arc-ce-slurm-logs
+          path: |
+            container-console.log
+            arex.log
+            arc-bootstrap.log
+            slurmctld.log
+            slurmd.log
+          retention-days: 7
+
+      - name: Clean up
+        if: always()
+        run: docker rm -f "${CONTAINER_NAME}" || true
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..3f6091f
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,18 @@
+# No top-level `version:` key: Compose ignores it and warns that it is obsolete.
+services:
+  arc-ce:
+    build:
+      context: ./docker
+      dockerfile: Dockerfile
+    image: arc-ce-slurm-test:local
+    container_name: arc-ce-slurm-test
+    hostname: arc-ce
+    privileged: true # needed for systemd-as-PID1 (see README)
+    ports:
+      - "8443:443"
+    healthcheck:
+      test: ["CMD", "/usr/local/bin/healthcheck.sh"]
+      interval: 5s
+      timeout: 5s
+      start_period: 90s
+      retries: 30
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..d4f0ab3
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,144 @@
+# =============================================================================
+# ARC CE + SLURM "all-in-one" image for integration testing
+#
+# Runs, as real systemd-managed
services (this is what arcctl/ARC packaging
+# expects and is far more robust than re-implementing service supervision):
+#   - munge (SLURM auth)
+#   - slurmctld + slurmd (single-node SLURM cluster, one fake node)
+#   - arc-arex + arc-arex-ws (NorduGrid ARC CE, LRMS backend = slurm)
+#
+# Base: AlmaLinux 9. On EL9, ARC7 ships in EPEL as nordugrid-arc7-*.
+#
+# IMPORTANT: this container runs systemd as PID 1, which needs either
+#     docker run --privileged
+# or (rootless-friendlier)
+#     docker run --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw
+# See .github/workflows/integration-test.yml in this repo for the CI-side flags.
+# =============================================================================
+FROM almalinux:9
+
+ENV container=docker
+
+# ---------------------------------------------------------------------------
+# systemd + EPEL + SLURM + munge + ARC CE + ARC client tools
+# Split into separate RUN layers on purpose: easier to see exactly which
+# step fails, and a failure in one can't be silently absorbed by a long
+# && chain (which is what happened with the curl/curl-minimal conflict
+# below before this was split out).
+# ---------------------------------------------------------------------------
+RUN dnf -y install epel-release dnf-plugins-core && \
+    dnf config-manager --set-enabled crb
+
+RUN dnf -y update
+
+RUN dnf -y install systemd crypto-policies-scripts dbus-broker && \
+    update-crypto-policies --set LEGACY
+
+# --allowerasing: AlmaLinux's base image ships curl-minimal, which
+# conflicts with the full curl package. Let dnf swap it out rather than
+# aborting the whole transaction (this was silently dropping munge/slurm
+# from the install set entirely).
+RUN dnf -y install --allowerasing --setopt=strict=1 \
+    munge munge-libs \
+    slurm slurm-slurmctld slurm-slurmd \
+    openssl ca-certificates \
+    procps-ng iproute net-tools which curl jq
+
+RUN dnf -y install --allowerasing --setopt=strict=1 \
+    nordugrid-arc7-arex \
+    nordugrid-arc7-client \
+    nordugrid-arc7-arcctl
+
+RUN dnf clean all
+
+# Hard assertion, isolated in its own layer: this MUST fail the build
+# (visibly, as its own numbered step) if any package didn't actually land.
+RUN rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \
+    nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl
+
+# munge's package should auto-generate /etc/munge/munge.key and create the
+# "munge" system user via its post-install scriptlet, but neither reliably
+# happens in this build environment - so do both explicitly.
+RUN if ! getent group munge >/dev/null; then groupadd -r munge; fi && \
+    if ! getent passwd munge >/dev/null; then \
+        useradd -r -g munge -d /etc/munge -s /sbin/nologin -c "MUNGE Uid 'N' Gid Emporium" munge; \
+    fi && \
+    mkdir -p /etc/munge /var/lib/munge /var/log/munge /run/munge && \
+    chown munge:munge /etc/munge /var/lib/munge /var/log/munge /run/munge && \
+    chmod 0700 /etc/munge && \
+    /usr/sbin/create-munge-key -f && \
+    chown munge:munge /etc/munge/munge.key && \
+    chmod 0400 /etc/munge/munge.key
+
+# standard "systemd in docker" cleanup: mask units that don't apply / fail in containers
+# `cd ... &&`: if that directory is ever missing, skip the loop instead of
+# letting `rm -f` run against the files of whatever the current directory is.
+RUN (cd /lib/systemd/system/sysinit.target.wants/ && for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \
+    rm -f /lib/systemd/system/multi-user.target.wants/* ; \
+    rm -f /etc/systemd/system/*.wants/* ; \
+    rm -f /lib/systemd/system/local-fs.target.wants/* ; \
+    rm -f /lib/systemd/system/sockets.target.wants/*udev* ; \
+    rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; \
+    rm -f /lib/systemd/system/basic.target.wants/* ; \
+    rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true
+
+# ---------------------------------------------------------------------------
+# slurm service account (EPEL's slurm packages do NOT create this user
+# automatically, unlike some other distros' packaging - has to be done here)
+# uid/gid 990 is preferred; if an account created by the package installs
+# above already holds that id, fall back to an automatically assigned one.
+# ---------------------------------------------------------------------------
+RUN { getent group slurm >/dev/null || groupadd -r -g 990 slurm 2>/dev/null || groupadd -r slurm; } && \
+    { getent passwd slurm >/dev/null || \
+      useradd -r -c "SLURM workload manager" -d /var/lib/slurm -u 990 -g slurm -s /sbin/nologin slurm 2>/dev/null || \
+      useradd -r -c "SLURM workload manager" -d /var/lib/slurm -g slurm -s /sbin/nologin slurm; } && \
+    mkdir -p /var/lib/slurm && chown slurm:slurm /var/lib/slurm
+
+# ---------------------------------------------------------------------------
+# Unprivileged pool account that grid jobs get mapped to + matching SLURM node
+# ---------------------------------------------------------------------------
+RUN useradd -m -s /bin/bash griduser01
+
+# ---------------------------------------------------------------------------
+# Config files
+# ---------------------------------------------------------------------------
+COPY slurm.conf /etc/slurm/slurm.conf
+COPY cgroup.conf /etc/slurm/cgroup.conf
+COPY arc.conf /etc/arc.conf
+
+COPY bootstrap.sh /usr/local/bin/bootstrap.sh
+COPY healthcheck.sh /usr/local/bin/healthcheck.sh
+COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service
+
+# Debug aid (non-fatal): shows exactly what's installed for munge/slurm
+# and whether they shipped systemd units. Read this in the build log if
+# bootstrap.sh ever times out waiting for munge/slurmctld/slurmd later.
+RUN set +e; \
+    echo "== rpm -q (are the packages even installed?) =="; \
+    rpm -q munge slurm slurm-slurmctld slurm-slurmd; \
+    echo "== munge file list =="; rpm -ql munge 2>&1; \
+    echo "== slurm file list =="; rpm -ql slurm 2>&1; \
+    echo "== slurm-slurmctld file list =="; rpm -ql slurm-slurmctld 2>&1; \
+    echo "== slurm-slurmd file list =="; rpm -ql slurm-slurmd 2>&1; \
+    echo "== anything under /usr/lib/systemd/system matching munge or slurm =="; \
+    ls -la /usr/lib/systemd/system/ | grep -iE 'munge|slurm'; \
+    true
+
+RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \
+    chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \
+    chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && \
+    systemctl enable dbus-broker.service && \
+    systemctl enable arc-bootstrap.service
+# NOTE: munge.service / slurmctld.service / slurmd.service are deliberately
+# NOT enabled here. arc-bootstrap.service declares Wants=/After= on all
+# three, so systemd pulls them in automatically as dependencies when
+# arc-bootstrap.service starts at boot - no need to double-enable them,
+# and it avoids brittle unit-name lookups at build time.
+
+EXPOSE 443
+
+HEALTHCHECK --interval=5s --timeout=5s --start-period=90s --retries=30 \
+    CMD ["/usr/local/bin/healthcheck.sh"]
+
+STOPSIGNAL SIGRTMIN+3
+CMD ["/usr/sbin/init"]
diff --git a/docker/arc-bootstrap.service b/docker/arc-bootstrap.service
new file mode 100644
index 0000000..111f9a8
--- /dev/null
+++ b/docker/arc-bootstrap.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert
+# Wants= pulls the daemons in; After= orders this unit behind them.
+After=network.target dbus-broker.service munge.service slurmctld.service slurmd.service
+Wants=dbus-broker.service munge.service slurmctld.service slurmd.service
+# Skip the run when the readiness marker already exists; bootstrap.sh
+# creates /run/arc-ready as its last step.
+ConditionPathExists=!/run/arc-ready
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/bin/bootstrap.sh
+
+[Install]
+WantedBy=multi-user.target
diff --git a/docker/arc.conf b/docker/arc.conf
new file mode 100644
index 0000000..dfd9886
--- /dev/null
+++ b/docker/arc.conf
@@ -0,0 +1,68 @@
+# =============================================================================
+# /etc/arc.conf - ARC7 CE configuration for the integration-test container
+#
+# This intentionally overrides the packaged "zero configuration" so we have
+# an explicit, reviewable setup.
Cross-check option names against your
+# installed ARC version's reference doc if you bump versions:
+#   /usr/share/doc/nordugrid-arc7-arex/arc.conf.reference
+#   https://www.nordugrid.org/arc/arc7/admins/reference.html
+# =============================================================================
+
+[common]
+hostname = arc-ce
+
+# -----------------------------------------------------------------------
+# LRMS: point A-REX at the local single-node SLURM cluster
+# -----------------------------------------------------------------------
+[lrms]
+lrms = slurm
+slurm_use_sacct = no
+slurm_wakeupperiod = 5
+
+[queue: main]
+comment = CI integration-test queue backed by local SLURM partition "main"
+
+# -----------------------------------------------------------------------
+# Authorization: accept anyone holding a cert signed by the ARC Test-CA
+# that arcctl generates at install time. The test client cert we mint in
+# bootstrap.sh (arcctl test-ca usercert) is auto-appended to
+# testCA.allowed-subjects, which the "zero" authgroup reads via `file`.
+# -----------------------------------------------------------------------
+[authgroup: zero]
+file = /etc/grid-security/testCA.allowed-subjects
+
+[mapping]
+# every request authorized via the "zero" authgroup runs as griduser01
+map_to_user = zero griduser01
+
+# -----------------------------------------------------------------------
+# A-REX core: where jobs' control/session data live
+# -----------------------------------------------------------------------
+[arex]
+user = root
+controldir = /var/spool/arc/jobstatus
+sessiondir = /var/spool/arc/sessiondir
+runtimedir = /usr/share/arc/rte
+delegationdb = sqlite
+
+[arex/ws]
+wsurl = https://arc-ce/arex
+
+[arex/ws/jobs]
+allowaccess = zero
+
+# Data staging can be minimal for a CI job that just echoes something,
+# but we enable it so xRSL inputfiles/outputfiles work if you extend the
+# test job later.
+[arex/data-staging]
+
+[arex/cache]
+cachedir = /var/spool/arc/cache
+
+[infosys]
+
+[infosys/glue2]
+
+[infosys/cluster]
+alias = ARC CE and SLURM CI integration test cluster
+comment = Single-node all-in-one container used for CI testing only
diff --git a/docker/bootstrap.sh b/docker/bootstrap.sh
new file mode 100644
index 0000000..02fc606
--- /dev/null
+++ b/docker/bootstrap.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# =============================================================================
+# Runs once at container start (via arc-bootstrap.service, after munge,
+# slurmctld and slurmd units). Responsibilities:
+#   1. Wait for munge + SLURM to be actually usable
+#   2. (Re)generate the ARC Test-CA and a host certificate bound to this
+#      container's *runtime* hostname (image build time hostname is random,
+#      so we can't bake a valid host cert into the image itself)
+#   3. Start arc-arex / arc-arex-ws as configured in /etc/arc.conf
+#   4. Mint a Test-CA user certificate for griduser01, which arcctl
+#      automatically whitelists in /etc/grid-security/testCA.allowed-subjects
+#   5. Wait for the REST endpoint to answer, then signal readiness
+# =============================================================================
+set -euo pipefail
+LOG=/var/log/arc-bootstrap.log
+exec > >(tee -a "$LOG") 2>&1
+
+echo "== ARC CE / SLURM bootstrap starting: $(date -u) =="
+
+HOSTNAME_FQDN="$(hostname)"
+echo "Using hostname: ${HOSTNAME_FQDN}"
+
+# wait_for DESC CMD [ARGS...]
+#   Re-runs CMD (output discarded) every 2 seconds until it exits 0.
+#   Gives up with status 1 once more than 90 attempts have failed; under
+#   `set -e` that aborts the whole bootstrap.
+wait_for() {
+    local desc="$1"; shift
+    local tries=0
+    until "$@" >/dev/null 2>&1; do
+        tries=$((tries + 1))
+        if [ "$tries" -gt 90 ]; then
+            echo "TIMED OUT waiting for: ${desc}"
+            return 1
+        fi
+        sleep 2
+    done
+    echo "${desc}: ready (after ${tries} tries)"
+}
+
+# --- 1. munge, then SLURM control daemon ------------------------------------
+wait_for "munge" bash -c 'echo bootstrap-check | munge | unmunge'
+wait_for "slurmctld (sinfo)" sinfo -h
+
+# --- 2.
Test-CA + host certificate for the real runtime hostname ------------
+arcctl test-ca init -f
+arcctl test-ca hostcert -n "${HOSTNAME_FQDN}" -f
+
+# --- 3. Start ARC CE services -------------------------------------------------
+arcctl service start --as-configured
+
+# --- 4. Test client certificate for griduser01 -------------------------------
+arcctl test-ca usercert --install-user griduser01 -f
+
+# Also export a portable tarball, useful if the CI job wants to drive
+# arcsub/arcstat/arcget from *outside* this container (e.g. from the
+# job's own shell talking to the CE over the docker network).
+arcctl test-ca usercert -n griduser01 --export-tar -f || true
+mv -f testcert-*.tar.gz /root/arc-test-client.tar.gz 2>/dev/null || true
+
+# --- 5. Wait until the REST endpoint answers (each probe capped at 5s) ------
+wait_for "arex REST endpoint" curl -sk --max-time 5 "https://${HOSTNAME_FQDN}/arex/rest/1.0/info"
+
+touch /run/arc-ready
+echo "== ARC CE / SLURM bootstrap complete: $(date -u) =="
diff --git a/docker/build.log b/docker/build.log
new file mode 100644
index 0000000..d4b302e
--- /dev/null
+++ b/docker/build.log
@@ -0,0 +1,323 @@
+--progress is a global compose flag, better use `docker compose --progress xx build ...
+time="2026-07-02T10:17:34+02:00" level=warning msg="/home/vijay/Downloads/intercede/docker-compose.yml: the attribute `version` is obsolete, it will be ignored, please remove it to avoid potential confusion" + Image arc-ce-slurm-test:local Building +#1 [internal] load local bake definitions +#1 reading from stdin 563B done +#1 DONE 0.0s + +#2 [internal] load build definition from Dockerfile +#2 transferring dockerfile: 5.76kB done +#2 DONE 0.0s + +#3 [internal] load metadata for docker.io/library/almalinux:9 +#3 DONE 0.9s + +#4 [internal] load .dockerignore +#4 transferring context: 2B done +#4 DONE 0.0s + +#5 [ 1/12] FROM docker.io/library/almalinux:9@sha256:d2515c769e7b73f95c4fde38c0a505336ff38f14990c0b7253b77060a049a743 +#5 resolve docker.io/library/almalinux:9@sha256:d2515c769e7b73f95c4fde38c0a505336ff38f14990c0b7253b77060a049a743 0.0s done +#5 CACHED + +#6 [internal] load build context +#6 transferring context: 198B done +#6 DONE 0.0s + +#7 [ 2/12] RUN dnf -y install epel-release && dnf -y install dnf-plugins-core && dnf config-manager --set-enabled crb && dnf -y update && dnf -y install systemd crypto-policies-scripts && update-crypto-policies --set LEGACY && dnf -y install --setopt=strict=1 munge munge-libs slurm slurm-slurmctld slurm-slurmd openssl ca-certificates procps-ng iproute net-tools which curl jq && dnf -y install --setopt=strict=1 nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && dnf clean all && rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; rm -f /lib/systemd/system/multi-user.target.wants/* ; rm -f /etc/systemd/system/*.wants/* ; rm -f /lib/systemd/system/local-fs.target.wants/* ; rm -f /lib/systemd/system/sockets.target.wants/*udev* ; rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; rm -f 
/lib/systemd/system/basic.target.wants/* ; rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true +#7 1.573 AlmaLinux 9 - AppStream 13 MB/s | 15 MB 00:01 +#7 5.018 AlmaLinux 9 - BaseOS 11 MB/s | 14 MB 00:01 +#7 7.019 AlmaLinux 9 - Extras 33 kB/s | 22 kB 00:00 +#7 7.571 Dependencies resolved. +#7 7.572 ================================================================================ +#7 7.572 Package Architecture Version Repository Size +#7 7.572 ================================================================================ +#7 7.572 Installing: +#7 7.572 epel-release noarch 9-9.el9 extras 18 k +#7 7.572 Installing weak dependencies: +#7 7.572 dnf-plugins-core noarch 4.3.0-26.el9 baseos 35 k +#7 7.572 +#7 7.572 Transaction Summary +#7 7.572 ================================================================================ +#7 7.572 Install 2 Packages +#7 7.572 +#7 7.573 Total download size: 53 k +#7 7.573 Installed size: 48 k +#7 7.573 Downloading Packages: +#7 8.870 (1/2): dnf-plugins-core-4.3.0-26.el9.noarch.rpm 612 kB/s | 35 kB 00:00 +#7 8.903 (2/2): epel-release-9-9.el9.noarch.rpm 204 kB/s | 18 kB 00:00 +#7 8.904 -------------------------------------------------------------------------------- +#7 8.904 Total 40 kB/s | 53 kB 00:01 +#7 8.943 Running transaction check +#7 8.952 Transaction check succeeded. +#7 8.952 Running transaction test +#7 8.959 Transaction test succeeded. +#7 8.959 Running transaction +#7 8.998 Preparing : 1/1 +#7 9.034 Installing : dnf-plugins-core-4.3.0-26.el9.noarch 1/2 +#7 9.072 Installing : epel-release-9-9.el9.noarch 2/2 +#7 9.076 Running scriptlet: epel-release-9-9.el9.noarch 2/2 +#7 9.086 Many EPEL packages require the CodeReady Builder (CRB) repository. +#7 9.086 It is recommended that you run /usr/bin/crb enable to enable the CRB repository. 
+#7 9.086 +#7 9.167 Verifying : dnf-plugins-core-4.3.0-26.el9.noarch 1/2 +#7 9.167 Verifying : epel-release-9-9.el9.noarch 2/2 +#7 9.223 +#7 9.223 Installed: +#7 9.223 dnf-plugins-core-4.3.0-26.el9.noarch epel-release-9-9.el9.noarch +#7 9.223 +#7 9.223 Complete! +#7 11.69 Extra Packages for Enterprise Linux 9 - x86_64 9.6 MB/s | 21 MB 00:02 +#7 18.74 Extra Packages for Enterprise Linux 9 openh264 3.8 kB/s | 2.5 kB 00:00 +#7 19.94 Package dnf-plugins-core-4.3.0-26.el9.noarch is already installed. +#7 19.96 Dependencies resolved. +#7 19.97 Nothing to do. +#7 19.97 Complete! +#7 21.36 AlmaLinux 9 - CRB 4.3 MB/s | 3.9 MB 00:00 +#7 22.21 Last metadata expiration check: 0:00:01 ago on Thu Jul 2 08:17:56 2026. +#7 23.45 Dependencies resolved. +#7 23.45 ================================================================================ +#7 23.45 Package Arch Version Repo Size +#7 23.45 ================================================================================ +#7 23.45 Upgrading: +#7 23.45 coreutils-single x86_64 8.32-41.el9_8 baseos 598 k +#7 23.45 epel-release noarch 9-11.el9 epel 19 k +#7 23.45 expat x86_64 2.5.0-6.el9_8.1 baseos 117 k +#7 23.45 glibc x86_64 2.34-272.el9_8 baseos 2.0 M +#7 23.45 glibc-common x86_64 2.34-272.el9_8 baseos 299 k +#7 23.45 glibc-minimal-langpack x86_64 2.34-272.el9_8 baseos 27 k +#7 23.45 libeconf x86_64 0.4.1-7.el9_8 baseos 26 k +#7 23.45 libsolv x86_64 0.7.24-5.el9_8 baseos 402 k +#7 23.45 libtasn1 x86_64 4.16.0-10.el9_8 baseos 73 k +#7 23.45 libxml2 x86_64 2.9.13-14.el9_8.1 baseos 746 k +#7 23.45 openssl x86_64 1:3.5.5-4.el9_8 baseos 1.4 M +#7 23.45 openssl-fips-provider x86_64 1:3.5.5-4.el9_8 baseos 816 k +#7 23.45 openssl-libs x86_64 1:3.5.5-4.el9_8 baseos 2.3 M +#7 23.45 systemd x86_64 252-67.el9_8.4.alma.1 baseos 4.0 M +#7 23.45 systemd-libs x86_64 252-67.el9_8.4.alma.1 baseos 651 k +#7 23.45 systemd-pam x86_64 252-67.el9_8.4.alma.1 baseos 258 k +#7 23.45 systemd-rpm-macros noarch 252-67.el9_8.4.alma.1 baseos 46 k +#7 23.45 
vim-minimal x86_64 2:8.2.2637-26.el9_8.6 baseos 672 k +#7 23.45 +#7 23.45 Transaction Summary +#7 23.45 ================================================================================ +#7 23.45 Upgrade 18 Packages +#7 23.45 +#7 23.45 Total download size: 14 M +#7 23.45 Downloading Packages: +#7 24.28 (1/18): expat-2.5.0-6.el9_8.1.x86_64.rpm 1.8 MB/s | 117 kB 00:00 +#7 24.33 (2/18): coreutils-single-8.32-41.el9_8.x86_64.r 5.1 MB/s | 598 kB 00:00 +#7 24.35 (3/18): glibc-minimal-langpack-2.34-272.el9_8.x 1.6 MB/s | 27 kB 00:00 +#7 24.35 (4/18): glibc-common-2.34-272.el9_8.x86_64.rpm 3.8 MB/s | 299 kB 00:00 +#7 24.36 (5/18): libeconf-0.4.1-7.el9_8.x86_64.rpm 1.7 MB/s | 26 kB 00:00 +#7 24.39 (6/18): libtasn1-4.16.0-10.el9_8.x86_64.rpm 2.5 MB/s | 73 kB 00:00 +#7 24.42 (7/18): glibc-2.34-272.el9_8.x86_64.rpm 9.5 MB/s | 2.0 MB 00:00 +#7 24.43 (8/18): libsolv-0.7.24-5.el9_8.x86_64.rpm 5.4 MB/s | 402 kB 00:00 +#7 24.53 (9/18): openssl-fips-provider-3.5.5-4.el9_8.x86 7.9 MB/s | 816 kB 00:00 +#7 24.56 (10/18): openssl-3.5.5-4.el9_8.x86_64.rpm 10 MB/s | 1.4 MB 00:00 +#7 24.82 (11/18): systemd-252-67.el9_8.4.alma.1.x86_64.r 15 MB/s | 4.0 MB 00:00 +#7 24.85 (12/18): openssl-libs-3.5.5-4.el9_8.x86_64.rpm 7.2 MB/s | 2.3 MB 00:00 +#7 24.86 (13/18): libxml2-2.9.13-14.el9_8.1.x86_64.rpm 1.6 MB/s | 746 kB 00:00 +#7 24.87 (14/18): systemd-pam-252-67.el9_8.4.alma.1.x86_ 12 MB/s | 258 kB 00:00 +#7 24.88 (15/18): systemd-libs-252-67.el9_8.4.alma.1.x86 10 MB/s | 651 kB 00:00 +#7 24.89 (16/18): systemd-rpm-macros-252-67.el9_8.4.alma 2.0 MB/s | 46 kB 00:00 +#7 24.92 (17/18): vim-minimal-8.2.2637-26.el9_8.6.x86_64 15 MB/s | 672 kB 00:00 +#7 25.00 (18/18): epel-release-9-11.el9.noarch.rpm 162 kB/s | 19 kB 00:00 +#7 25.00 -------------------------------------------------------------------------------- +#7 25.00 Total 9.2 MB/s | 14 MB 00:01 +#7 25.47 Extra Packages for Enterprise Linux 9 - x86_64 1.6 MB/s | 1.6 kB 00:00 +#7 25.59 Importing GPG key 0x3228467C: +#7 25.59 Userid : "Fedora (epel9) " 
+#7 25.59 Fingerprint: FF8A D134 4597 106E CE81 3B91 8A38 72BF 3228 467C +#7 25.59 From : /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-9 +#7 25.60 Key imported successfully +#7 25.62 Running transaction check +#7 25.71 Transaction check succeeded. +#7 25.71 Running transaction test +#7 25.95 Transaction test succeeded. +#7 25.95 Running transaction +#7 26.33 Preparing : 1/1 +#7 26.44 Upgrading : glibc-common-2.34-272.el9_8.x86_64 1/36 +#7 26.45 Upgrading : glibc-minimal-langpack-2.34-272.el9_8.x86_64 2/36 +#7 26.45 Running scriptlet: glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.68 Upgrading : glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.69 Running scriptlet: glibc-2.34-272.el9_8.x86_64 3/36 +#7 26.78 Upgrading : coreutils-single-8.32-41.el9_8.x86_64 4/36 +#7 26.83 Upgrading : libxml2-2.9.13-14.el9_8.1.x86_64 5/36 +#7 26.87 Upgrading : openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 6/36 +#7 26.97 Upgrading : openssl-libs-1:3.5.5-4.el9_8.x86_64 7/36 +#7 27.01 Upgrading : systemd-libs-252-67.el9_8.4.alma.1.x86_64 8/36 +#7 27.02 Running scriptlet: systemd-libs-252-67.el9_8.4.alma.1.x86_64 8/36 +#7 27.05 Upgrading : systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch 9/36 +#7 27.07 Upgrading : systemd-pam-252-67.el9_8.4.alma.1.x86_64 10/36 +#7 27.08 Running scriptlet: systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.83 Upgrading : systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.85 Running scriptlet: systemd-252-67.el9_8.4.alma.1.x86_64 11/36 +#7 27.90 Upgrading : libsolv-0.7.24-5.el9_8.x86_64 12/36 +#7 27.96 Upgrading : openssl-1:3.5.5-4.el9_8.x86_64 13/36 +#7 27.98 Upgrading : expat-2.5.0-6.el9_8.1.x86_64 14/36 +#7 28.00 Upgrading : libeconf-0.4.1-7.el9_8.x86_64 15/36 +#7 28.02 Upgrading : libtasn1-4.16.0-10.el9_8.x86_64 16/36 +#7 28.06 Upgrading : vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 17/36 +#7 28.07 Upgrading : epel-release-9-11.el9.noarch 18/36 +#7 28.07 Running scriptlet: epel-release-9-11.el9.noarch 18/36 +#7 28.22 Cleanup : systemd-252-67.el9_8.2.alma.1.x86_64 19/36 +#7 28.22 Running 
scriptlet: systemd-252-67.el9_8.2.alma.1.x86_64 19/36 +#7 28.35 Cleanup : systemd-libs-252-67.el9_8.2.alma.1.x86_64 20/36 +#7 28.36 Cleanup : systemd-pam-252-67.el9_8.2.alma.1.x86_64 21/36 +#7 28.37 Cleanup : openssl-1:3.5.5-2.el9_8.x86_64 22/36 +#7 28.38 Cleanup : coreutils-single-8.32-40.el9.x86_64 23/36 +#7 28.39 Cleanup : libsolv-0.7.24-4.el9.x86_64 24/36 +#7 28.40 Cleanup : libxml2-2.9.13-14.el9_7.x86_64 25/36 +#7 28.41 Cleanup : vim-minimal-2:8.2.2637-26.el9_8.4.x86_64 26/36 +#7 28.42 Cleanup : expat-2.5.0-6.el9.x86_64 27/36 +#7 28.42 Cleanup : openssl-fips-provider-1:3.5.5-2.el9_8.x86_64 28/36 +#7 28.44 Cleanup : openssl-libs-1:3.5.5-2.el9_8.x86_64 29/36 +#7 28.45 Cleanup : libeconf-0.4.1-5.el9.x86_64 30/36 +#7 28.46 Cleanup : libtasn1-4.16.0-9.el9.x86_64 31/36 +#7 28.46 Cleanup : systemd-rpm-macros-252-67.el9_8.2.alma.1.noarch 32/36 +#7 28.47 Cleanup : epel-release-9-9.el9.noarch 33/36 +#7 28.48 Cleanup : glibc-2.34-270.el9_8.x86_64 34/36 +#7 28.49 Cleanup : glibc-minimal-langpack-2.34-270.el9_8.x86_64 35/36 +#7 28.49 Cleanup : glibc-common-2.34-270.el9_8.x86_64 36/36 +#7 28.51 Running scriptlet: glibc-common-2.34-270.el9_8.x86_64 36/36 +#7 28.70 Verifying : coreutils-single-8.32-41.el9_8.x86_64 1/36 +#7 28.70 Verifying : coreutils-single-8.32-40.el9.x86_64 2/36 +#7 28.70 Verifying : expat-2.5.0-6.el9_8.1.x86_64 3/36 +#7 28.70 Verifying : expat-2.5.0-6.el9.x86_64 4/36 +#7 28.70 Verifying : glibc-2.34-272.el9_8.x86_64 5/36 +#7 28.70 Verifying : glibc-2.34-270.el9_8.x86_64 6/36 +#7 28.70 Verifying : glibc-common-2.34-272.el9_8.x86_64 7/36 +#7 28.70 Verifying : glibc-common-2.34-270.el9_8.x86_64 8/36 +#7 28.70 Verifying : glibc-minimal-langpack-2.34-272.el9_8.x86_64 9/36 +#7 28.70 Verifying : glibc-minimal-langpack-2.34-270.el9_8.x86_64 10/36 +#7 28.70 Verifying : libeconf-0.4.1-7.el9_8.x86_64 11/36 +#7 28.70 Verifying : libeconf-0.4.1-5.el9.x86_64 12/36 +#7 28.70 Verifying : libsolv-0.7.24-5.el9_8.x86_64 13/36 +#7 28.70 Verifying : libsolv-0.7.24-4.el9.x86_64 
14/36 +#7 28.70 Verifying : libtasn1-4.16.0-10.el9_8.x86_64 15/36 +#7 28.70 Verifying : libtasn1-4.16.0-9.el9.x86_64 16/36 +#7 28.70 Verifying : libxml2-2.9.13-14.el9_8.1.x86_64 17/36 +#7 28.70 Verifying : libxml2-2.9.13-14.el9_7.x86_64 18/36 +#7 28.70 Verifying : openssl-1:3.5.5-4.el9_8.x86_64 19/36 +#7 28.70 Verifying : openssl-1:3.5.5-2.el9_8.x86_64 20/36 +#7 28.70 Verifying : openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 21/36 +#7 28.70 Verifying : openssl-fips-provider-1:3.5.5-2.el9_8.x86_64 22/36 +#7 28.70 Verifying : openssl-libs-1:3.5.5-4.el9_8.x86_64 23/36 +#7 28.70 Verifying : openssl-libs-1:3.5.5-2.el9_8.x86_64 24/36 +#7 28.70 Verifying : systemd-252-67.el9_8.4.alma.1.x86_64 25/36 +#7 28.70 Verifying : systemd-252-67.el9_8.2.alma.1.x86_64 26/36 +#7 28.70 Verifying : systemd-libs-252-67.el9_8.4.alma.1.x86_64 27/36 +#7 28.70 Verifying : systemd-libs-252-67.el9_8.2.alma.1.x86_64 28/36 +#7 28.70 Verifying : systemd-pam-252-67.el9_8.4.alma.1.x86_64 29/36 +#7 28.70 Verifying : systemd-pam-252-67.el9_8.2.alma.1.x86_64 30/36 +#7 28.70 Verifying : systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch 31/36 +#7 28.70 Verifying : systemd-rpm-macros-252-67.el9_8.2.alma.1.noarch 32/36 +#7 28.70 Verifying : vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 33/36 +#7 28.70 Verifying : vim-minimal-2:8.2.2637-26.el9_8.4.x86_64 34/36 +#7 28.70 Verifying : epel-release-9-11.el9.noarch 35/36 +#7 28.70 Verifying : epel-release-9-9.el9.noarch 36/36 +#7 28.80 +#7 28.80 Upgraded: +#7 28.80 coreutils-single-8.32-41.el9_8.x86_64 +#7 28.80 epel-release-9-11.el9.noarch +#7 28.80 expat-2.5.0-6.el9_8.1.x86_64 +#7 28.80 glibc-2.34-272.el9_8.x86_64 +#7 28.80 glibc-common-2.34-272.el9_8.x86_64 +#7 28.80 glibc-minimal-langpack-2.34-272.el9_8.x86_64 +#7 28.80 libeconf-0.4.1-7.el9_8.x86_64 +#7 28.80 libsolv-0.7.24-5.el9_8.x86_64 +#7 28.80 libtasn1-4.16.0-10.el9_8.x86_64 +#7 28.80 libxml2-2.9.13-14.el9_8.1.x86_64 +#7 28.80 openssl-1:3.5.5-4.el9_8.x86_64 +#7 28.80 
openssl-fips-provider-1:3.5.5-4.el9_8.x86_64 +#7 28.80 openssl-libs-1:3.5.5-4.el9_8.x86_64 +#7 28.80 systemd-252-67.el9_8.4.alma.1.x86_64 +#7 28.80 systemd-libs-252-67.el9_8.4.alma.1.x86_64 +#7 28.80 systemd-pam-252-67.el9_8.4.alma.1.x86_64 +#7 28.80 systemd-rpm-macros-252-67.el9_8.4.alma.1.noarch +#7 28.80 vim-minimal-2:8.2.2637-26.el9_8.6.x86_64 +#7 28.80 +#7 28.80 Complete! +#7 29.38 Last metadata expiration check: 0:00:08 ago on Thu Jul 2 08:17:56 2026. +#7 29.50 Package systemd-252-67.el9_8.4.alma.1.x86_64 is already installed. +#7 29.50 Package crypto-policies-scripts-20260224-1.gitea0f072.el9_8.noarch is already installed. +#7 29.52 Dependencies resolved. +#7 29.52 Nothing to do. +#7 29.52 Complete! +#7 29.71 Setting system policy to LEGACY +#7 29.71 Note: System-wide crypto policies are applied on application start-up. +#7 29.71 It is recommended to restart the system for the change of policies +#7 29.71 to fully take place. +#7 30.18 Last metadata expiration check: 0:00:09 ago on Thu Jul 2 08:17:56 2026. +#7 30.30 Package openssl-1:3.5.5-4.el9_8.x86_64 is already installed. +#7 30.30 Package ca-certificates-2025.2.80_v9.0.305-91.el9.noarch is already installed. 
+#7 30.33 Error: +#7 30.33 Problem: problem with installed package curl-minimal-7.76.1-40.el9.x86_64 +#7 30.33 - package curl-minimal-7.76.1-40.el9.x86_64 from @System conflicts with curl provided by curl-7.76.1-40.el9.x86_64 from baseos +#7 30.33 - package curl-minimal-7.76.1-40.el9.x86_64 from baseos conflicts with curl provided by curl-7.76.1-40.el9.x86_64 from baseos +#7 30.33 - conflicting requests +#7 30.33 (try to add '--allowerasing' to command line to replace conflicting packages or '--skip-broken' to skip uninstallable packages or '--nobest' to use not only best candidate packages) +#7 DONE 30.4s + +#8 [ 3/12] RUN groupadd -r slurm --gid=990 && useradd -r -c "SLURM workload manager" -d /var/lib/slurm -u 990 -g slurm -s /sbin/nologin slurm && mkdir -p /var/lib/slurm && chown slurm:slurm /var/lib/slurm +#8 DONE 0.4s + +#9 [ 4/12] RUN useradd -m -s /bin/bash griduser01 +#9 DONE 0.4s + +#10 [ 5/12] COPY slurm.conf /etc/slurm/slurm.conf +#10 DONE 0.1s + +#11 [ 6/12] COPY cgroup.conf /etc/slurm/cgroup.conf +#11 DONE 0.1s + +#12 [ 7/12] COPY arc.conf /etc/arc.conf +#12 DONE 0.0s + +#13 [ 8/12] COPY bootstrap.sh /usr/local/bin/bootstrap.sh +#13 DONE 0.1s + +#14 [ 9/12] COPY healthcheck.sh /usr/local/bin/healthcheck.sh +#14 DONE 0.1s + +#15 [10/12] COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service +#15 DONE 0.1s + +#16 [11/12] RUN set +e; echo "== rpm -q (are the packages even installed?) =="; rpm -q munge slurm slurm-slurmctld slurm-slurmd; echo "== munge file list =="; rpm -ql munge 2>&1; echo "== slurm file list =="; rpm -ql slurm 2>&1; echo "== slurm-slurmctld file list =="; rpm -ql slurm-slurmctld 2>&1; echo "== slurm-slurmd file list =="; rpm -ql slurm-slurmd 2>&1; echo "== anything under /usr/lib/systemd/system matching munge or slurm =="; ls -la /usr/lib/systemd/system/ | grep -iE 'munge|slurm'; true +#16 0.222 == rpm -q (are the packages even installed?) 
== +#16 0.248 package munge is not installed +#16 0.248 package slurm is not installed +#16 0.248 package slurm-slurmctld is not installed +#16 0.248 package slurm-slurmd is not installed +#16 0.248 == munge file list == +#16 0.256 package munge is not installed +#16 0.257 == slurm file list == +#16 0.266 package slurm is not installed +#16 0.266 == slurm-slurmctld file list == +#16 0.275 package slurm-slurmctld is not installed +#16 0.276 == slurm-slurmd file list == +#16 0.285 package slurm-slurmd is not installed +#16 0.286 == anything under /usr/lib/systemd/system matching munge or slurm == +#16 DONE 0.3s + +#17 [12/12] RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh && systemctl enable arc-bootstrap.service +#17 0.311 Created symlink /etc/systemd/system/multi-user.target.wants/arc-bootstrap.service → /etc/systemd/system/arc-bootstrap.service. 
+#17 DONE 0.3s + +#18 exporting to image +#18 exporting layers +#18 exporting layers 6.1s done +#18 exporting manifest sha256:226d793415d4629cbb1ee8fec1dbff3e3b3c1ce058acbc368514171513b45a22 0.0s done +#18 exporting config sha256:c54e2a223ca5d80ee52cd7013c35a56c197c04af5b0cd5696d87186b6565690a 0.0s done +#18 exporting attestation manifest sha256:34d3aa14e734c1e12b9d839a3c1f6b956b0bfa84239bac21a6c2f6e56ce9ccb7 0.0s done +#18 exporting manifest list sha256:b6e6a0cae27d441603e8d8f478fa9c8bbe094dbbac2e8d481adf0635323a057c +#18 exporting manifest list sha256:b6e6a0cae27d441603e8d8f478fa9c8bbe094dbbac2e8d481adf0635323a057c 0.0s done +#18 naming to docker.io/library/arc-ce-slurm-test:local done +#18 unpacking to docker.io/library/arc-ce-slurm-test:local +#18 unpacking to docker.io/library/arc-ce-slurm-test:local 1.2s done +#18 DONE 7.6s + +#19 resolving provenance for metadata file +#19 DONE 0.0s + Image arc-ce-slurm-test:local Built diff --git a/docker/cgroup.conf b/docker/cgroup.conf new file mode 100644 index 0000000..d833173 --- /dev/null +++ b/docker/cgroup.conf @@ -0,0 +1,5 @@ +CgroupAutomount=no +ConstrainCores=no +ConstrainRAMSpace=no +ConstrainSwapSpace=no +ConstrainDevices=no diff --git a/docker/healthcheck.sh b/docker/healthcheck.sh new file mode 100644 index 0000000..bc47b09 --- /dev/null +++ b/docker/healthcheck.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Used by Dockerfile HEALTHCHECK and by the GitLab CI "wait for CE" step. 
+set -o pipefail + +[ -f /run/arc-ready ] || exit 1 + +sinfo -h >/dev/null 2>&1 || exit 1 + +curl -sk --max-time 3 -o /dev/null "https://$(hostname)/arex/rest/1.0/info" || exit 1 + +exit 0 diff --git a/docker/slurm.conf b/docker/slurm.conf new file mode 100644 index 0000000..1dd2e9b --- /dev/null +++ b/docker/slurm.conf @@ -0,0 +1,36 @@ +# ============================================================================= +# slurm.conf - single-node SLURM cluster for CI/integration testing +# +# NOTE: TaskPlugin=task/none and ProctrackType=proctrack/linuxproc are used +# instead of the cgroup-based plugins because GitLab CI docker executors +# usually do NOT grant access to the host cgroup hierarchy needed by +# proctrack/cgroup or task/cgroup. If your runner is privileged and mounts +# /sys/fs/cgroup read-write, you can switch to the cgroup plugins for more +# realistic resource accounting. +# ============================================================================= +ClusterName=citest +SlurmctldHost=localhost + +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +SchedulerType=sched/backfill +ReturnToService=2 +MpiDefault=none + +# --- Node & partition: one fake compute node backing the whole cluster --- +NodeName=arc-ce NodeAddr=localhost CPUs=2 RealMemory=2000 State=UNKNOWN +PartitionName=main Nodes=arc-ce Default=YES MaxTime=INFINITE State=UP diff --git a/test/job.xrsl b/test/job.xrsl new file mode 100644 index 0000000..11d34c7 --- /dev/null +++ b/test/job.xrsl @@ -0,0 +1,11 @@ +& +(executable = "/bin/sh") +(arguments = "run.sh") +(inputFiles = ("run.sh" "run.sh")) +(jobname = 
"ci-integration-test") +(stdout = "stdout.log") +(stderr = "stderr.log") +(outputFiles = ("result.txt" "")) +(queue = "main") +(walltime = "5") +(memory = "256") diff --git a/test/readme_arcce_slurm_int_test.md b/test/readme_arcce_slurm_int_test.md new file mode 100644 index 0000000..e7aff9e --- /dev/null +++ b/test/readme_arcce_slurm_int_test.md @@ -0,0 +1,118 @@ +# ARC CE + SLURM integration test + +Spins up a single Docker container running a NorduGrid **ARC Compute +Element (ARC7)** wired to a single-node **SLURM** batch system, then +drives `arcsub` / `arcstat` / `arcget` against it to prove the whole +submit → monitor → retrieve path works end to end. Designed to run as +a GitHub Workflow pipeline (build and test), but also runnable +locally with `docker-compose`. + +## Layout + +``` +docker/ + Dockerfile AlmaLinux 9 image: munge + SLURM + ARC7 + systemd + slurm.conf single-node SLURM cluster config + cgroup.conf cgroups config (although dbus preferred) + arc.conf ARC CE config, LRMS=slurm, REST interface on :443 + bootstrap.sh one-shot startup script (systemd unit runs this) + arc-bootstrap.service systemd unit that runs bootstrap.sh at boot + healthcheck.sh Docker HEALTHCHECK / CI readiness probe +test/ + job.xrsl the test job description (xRSL) + run.sh payload script executed on the SLURM worker + run_integration_test.sh submit -> monitor -> retrieve driver script +.github/workflows/ + integration-test.yml +docker-compose.yml local equivalent of the CI run +``` + +## How it fits together + +1. **Image build** installs `munge`, `slurm`/`slurm-slurmctld`/`slurm-slurmd`, + and ARC7 (`nordugrid-arc7-arex`, `nordugrid-arc7-client`, + `nordugrid-arc7-arcctl`) from EPEL on AlmaLinux 9, and enables + `systemd` as PID 1 — this matters because ARC's own tooling + (`arcctl`) and the SLURM/munge packages ship real systemd unit + files, and re-using those is far more reliable than hand-rolling a + supervisor script. + +2. 
**Container start** (`arc-bootstrap.service`, ordered after + `munge`/`slurmctld`/`slurmd`) runs `bootstrap.sh`, which: + - waits until `munge` and `sinfo` actually work, + - (re)generates the ARC **Test-CA** and a **host certificate** bound + to the container's *runtime* hostname (`arcctl test-ca hostcert -n + $(hostname) -f`) — this can't be baked into the image at build + time because the build-time hostname is a random ID, not `arc-ce`, + - starts `arc-arex` / `arc-arex-ws` (`arcctl service start + --as-configured`), + - mints a Test-CA **client certificate** for `griduser01` + (`arcctl test-ca usercert --install-user griduser01 -f`), which + `arcctl` automatically whitelists in + `/etc/grid-security/testCA.allowed-subjects` — this is what makes + the CE's default "closed by default" `[authgroup: zero]` accept + that user, + - waits for the REST endpoint to answer and writes `/run/arc-ready`. + +3. **Docker HEALTHCHECK** (`healthcheck.sh`) only reports `healthy` + once `/run/arc-ready` exists, `sinfo` works, and the REST endpoint + responds — the CI job polls this instead of guessing a fixed sleep. + +4. **The test itself** (`test/run_integration_test.sh`, run as + `griduser01` inside the container via `docker exec`): + - `arcproxy` — generate a short-lived proxy from the Test-CA user cert + - `arcinfo -C https://arc-ce/arex` — sanity-check the CE is reachable + - `arcsub -C https://arc-ce/arex job.xrsl` — **submit** + - poll `arcstat <jobid>` until `Finished` (or fail fast on + `Failed`/`Killed`) — **monitor** + - `arcget <jobid>` — **retrieve** `stdout.log` and `result.txt`, + then assert their contents + - `arcclean <jobid>` to tidy up + +## Why systemd + `--privileged` + +SLURM's daemons and ARC's `arcctl` assume a normal init system +(starting/stopping via `systemctl`, log rotation, etc). Running +`systemd` as PID 1 inside the container needs elevated privileges to +manage cgroups, so both the GitHub Actions job and local `docker-compose` run +the container with `--privileged`. 
+ +If you can't get a privileged runner, the alternative is to drop +systemd entirely and hand-roll process supervision (e.g. `supervisord` +calling `munged`, `slurmctld -D`, `slurmd -D`, and the `A-REX` daemon +binary directly) — more portable, but you lose the packaged unit files +and have to reproduce their startup ordering/flags yourself. + +## Running locally + +```bash +docker compose up --build -d +# watch it come up +docker inspect -f '{{.State.Health.Status}}' arc-ce-slurm-test +# once "healthy": +docker cp test/. arc-ce-slurm-test:/opt/arc-test/ +docker exec arc-ce-slurm-test chown -R griduser01:griduser01 /opt/arc-test +docker exec arc-ce-slurm-test chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh +docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +``` + +## Things you'll likely want to change for a real environment + +- **Package versions**: this pins nothing beyond "ARC7 from EPEL on + EL9". For reproducible CI, pin `nordugrid-arc7-arex-<version>` etc. + explicitly, or build from the upstream NorduGrid repo instead of + EPEL (see https://www.nordugrid.org/arc/arc7/common/repos/repository.html). +- **Multi-container topology**: this is deliberately an all-in-one + container (CE + SLURM + client in one box) to keep the CI pipeline + simple. For something closer to production, split into an `arc-ce` + service, a `slurmctld`/`slurmd` service (or a real multi-node SLURM + cluster), and a separate `client` container talking to the CE over + the Docker network, sharing a `munge.key` via a named volume. +- **Certificates**: this uses ARC's built-in Test-CA, which is exactly + what it's for (throwaway integration testing). Never use it for + anything reachable from outside your CI network. +- **Job payload**: `test/job.xrsl` / `test/run.sh` are a minimal + smoke test. 
Extend them to cover whatever your real batch workloads + look like (multi-core requests, input/output staging from object + storage, RunTime Environments, etc). diff --git a/test/run.sh b/test/run.sh new file mode 100644 index 0000000..66f9bea --- /dev/null +++ b/test/run.sh @@ -0,0 +1,7 @@ +#!/bin/sh +echo "Running on host: $(hostname)" +echo "Running as user: $(id -un)" +echo "SLURM_JOB_ID=${SLURM_JOB_ID:-unset}" +date +sleep 2 +echo "ok $(date -u +%FT%TZ)" > result.txt diff --git a/test/run_integration_test.sh b/test/run_integration_test.sh new file mode 100644 index 0000000..e4c5272 --- /dev/null +++ b/test/run_integration_test.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# ============================================================================= +# Integration test: submit / monitor / retrieve a job through ARC CE, backed +# by SLURM. Meant to run *inside* the arc-ce-slurm container as griduser01 +# (that's who the test client cert + queue mapping point to), e.g.: +# +# docker exec -u griduser01 arc-ce-slurm-test /opt/arc-test/run_integration_test.sh +# +# Exit code 0 = pass, non-zero = fail (so GitLab CI can key off it directly). +# ============================================================================= +set -euo pipefail + +CE_HOST="$(hostname)" +CE_ENDPOINT="https://${CE_HOST}/arex" +JOB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKDIR="$(mktemp -d)" +OUTDIR="${WORKDIR}/output" +POLL_INTERVAL=3 +POLL_TIMEOUT=180 + +log() { echo "[$(date -u +%T)] $*"; } + +fail() { log "FAIL: $*"; exit 1; } + +trap 'log "cleaning up ${WORKDIR}"; rm -rf "${WORKDIR}"' EXIT + +cd "${WORKDIR}" +cp "${JOB_DIR}/job.xrsl" . +cp "${JOB_DIR}/run.sh" . + +# ----------------------------------------------------------------------- +# 0. Sanity: we need a proxy. arcproxy reads cert/key from ~/.globus by +# default, which is exactly where `arcctl test-ca usercert --install-user` +# put them during bootstrap. 
+# ----------------------------------------------------------------------- +log "Generating proxy certificate for $(id -un)" +arcproxy || fail "arcproxy failed - is ~/.globus/usercert.pem present?" + +log "Querying CE info endpoint: ${CE_ENDPOINT}" +arcinfo -C "${CE_ENDPOINT}" || fail "arcinfo could not reach ${CE_ENDPOINT}" + +# ----------------------------------------------------------------------- +# 1. SUBMIT +# ----------------------------------------------------------------------- +log "Submitting job.xrsl to ${CE_ENDPOINT}" +SUBMIT_OUTPUT="$(arcsub -C "${CE_ENDPOINT}" job.xrsl 2>&1)" || { + echo "${SUBMIT_OUTPUT}" + fail "arcsub did not succeed" +} +echo "${SUBMIT_OUTPUT}" + +JOB_ID="$(echo "${SUBMIT_OUTPUT}" | grep -oE 'https://[^ ]+/jobs/[A-Za-z0-9]+' | head -n1)" +[ -n "${JOB_ID}" ] || fail "could not parse job id out of arcsub output" +log "Job submitted: ${JOB_ID}" + +# ----------------------------------------------------------------------- +# 2. MONITOR +# ----------------------------------------------------------------------- +log "Polling job state (timeout ${POLL_TIMEOUT}s)" +elapsed=0 +STATE="" +while [ "${elapsed}" -lt "${POLL_TIMEOUT}" ]; do + STAT_OUTPUT="$(arcstat "${JOB_ID}" 2>&1 || true)" + STATE="$(echo "${STAT_OUTPUT}" | awk -F': ' '/State:/{print $2; exit}')" + log "state=${STATE:-unknown}" + case "${STATE}" in + Finished|FINISHED) + break + ;; + Failed|FAILED|Killed|KILLED|Deleted) + echo "${STAT_OUTPUT}" + fail "job entered terminal failure state: ${STATE}" + ;; + esac + sleep "${POLL_INTERVAL}" + elapsed=$((elapsed + POLL_INTERVAL)) +done + +[ "${STATE}" = "Finished" ] || [ "${STATE}" = "FINISHED" ] || { + arcstat "${JOB_ID}" || true + arcctl job log "$(basename "${JOB_ID}")" --service || true + fail "job did not reach Finished state within ${POLL_TIMEOUT}s (last state: ${STATE:-unknown})" +} +log "Job reached Finished state" + +# ----------------------------------------------------------------------- +# 3. 
RETRIEVE +# ----------------------------------------------------------------------- +mkdir -p "${OUTDIR}" +log "Retrieving output with arcget into ${OUTDIR}" +( cd "${OUTDIR}" && arcget "${JOB_ID}" ) || fail "arcget failed" + +RESULT_FILE="$(find "${OUTDIR}" -name result.txt | head -n1)" +STDOUT_FILE="$(find "${OUTDIR}" -name stdout.log | head -n1)" + +[ -n "${RESULT_FILE}" ] || fail "result.txt was not retrieved" +[ -n "${STDOUT_FILE}" ] || fail "stdout.log was not retrieved" + +grep -q '^ok ' "${RESULT_FILE}" || fail "result.txt did not contain expected content: $(cat "${RESULT_FILE}")" +grep -q 'Running on host' "${STDOUT_FILE}" || fail "stdout.log missing expected marker" + +log "Output content:" +cat "${STDOUT_FILE}" +cat "${RESULT_FILE}" + +# ----------------------------------------------------------------------- +# 4. Cleanup the job from A-REX bookkeeping (not strictly required, but +# keeps repeated CI runs tidy) +# ----------------------------------------------------------------------- +arcclean "${JOB_ID}" || log "warning: arcclean failed (non-fatal)" + +log "PASS: submit -> monitor -> retrieve integration test succeeded"