Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Builds the all-in-one ARC CE + SLURM image, boots it, waits for its Docker
# health check to pass, runs the submit -> monitor -> retrieve test inside
# the container, and always uploads the service logs afterwards.
name: arc-ce-slurm-integration-test

on:
  push:
    branches: [main, add-arc-ce-slurm-ci]
  pull_request:
  workflow_dispatch:

# The job only needs to read the repository; do not inherit broader defaults.
permissions:
  contents: read

env:
  IMAGE_NAME: arc-ce-slurm-test
  CONTAINER_NAME: arc-ce-slurm-test

jobs:
  build-and-test:
    runs-on: ubuntu-latest
    # Bound the job: a hung build or test would otherwise hold the runner
    # until the 6 h default limit.
    timeout-minutes: 45
    steps:
      - name: Check out repo
        uses: actions/checkout@v4

      - name: Build image
        run: docker build -t "${IMAGE_NAME}:local" -f docker/Dockerfile docker/

      # --privileged: this container runs systemd as PID 1 (needed by
      # arcctl / SLURM's own unit files, and by slurmd's cgroup-scope
      # setup via dbus-broker), which needs real cgroup access.
      # GitHub-hosted runners allow this directly - no runner config
      # or cluster admin approval needed, unlike a locked-down
      # Kubernetes-executor GitLab runner.
      - name: Run container
        run: |
          docker run -d --privileged --name "${CONTAINER_NAME}" --hostname arc-ce "${IMAGE_NAME}:local"

      # Polls the image's HEALTHCHECK for up to 60 x 5 s. Stops early when
      # the container is no longer running, since it can never turn healthy
      # after that and the remaining polls would only waste CI time.
      - name: Wait for health check
        run: |
          status="starting"
          for i in $(seq 1 60); do
            if [ "$(docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null)" != "true" ]; then
              status="exited"
              echo " [$i/60] container is not running"
              break
            fi
            status=$(docker inspect -f '{{.State.Health.Status}}' "${CONTAINER_NAME}" 2>/dev/null || echo "starting")
            echo " [$i/60] health=${status}"
            [ "${status}" = "healthy" ] && break
            sleep 5
          done
          if [ "${status}" != "healthy" ]; then
            echo "::error::Container never became healthy (last status: ${status})"
            docker logs "${CONTAINER_NAME}" || true
            docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log || true
            exit 1
          fi

      - name: Run integration test (submit -> monitor -> retrieve)
        run: |
          docker cp test/. "${CONTAINER_NAME}:/opt/arc-test/"
          docker exec "${CONTAINER_NAME}" chown -R griduser01:griduser01 /opt/arc-test
          docker exec "${CONTAINER_NAME}" chmod +x /opt/arc-test/run_integration_test.sh /opt/arc-test/run.sh
          docker exec -u griduser01 "${CONTAINER_NAME}" /opt/arc-test/run_integration_test.sh

      # Best effort: every command tolerates failure so that a missing log
      # (or an already-dead container) never masks the real test result.
      - name: Collect logs
        if: always()
        run: |
          docker logs "${CONTAINER_NAME}" > container-console.log 2>&1 || true
          docker exec "${CONTAINER_NAME}" cat /var/log/arc/arex.log > arex.log 2>/dev/null || true
          docker exec "${CONTAINER_NAME}" cat /var/log/arc-bootstrap.log > arc-bootstrap.log 2>/dev/null || true
          docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmctld.log > slurmctld.log 2>/dev/null || true
          docker exec "${CONTAINER_NAME}" cat /var/log/slurm/slurmd.log > slurmd.log 2>/dev/null || true

      - name: Upload logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: arc-ce-slurm-logs
          path: |
            container-console.log
            arex.log
            arc-bootstrap.log
            slurmctld.log
            slurmd.log
          retention-days: 7

      - name: Clean up
        if: always()
        run: docker rm -f "${CONTAINER_NAME}" || true
19 changes: 19 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Local equivalent of the CI run: builds ./docker/Dockerfile and starts the
# all-in-one ARC CE + SLURM container with the same health check settings.
version: "3.8"

services:
  arc-ce:
    image: arc-ce-slurm-test:local
    container_name: arc-ce-slurm-test
    hostname: arc-ce
    build:
      context: ./docker
      dockerfile: Dockerfile
    # needed for systemd-as-PID1 (see README)
    privileged: true
    ports:
      - "8443:443"
    healthcheck:
      test:
        - CMD
        - /usr/local/bin/healthcheck.sh
      interval: 5s
      timeout: 5s
      start_period: 90s
      retries: 30
144 changes: 144 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# =============================================================================
# ARC CE + SLURM "all-in-one" image for integration testing
#
# Runs, as real systemd-managed services (this is what arcctl/ARC packaging
# expects and is far more robust than re-implementing service supervision):
# - munge (SLURM auth)
# - slurmctld + slurmd (single-node SLURM cluster, one fake node)
# - arc-arex + arc-arex-ws (NorduGrid ARC CE, LRMS backend = slurm)
#
# Base: AlmaLinux 9. On EL9, ARC7 ships in EPEL as nordugrid-arc7-*.
#
# IMPORTANT: this container runs systemd as PID 1, which needs either
# docker run --privileged
# or (rootless-friendlier)
# docker run --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw
# See .github/workflows/integration-test.yml in this repo for the CI-side flags.
# =============================================================================
FROM almalinux:9

ENV container=docker

# ---------------------------------------------------------------------------
# Packages: systemd + EPEL + SLURM + munge + ARC CE + ARC client tools
#
# Each step lives in its own RUN layer on purpose: the build log shows
# exactly which step failed, and a failure cannot be silently absorbed by a
# long && chain (which is what happened with the curl/curl-minimal conflict
# below before this was split out).
# ---------------------------------------------------------------------------

# Repositories: EPEL plus the CRB repo.
RUN dnf -y install epel-release dnf-plugins-core \
 && dnf config-manager --set-enabled crb

RUN dnf -y update

# systemd itself, dbus-broker, and the LEGACY system crypto policy.
RUN dnf -y install systemd crypto-policies-scripts dbus-broker \
 && update-crypto-policies --set LEGACY

# --allowerasing: AlmaLinux's base image ships curl-minimal, which
# conflicts with the full curl package. Let dnf swap it out rather than
# aborting the whole transaction (this was silently dropping munge/slurm
# from the install set entirely).
RUN dnf -y install --allowerasing --setopt=strict=1 \
      munge munge-libs \
      slurm slurm-slurmctld slurm-slurmd \
      openssl ca-certificates \
      procps-ng iproute net-tools which curl jq

RUN dnf -y install --allowerasing --setopt=strict=1 \
      nordugrid-arc7-arex \
      nordugrid-arc7-client \
      nordugrid-arc7-arcctl

# NOTE(review): as a separate layer this only cleans the final filesystem
# view; the caches written by the layers above remain in the image history.
RUN dnf clean all

# Hard assertion, isolated in its own layer: rpm -q exits non-zero if any
# listed package is missing, so the build fails visibly as its own step.
RUN rpm -q munge munge-libs slurm slurm-slurmctld slurm-slurmd \
      nordugrid-arc7-arex nordugrid-arc7-client nordugrid-arc7-arcctl

# The munge package is expected to create the "munge" system user and
# /etc/munge/munge.key from its post-install scriptlet, but neither happens
# reliably in this build environment - so both are done explicitly here.
# Group and user creation are skipped when they already exist.
RUN { getent group munge >/dev/null || groupadd -r munge; } \
 && { getent passwd munge >/dev/null \
      || useradd -r -g munge -d /etc/munge -s /sbin/nologin -c "MUNGE Uid 'N' Gid Emporium" munge; } \
 && mkdir -p /etc/munge /var/lib/munge /var/log/munge /run/munge \
 && chown munge:munge /etc/munge /var/lib/munge /var/log/munge /run/munge \
 && chmod 0700 /etc/munge \
 && /usr/sbin/create-munge-key -f \
 && chown munge:munge /etc/munge/munge.key \
 && chmod 0400 /etc/munge/munge.key

# Standard "systemd in docker" cleanup: delete the *.wants/ symlinks that
# pull in units which don't apply / fail in containers. This un-enables the
# units (the unit files themselves stay; nothing is masked). Under
# sysinit.target.wants only systemd-tmpfiles-setup.service is kept.
# The steps are joined with ';' so one failing rm does not stop the rest,
# and the trailing '|| true' keeps the layer from failing on the last one.
# This runs before the 'systemctl enable' calls further down, so the
# symlinks those create are not affected.
RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ "$i" = systemd-tmpfiles-setup.service ] || rm -f "$i"; done) ; \
rm -f /lib/systemd/system/multi-user.target.wants/* ; \
rm -f /etc/systemd/system/*.wants/* ; \
rm -f /lib/systemd/system/local-fs.target.wants/* ; \
rm -f /lib/systemd/system/sockets.target.wants/*udev* ; \
rm -f /lib/systemd/system/sockets.target.wants/*initctl* ; \
rm -f /lib/systemd/system/basic.target.wants/* ; \
rm -f /lib/systemd/system/anaconda.target.wants/* 2>/dev/null || true

# ---------------------------------------------------------------------------
# slurm service account, fixed at uid/gid 990, with home /var/lib/slurm.
# EPEL's slurm packages do NOT create this user automatically (unlike some
# other distros' packaging), so it has to be done here.
# ---------------------------------------------------------------------------
RUN groupadd --system --gid 990 slurm \
 && useradd --system --uid 990 --gid slurm \
      --comment "SLURM workload manager" \
      --home-dir /var/lib/slurm \
      --shell /sbin/nologin \
      slurm \
 && mkdir -p /var/lib/slurm \
 && chown slurm:slurm /var/lib/slurm

# ---------------------------------------------------------------------------
# Unprivileged pool account that grid jobs get mapped to + matching SLURM node
# ---------------------------------------------------------------------------
RUN useradd --create-home --shell /bin/bash griduser01

# ---------------------------------------------------------------------------
# Config files
# Sources are relative to the build context (the docker/ directory in CI).
# ---------------------------------------------------------------------------
# SLURM and ARC CE configuration.
COPY slurm.conf /etc/slurm/slurm.conf
COPY cgroup.conf /etc/slurm/cgroup.conf
COPY arc.conf /etc/arc.conf

# Bootstrap / health-check scripts and the oneshot unit that runs bootstrap.sh.
# The scripts are made executable and the unit is enabled in a later RUN step.
COPY bootstrap.sh /usr/local/bin/bootstrap.sh
COPY healthcheck.sh /usr/local/bin/healthcheck.sh
COPY arc-bootstrap.service /etc/systemd/system/arc-bootstrap.service

# Debug aid (non-fatal): prints which munge/slurm packages are installed,
# every file they shipped, and which systemd unit files matching munge or
# slurm exist. Read this in the build log if bootstrap.sh ever times out
# waiting for munge/slurmctld/slurmd at runtime.
# 'set +e' plus the final 'true' guarantee this layer never fails the build.
RUN set +e; \
    echo "== rpm -q (are the packages even installed?) =="; \
    rpm -q munge slurm slurm-slurmctld slurm-slurmd; \
    for pkg in munge slurm slurm-slurmctld slurm-slurmd; do \
      echo "== ${pkg} file list =="; \
      rpm -ql "${pkg}" 2>&1; \
    done; \
    echo "== anything under /usr/lib/systemd/system matching munge or slurm =="; \
    ls -la /usr/lib/systemd/system/ | grep -iE 'munge|slurm'; \
    true

# Make the copied scripts executable, create the SLURM spool/log directories
# (owned by slurm), and enable the two units that must start at boot.
RUN chmod +x /usr/local/bin/bootstrap.sh /usr/local/bin/healthcheck.sh \
 && mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm \
 && chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm \
 && systemctl enable dbus-broker.service \
 && systemctl enable arc-bootstrap.service
# NOTE: munge.service / slurmctld.service / slurmd.service are deliberately
# NOT enabled here. arc-bootstrap.service lists all three in Wants= and
# After=, so systemd pulls them in as dependencies when
# arc-bootstrap.service starts at boot. Enabling them as well would be
# redundant, and skipping it avoids brittle unit-name lookups at build time.

# HTTPS port (docker-compose.yml publishes it as host port 8443).
EXPOSE 443

# Same probe and timings as the healthcheck block in docker-compose.yml.
# The CI workflow polls this status to decide when the container is ready.
HEALTHCHECK --interval=5s --timeout=5s --start-period=90s --retries=30 \
CMD /usr/local/bin/healthcheck.sh

# systemd is PID 1 here; SIGRTMIN+3 is the signal it treats as a halt
# request, so 'docker stop' triggers an orderly shutdown.
STOPSIGNAL SIGRTMIN+3
CMD ["/usr/sbin/init"]
13 changes: 13 additions & 0 deletions docker/arc-bootstrap.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Oneshot unit that runs /usr/local/bin/bootstrap.sh once per container boot.
[Unit]
Description=Bootstrap ARC CE (arex/arex-ws) on top of local SLURM, mint test client cert
# After= only orders this unit behind the listed ones; Wants= is what
# actually pulls munge/slurmctld/slurmd (and dbus-broker) into the boot
# transaction, which is why the Dockerfile does not enable them itself.
After=network.target dbus-broker.service munge.service slurmctld.service slurmd.service
Wants=dbus-broker.service munge.service slurmctld.service slurmd.service
# Skip the run when the readiness marker already exists; bootstrap.sh
# creates /run/arc-ready as its last step.
ConditionPathExists=!/run/arc-ready

[Service]
# oneshot + RemainAfterExit: the unit counts as started only once
# bootstrap.sh has exited, and stays "active" afterwards.
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/local/bin/bootstrap.sh

[Install]
WantedBy=multi-user.target
68 changes: 68 additions & 0 deletions docker/arc.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# =============================================================================
# /etc/arc.conf - ARC7 CE configuration for the integration-test container
#
# This intentionally overrides the packaged "zero configuration" so we have
# an explicit, reviewable setup. Cross-check option names against your
# installed ARC version's reference doc if you bump versions:
# /usr/share/doc/nordugrid-arc7-arex/arc.conf.reference
# https://www.nordugrid.org/arc/arc7/admins/reference.html
# =============================================================================

[common]
# Must match the container hostname (CI: docker run --hostname arc-ce;
# compose: hostname: arc-ce).
hostname = arc-ce

# -----------------------------------------------------------------------
# LRMS: point A-REX at the local single-node SLURM cluster
# -----------------------------------------------------------------------
[lrms]
lrms = slurm
slurm_use_sacct = no
slurm_wakeupperiod = 5

[queue: main]
comment = CI integration-test queue backed by local SLURM partition "main"

# -----------------------------------------------------------------------
# Authorization: accept anyone holding a cert signed by the ARC Test-CA
# that bootstrap.sh (re)generates at container start (arcctl test-ca init).
# The test client cert minted in bootstrap.sh (arcctl test-ca usercert) is
# auto-appended to testCA.allowed-subjects, which is what the "zero"
# authgroup checks.
# -----------------------------------------------------------------------
[authgroup: zero]

[mapping]
# every request authorized via the "zero" authgroup runs as griduser01
map_to_user = zero griduser01

# -----------------------------------------------------------------------
# A-REX core: where jobs' control/session data live
# -----------------------------------------------------------------------
[arex]
user = root
controldir = /var/spool/arc/jobstatus
sessiondir = /var/spool/arc/sessiondir
runtimedir = /usr/share/arc/rte
delegationdb = sqlite

[arex/ws]
# bootstrap.sh probes https://<hostname>/arex/rest/1.0/info under this URL.
wsurl = https://arc-ce/arex


[arex/ws/jobs]
allowaccess = zero

# Data staging can be minimal for a CI job that just echoes something,
# but we enable it so xRSL inputfiles/outputfiles work if you extend the
# test job later.
[arex/data-staging]

[arex/cache]
cachedir = /var/spool/arc/cache

[infosys]

[infosys/glue2]

[infosys/cluster]
alias =ARC CE and SLURM CI integration test cluster
comment =Single-node all-in-one container used for CI testing only
61 changes: 61 additions & 0 deletions docker/bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
# =============================================================================
# Runs once at container start (via arc-bootstrap.service, after munge,
# slurmctld and slurmd units). Responsibilities:
# 1. Wait for munge + SLURM to be actually usable
# 2. (Re)generate the ARC Test-CA and a host certificate bound to this
# container's *runtime* hostname (image build time hostname is random,
# so we can't bake a valid host cert into the image itself)
# 3. Start arc-arex / arc-arex-ws as configured in /etc/arc.conf
# 4. Mint a Test-CA user certificate for griduser01, which arcctl
# automatically whitelists in /etc/grid-security/testCA.allowed-subjects
# 5. Wait for the REST endpoint to answer, then signal readiness
# =============================================================================
# Abort on the first failing command, unset variable, or failed pipeline stage.
set -euo pipefail
# Mirror all stdout/stderr from here on into the log file (appending) while
# still writing it to the original stdout. The CI workflow reads this file
# when the container fails to become healthy.
LOG=/var/log/arc-bootstrap.log
exec > >(tee -a "$LOG") 2>&1

echo "== ARC CE / SLURM bootstrap starting: $(date -u) =="

# Whatever `hostname` reports at runtime; used for the host certificate and
# for the REST readiness URL below.
HOSTNAME_FQDN="$(hostname)"
echo "Using hostname: ${HOSTNAME_FQDN}"

# wait_for DESC CMD [ARGS...]
# Re-runs CMD (output discarded) every 2 seconds until it exits 0.
# Prints "DESC: ready (after N tries)" on success, where N is the number of
# failed attempts. After more than 90 failures it prints a timeout message
# and returns 1.
wait_for() {
  local label="$1"
  shift
  local attempts=0
  while ! "$@" >/dev/null 2>&1; do
    attempts=$((attempts + 1))
    if (( attempts > 90 )); then
      echo "TIMED OUT waiting for: ${label}"
      return 1
    fi
    sleep 2
  done
  echo "${label}: ready (after ${attempts} tries)"
}

# --- 1. munge, then SLURM control daemon ------------------------------------
# The munge check round-trips a credential (encode, then decode); sinfo only
# succeeds once slurmctld answers.
wait_for "munge" bash -c 'echo bootstrap-check | munge | unmunge'
wait_for "slurmctld (sinfo)" sinfo -h

# --- 2. Test-CA + host certificate for the real runtime hostname ------------
arcctl test-ca init -f
arcctl test-ca hostcert -n "${HOSTNAME_FQDN}" -f

# --- 3. Start ARC CE services -------------------------------------------------
arcctl service start --as-configured

# --- 4. Test client certificate for griduser01 -------------------------------
arcctl test-ca usercert --install-user griduser01 -f

# Also export a portable tarball, useful if the CI job wants to drive
# arcsub/arcstat/arcget from *outside* this container (e.g. from the
# job's own shell talking to the CE over the docker network).
# Best effort: a failure here must not abort the bootstrap, but it is
# logged so a missing tarball can be explained from the bootstrap log.
arcctl test-ca usercert -n griduser01 --export-tar -f \
  || echo "WARN: exporting the test client tarball failed (continuing)"
mv -f testcert-*.tar.gz /root/arc-test-client.tar.gz 2>/dev/null \
  || echo "WARN: no testcert-*.tar.gz to move to /root/arc-test-client.tar.gz (continuing)"

# --- 5. Wait until the REST endpoint actually answers ------------------------
# --max-time bounds each probe: without it a single stalled connection could
# block inside curl and wait_for's retry limit would never be reached.
# No -f on purpose: any HTTP response (even an error status) still counts as
# "the endpoint answers", exactly as before.
wait_for "arex REST endpoint" curl -sk --max-time 10 "https://${HOSTNAME_FQDN}/arex/rest/1.0/info"

# Readiness marker; arc-bootstrap.service skips itself while this exists.
touch /run/arc-ready
echo "== ARC CE / SLURM bootstrap complete: $(date -u) =="
Loading
Loading