From 4a85b547ee9c5b00bbbfbb6956a994cbe6d4787c Mon Sep 17 00:00:00 2001
From: Juraj Smiesko
Date: Thu, 2 Jul 2026 14:45:44 +0200
Subject: [PATCH 1/2] ci: add containerised Slurm test cluster

Single-node Slurm 25.11.2 cluster running in two Podman containers
(controller + compute node) on Ubuntu 26.04, for testing job submission
locally and in GitHub CI without real HPC infrastructure.
---
 slurm-test/.github/workflows/slurm-test.yml |  83 ++++++++++++++
 slurm-test/Containerfile                    |  35 ++++++
 slurm-test/README.md                        | 121 ++++++++++++++++++++
 slurm-test/compose.yml                      |  23 ++++
 slurm-test/configs/cgroup.conf              |   3 +
 slurm-test/configs/slurm.conf               |  31 +++++
 slurm-test/scripts/entrypoint.sh            |  34 ++++++
 slurm-test/test-jobs/array.sh               |  10 ++
 slurm-test/test-jobs/collatz.sh             |  28 +++++
 slurm-test/test-jobs/simple.sh              |  10 ++
 10 files changed, 378 insertions(+)
 create mode 100644 slurm-test/.github/workflows/slurm-test.yml
 create mode 100644 slurm-test/Containerfile
 create mode 100644 slurm-test/README.md
 create mode 100644 slurm-test/compose.yml
 create mode 100644 slurm-test/configs/cgroup.conf
 create mode 100644 slurm-test/configs/slurm.conf
 create mode 100644 slurm-test/scripts/entrypoint.sh
 create mode 100644 slurm-test/test-jobs/array.sh
 create mode 100644 slurm-test/test-jobs/collatz.sh
 create mode 100644 slurm-test/test-jobs/simple.sh

diff --git a/slurm-test/.github/workflows/slurm-test.yml b/slurm-test/.github/workflows/slurm-test.yml
new file mode 100644
index 0000000..68faaf6
--- /dev/null
+++ b/slurm-test/.github/workflows/slurm-test.yml
@@ -0,0 +1,83 @@
+name: Slurm Tests
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  slurm-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install podman-compose
+        run: pip install podman-compose
+
+      - name: Build image
+        run: podman build -t slurm-test .
+
+      - name: Start cluster
+        run: |
+          podman network create slurm
+          podman run -d --name intercede-slurmctld --hostname intercede-slurmctld \
+            --network slurm --privileged slurm-test controller
+          podman run -d --name intercede-c1 --hostname intercede-c1 \
+            --network slurm --privileged slurm-test worker
+
+      - name: Wait for cluster to be ready
+        run: |
+          for i in $(seq 1 20); do
+            if podman exec intercede-slurmctld sinfo --noheader 2>/dev/null | grep -q "idle"; then
+              echo "Cluster ready"
+              break
+            fi
+            # Fail the step on timeout instead of submitting jobs to a cluster that never came up.
+            if [ "$i" -eq 20 ]; then
+              echo "Cluster did not become ready after 20 attempts" >&2
+              podman exec intercede-slurmctld sinfo || true
+              exit 1
+            fi
+            echo "Waiting for cluster... ($i/20)"
+            sleep 3
+          done
+          podman exec intercede-slurmctld sinfo
+
+      - name: Submit simple job
+        run: podman exec intercede-slurmctld sbatch /test-jobs/simple.sh
+
+      - name: Submit array job
+        run: podman exec intercede-slurmctld sbatch --array=1-4 /test-jobs/array.sh
+
+      - name: Wait for all jobs to complete
+        run: |
+          for i in $(seq 1 30); do
+            # Capture squeue on its own so a failing controller aborts the step
+            # (bash -e) instead of being counted as an empty queue.
+            QUEUE=$(podman exec intercede-slurmctld squeue --noheader)
+            if [ -z "$QUEUE" ]; then
+              echo "All jobs completed"
+              break
+            fi
+            if [ "$i" -eq 30 ]; then
+              echo "Jobs still in queue after 30 attempts" >&2
+              printf '%s\n' "$QUEUE" >&2
+              exit 1
+            fi
+            PENDING=$(printf '%s\n' "$QUEUE" | wc -l)
+            echo "Waiting... $PENDING job(s) still in queue (attempt $i/30)"
+            podman exec intercede-slurmctld squeue
+            sleep 5
+          done
+
+      - name: Show job output
+        run: |
+          podman exec intercede-c1 bash -c 'cat /tmp/slurm-*.out 2>/dev/null || echo "No output files found"'
+
+      - name: Show logs on failure
+        if: failure()
+        run: |
+          echo "=== slurmctld log ==="
+          podman exec intercede-slurmctld cat /var/log/slurm/slurmctld.log || true
+          echo "=== slurmd log ==="
+          podman exec intercede-c1 cat /var/log/slurm/slurmd.log || true
diff --git a/slurm-test/Containerfile b/slurm-test/Containerfile
new file mode 100644
index 0000000..d90b690
--- /dev/null
+++ b/slurm-test/Containerfile
@@ -0,0 +1,35 @@
+FROM ubuntu:26.04
+
+# Build-time only: keeps apt non-interactive during the build without
+# leaking DEBIAN_FRONTEND into the environment of running containers.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install the Slurm daemons, client tools and munge; skip recommended
+# packages and drop the apt lists in the same layer to keep the image small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        munge \
+        slurm-client \
+        slurmctld \
+        slurmd && \
+    rm -rf /var/lib/apt/lists/*
+
+# Spool and log directories referenced by slurm.conf, owned by the slurm user.
+RUN mkdir -p /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \
+    chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm
+
+# Bake a shared munge key so all containers from this image authenticate each other
+# without needing a runtime volume. Fine for testing; never do this in production.
+RUN dd if=/dev/urandom bs=1 count=1024 2>/dev/null > /etc/munge/munge.key && \
+    chown munge:munge /etc/munge/munge.key && \
+    chmod 400 /etc/munge/munge.key
+
+COPY configs/slurm.conf /etc/slurm/slurm.conf
+COPY configs/cgroup.conf /etc/slurm/cgroup.conf
+COPY test-jobs/ /test-jobs/
+COPY scripts/entrypoint.sh /entrypoint.sh
+
+RUN chmod +x /entrypoint.sh /test-jobs/*.sh
+
+# entrypoint.sh picks the daemon to start from its first argument (controller or worker).
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/slurm-test/README.md b/slurm-test/README.md
new file mode 100644
index 0000000..9e4a9d3
--- /dev/null
+++ b/slurm-test/README.md
@@ -0,0 +1,121 @@
+# Slurm Test Cluster
+
+A containerised single-node Slurm cluster for testing job submission locally and in GitHub CI.
+ +## Architecture + +Two containers are built from the same image and run on a shared network: + +| Container | Hostname | Role | +|---|---|---| +| `intercede-slurmctld` | `intercede-slurmctld` | Controller (`slurmctld`) — submit jobs here | +| `intercede-c1` | `intercede-c1` | Compute node (`slurmd`) | + +The munge authentication key is baked into the image at build time so both containers share it automatically — no runtime volume coordination needed. + +## Prerequisites + +- **Fedora/RHEL:** `dnf install podman podman-compose` +- **Ubuntu 26.04 / 24.04:** `apt install podman podman-compose` +- **Ubuntu 22.04:** `apt install podman && pip install podman-compose` (the apt package is too old) + +## Local usage + +**Build and start the cluster:** + +```bash +podman-compose up --build +``` + +**In a separate terminal, check the cluster is up:** + +```bash +podman exec intercede-slurmctld sinfo +``` + +You should see `intercede-c1` with state `idle`. + +**Submit a job:** + +```bash +podman exec intercede-slurmctld sbatch /test-jobs/simple.sh +``` + +**Watch the queue:** + +```bash +podman exec intercede-slurmctld squeue +``` + +**Submit a job array (4 tasks):** + +```bash +podman exec intercede-slurmctld sbatch --array=1-4 /test-jobs/array.sh +``` + +**Read job output** (written to `/tmp/` inside the compute node): + +```bash +podman exec intercede-c1 bash -c 'cat /tmp/slurm-*.out' +``` + +**Tail the Slurm logs:** + +```bash +# Controller log +podman exec intercede-slurmctld tail -f /var/log/slurm/slurmctld.log + +# Compute node log +podman exec intercede-c1 tail -f /var/log/slurm/slurmd.log +``` + +**Tear down:** + +```bash +podman-compose down +``` + +## Running your own job script + +Write a batch script with `#SBATCH` directives and submit it via the controller: + +```bash +podman cp my-job.sh intercede-slurmctld:/tmp/my-job.sh +podman exec intercede-slurmctld sbatch /tmp/my-job.sh +``` + +## GitHub CI + +The workflow in `.github/workflows/slurm-test.yml` runs 
automatically on push and pull request. It: + +1. Builds the image with `podman build` +2. Creates a `slurm` network and starts both containers +3. Polls `sinfo` until the compute node shows `idle` +4. Submits the simple and array test jobs +5. Waits for all jobs to leave the queue +6. Prints job output +7. On failure, dumps the `slurmctld` and `slurmd` logs + +## Project structure + +``` +. +├── Containerfile # Rocky Linux 9 + EPEL Slurm + Munge +├── compose.yml # Local development with podman-compose +├── configs/ +│ └── slurm.conf # Slurm configuration +├── scripts/ +│ └── entrypoint.sh # Starts the right daemon based on role arg +├── test-jobs/ +│ ├── simple.sh # Single-task job +│ └── array.sh # 4-task array job +└── .github/ + └── workflows/ + └── slurm-test.yml # GitHub Actions CI workflow +``` + +## Configuration notes + +- `ProctrackType=proctrack/linuxproc` — avoids cgroup kernel requirements inside containers +- `TaskPlugin=task/none` — likewise avoids cgroup task management +- `ReturnToService=2` — nodes automatically return to service after being down, useful when the compute container starts slightly after the controller diff --git a/slurm-test/compose.yml b/slurm-test/compose.yml new file mode 100644 index 0000000..20c6081 --- /dev/null +++ b/slurm-test/compose.yml @@ -0,0 +1,23 @@ +services: + intercede-slurmctld: + build: . + container_name: intercede-slurmctld + hostname: intercede-slurmctld + command: ["controller"] + privileged: true + networks: + - slurm + + intercede-c1: + build: . + container_name: intercede-c1 + hostname: intercede-c1 + command: ["worker"] + privileged: true + depends_on: + - intercede-slurmctld + networks: + - slurm + +networks: + slurm: diff --git a/slurm-test/configs/cgroup.conf b/slurm-test/configs/cgroup.conf new file mode 100644 index 0000000..efc385f --- /dev/null +++ b/slurm-test/configs/cgroup.conf @@ -0,0 +1,3 @@ +# Tell slurmstepd not to attempt systemd scope creation via dbus. 
+# Required when running inside a container without systemd. +IgnoreSystemd=yes diff --git a/slurm-test/configs/slurm.conf b/slurm-test/configs/slurm.conf new file mode 100644 index 0000000..76ddb40 --- /dev/null +++ b/slurm-test/configs/slurm.conf @@ -0,0 +1,31 @@ +ClusterName=intercede-test +SlurmctldHost=intercede-slurmctld + +MpiDefault=none +# linuxproc avoids cgroup kernel requirements — required inside containers +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none +SwitchType=switch/none + +ReturnToService=2 +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=slurm +StateSaveLocation=/var/spool/slurmctld + +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core + +AccountingStorageType=accounting_storage/none +JobCompType=jobcomp/none +JobAcctGatherType=jobacct_gather/none + +SlurmctldDebug=info +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=info +SlurmdLogFile=/var/log/slurm/slurmd.log + +NodeName=intercede-c1 State=UNKNOWN +PartitionName=test Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/slurm-test/scripts/entrypoint.sh b/slurm-test/scripts/entrypoint.sh new file mode 100644 index 0000000..e65b438 --- /dev/null +++ b/slurm-test/scripts/entrypoint.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -euo pipefail + +ROLE="${1:-controller}" + +start_munge() { + mkdir -p /var/run/munge + chown munge:munge /var/run/munge + runuser -u munge -- munged + sleep 1 +} + +case "$ROLE" in + controller) + start_munge + slurmctld + sleep 2 + echo "==> slurmctld ready" + sinfo || true + exec tail -f --retry /var/log/slurm/slurmctld.log + ;; + + worker) + start_munge + slurmd + sleep 1 + echo "==> slurmd ready on $(hostname)" + exec tail -f --retry /var/log/slurm/slurmd.log + ;; + + *) + exec "$@" + ;; +esac diff --git a/slurm-test/test-jobs/array.sh b/slurm-test/test-jobs/array.sh new file mode 100644 index 0000000..b6f914e --- /dev/null +++ 
b/slurm-test/test-jobs/array.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH --job-name=array-test +#SBATCH --array=1-4 +#SBATCH --ntasks=1 +#SBATCH --time=00:01:00 +#SBATCH --output=/tmp/slurm-%A_%a.out + +echo "Array job $SLURM_ARRAY_JOB_ID task $SLURM_ARRAY_TASK_ID on $(hostname)" +sleep 2 +echo "Task $SLURM_ARRAY_TASK_ID done" diff --git a/slurm-test/test-jobs/collatz.sh b/slurm-test/test-jobs/collatz.sh new file mode 100644 index 0000000..b4e07b7 --- /dev/null +++ b/slurm-test/test-jobs/collatz.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=collatz +#SBATCH --ntasks=1 +#SBATCH --time=00:01:00 +#SBATCH --output=/tmp/slurm-%j.out + +start=$(( SLURM_JOB_ID % 1000 )) +# Avoid starting at 0 +[ "$start" -eq 0 ] && start=1000 + +echo "Job $SLURM_JOB_ID: Collatz sequence from $start" + +n=$start +steps=0 +sequence="$n" + +while [ "$n" -ne 1 ]; do + if [ $(( n % 2 )) -eq 0 ]; then + n=$(( n / 2 )) + else + n=$(( n * 3 + 1 )) + fi + steps=$(( steps + 1 )) + sequence="$sequence → $n" +done + +echo "Sequence: $sequence" +echo "Steps: $steps" diff --git a/slurm-test/test-jobs/simple.sh b/slurm-test/test-jobs/simple.sh new file mode 100644 index 0000000..e861af9 --- /dev/null +++ b/slurm-test/test-jobs/simple.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH --job-name=hello +#SBATCH --ntasks=1 +#SBATCH --time=00:01:00 +#SBATCH --output=/tmp/slurm-%j.out + +echo "Hello from Slurm job $SLURM_JOB_ID on $(hostname)" +date +sleep 3 +echo "Done" From d68faa3643ca38508c05013d283e08e301a861af Mon Sep 17 00:00:00 2001 From: Juraj Smiesko Date: Thu, 2 Jul 2026 14:59:43 +0200 Subject: [PATCH 2/2] ci: rename slurm-test to standalone-slurm, add integration workflow - Rename slurm-test/ to standalone-slurm/ - Add .github/workflows/slurm-integration.yml triggered on changes to standalone-slurm/ or src/, replacing the misplaced workflow that was inside the directory and never picked up by GitHub Actions --- .../workflows/slurm-integration.yml | 36 +++++++++++++------ .../Containerfile | 0 
{slurm-test => standalone-slurm}/README.md | 22 ++++++------ {slurm-test => standalone-slurm}/compose.yml | 0 .../configs/cgroup.conf | 0 .../configs/slurm.conf | 0 .../scripts/entrypoint.sh | 0 .../test-jobs/array.sh | 0 .../test-jobs/collatz.sh | 0 .../test-jobs/simple.sh | 0 10 files changed, 36 insertions(+), 22 deletions(-) rename slurm-test/.github/workflows/slurm-test.yml => .github/workflows/slurm-integration.yml (71%) rename {slurm-test => standalone-slurm}/Containerfile (100%) rename {slurm-test => standalone-slurm}/README.md (79%) rename {slurm-test => standalone-slurm}/compose.yml (100%) rename {slurm-test => standalone-slurm}/configs/cgroup.conf (100%) rename {slurm-test => standalone-slurm}/configs/slurm.conf (100%) rename {slurm-test => standalone-slurm}/scripts/entrypoint.sh (100%) rename {slurm-test => standalone-slurm}/test-jobs/array.sh (100%) rename {slurm-test => standalone-slurm}/test-jobs/collatz.sh (100%) rename {slurm-test => standalone-slurm}/test-jobs/simple.sh (100%) diff --git a/slurm-test/.github/workflows/slurm-test.yml b/.github/workflows/slurm-integration.yml similarity index 71% rename from slurm-test/.github/workflows/slurm-test.yml rename to .github/workflows/slurm-integration.yml index 68faaf6..b1a00e3 100644 --- a/slurm-test/.github/workflows/slurm-test.yml +++ b/.github/workflows/slurm-integration.yml @@ -1,21 +1,35 @@ -name: Slurm Tests +name: Slurm Integration Tests on: push: + branches: + - main + paths: + - 'standalone-slurm/**' + - 'src/**' pull_request: + branches: + - main + paths: + - 'standalone-slurm/**' + - 'src/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: - slurm-test: + slurm-integration: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Install podman-compose run: pip install podman-compose - - name: Build image - run: podman build -t slurm-test . 
+ - name: Build Slurm image + run: podman build -t slurm-test standalone-slurm/ - name: Start cluster run: | @@ -37,11 +51,11 @@ jobs: done podman exec intercede-slurmctld sinfo - - name: Submit simple job - run: podman exec intercede-slurmctld sbatch /test-jobs/simple.sh - - - name: Submit array job - run: podman exec intercede-slurmctld sbatch --array=1-4 /test-jobs/array.sh + - name: Submit test jobs + run: | + podman exec intercede-slurmctld sbatch /test-jobs/simple.sh + podman exec intercede-slurmctld sbatch --array=1-4 /test-jobs/array.sh + podman exec intercede-slurmctld sbatch /test-jobs/collatz.sh - name: Wait for all jobs to complete run: | @@ -58,7 +72,7 @@ jobs: - name: Show job output run: | - podman exec intercede-c1 bash -c 'cat /tmp/slurm-*.out 2>/dev/null || echo "No output files found"' + podman exec intercede-c1 bash -c 'cat /tmp/slurm-*.out' - name: Show logs on failure if: failure() diff --git a/slurm-test/Containerfile b/standalone-slurm/Containerfile similarity index 100% rename from slurm-test/Containerfile rename to standalone-slurm/Containerfile diff --git a/slurm-test/README.md b/standalone-slurm/README.md similarity index 79% rename from slurm-test/README.md rename to standalone-slurm/README.md index 9e4a9d3..f24dd83 100644 --- a/slurm-test/README.md +++ b/standalone-slurm/README.md @@ -86,12 +86,13 @@ podman exec intercede-slurmctld sbatch /tmp/my-job.sh ## GitHub CI -The workflow in `.github/workflows/slurm-test.yml` runs automatically on push and pull request. It: +The workflow in `.github/workflows/slurm-integration.yml` runs automatically on push and pull +request when files under `standalone-slurm/` or `src/` change. It: 1. Builds the image with `podman build` 2. Creates a `slurm` network and starts both containers 3. Polls `sinfo` until the compute node shows `idle` -4. Submits the simple and array test jobs +4. Submits the simple, array, and Collatz test jobs 5. Waits for all jobs to leave the queue 6. Prints job output 7. 
On failure, dumps the `slurmctld` and `slurmd` logs @@ -99,19 +100,18 @@ The workflow in `.github/workflows/slurm-test.yml` runs automatically on push an ## Project structure ``` -. -├── Containerfile # Rocky Linux 9 + EPEL Slurm + Munge +standalone-slurm/ +├── Containerfile # Ubuntu 26.04 + Slurm + Munge ├── compose.yml # Local development with podman-compose ├── configs/ -│ └── slurm.conf # Slurm configuration +│ ├── slurm.conf # Slurm configuration +│ └── cgroup.conf # Disables systemd scope creation (container-safe) ├── scripts/ │ └── entrypoint.sh # Starts the right daemon based on role arg -├── test-jobs/ -│ ├── simple.sh # Single-task job -│ └── array.sh # 4-task array job -└── .github/ - └── workflows/ - └── slurm-test.yml # GitHub Actions CI workflow +└── test-jobs/ + ├── simple.sh # Single-task job + ├── array.sh # 4-task array job + └── collatz.sh # Collatz sequence (starting number = job ID % 1000) ``` ## Configuration notes diff --git a/slurm-test/compose.yml b/standalone-slurm/compose.yml similarity index 100% rename from slurm-test/compose.yml rename to standalone-slurm/compose.yml diff --git a/slurm-test/configs/cgroup.conf b/standalone-slurm/configs/cgroup.conf similarity index 100% rename from slurm-test/configs/cgroup.conf rename to standalone-slurm/configs/cgroup.conf diff --git a/slurm-test/configs/slurm.conf b/standalone-slurm/configs/slurm.conf similarity index 100% rename from slurm-test/configs/slurm.conf rename to standalone-slurm/configs/slurm.conf diff --git a/slurm-test/scripts/entrypoint.sh b/standalone-slurm/scripts/entrypoint.sh similarity index 100% rename from slurm-test/scripts/entrypoint.sh rename to standalone-slurm/scripts/entrypoint.sh diff --git a/slurm-test/test-jobs/array.sh b/standalone-slurm/test-jobs/array.sh similarity index 100% rename from slurm-test/test-jobs/array.sh rename to standalone-slurm/test-jobs/array.sh diff --git a/slurm-test/test-jobs/collatz.sh b/standalone-slurm/test-jobs/collatz.sh similarity index 
100% rename from slurm-test/test-jobs/collatz.sh rename to standalone-slurm/test-jobs/collatz.sh diff --git a/slurm-test/test-jobs/simple.sh b/standalone-slurm/test-jobs/simple.sh similarity index 100% rename from slurm-test/test-jobs/simple.sh rename to standalone-slurm/test-jobs/simple.sh