diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fb159a032..bcc7de2fb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -129,6 +129,7 @@ jobs:
outputs:
configurations: ${{ steps.compute.outputs.configurations }}
run_fuzz: ${{ steps.compute.outputs.run_fuzz }}
+ run_reliability: ${{ steps.compute.outputs.run_reliability }}
steps:
- name: Debounce label events
if: github.event.action == 'labeled'
@@ -161,8 +162,14 @@ jobs:
else
echo "run_fuzz=false" >> $GITHUB_OUTPUT
fi
+ if echo "$labels" | grep -Fq "test:reliability"; then
+ echo "run_reliability=true" >> $GITHUB_OUTPUT
+ else
+ echo "run_reliability=false" >> $GITHUB_OUTPUT
+ fi
else
echo "run_fuzz=false" >> $GITHUB_OUTPUT
+ echo "run_reliability=false" >> $GITHUB_OUTPUT
fi
configs="$configs]"
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97e1eb2c8..83ce0def5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,6 +8,9 @@ variables:
FORCE_BUILD:
value: ""
description: "Force build even if no new commits (any non-empty value)"
+ RUN_RELIABILITY:
+ value: "false"
+ description: "Run reliability and chaos tests. Set automatically when the test:reliability label is on the PR."
MAVEN_REPOSITORY_PROXY: "https://depot-read-api-java.us1.ddbuild.io/magicmirror/magicmirror/@current/"
default:
@@ -153,6 +156,66 @@ jdk-integration-test:
forward:
pipeline_variables: true
+# Generates a child pipeline YAML for reliability/chaos tests when the PR
+# carries the test:reliability label (RUN_RELIABILITY=true in build.env).
+# When RUN_RELIABILITY is anything other than "true", a single no-op job is
+# written instead, so generated-reliability.yml exists on both paths.
+generate-reliability-child-pipeline:
+ stage: reliability
+ tags: ["arch:amd64"]
+ image: $PREPARE_IMAGE
+ # Artifacts of prepare:start are downloaded here — presumably the build.env
+ # that carries RUN_RELIABILITY; confirm against that job's definition.
+ needs:
+ - job: prepare:start
+ artifacts: true
+ rules:
+ # Never on scheduled pipelines.
+ - if: '$CI_PIPELINE_SOURCE == "schedule"'
+ when: never
+ # Never when JDK_VERSION, DEBUG_LEVEL, HASH or DOWNSTREAM is set.
+ - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
+ when: never
+ - when: on_success
+ # ${RUN_RELIABILITY:-} tolerates an unset variable; only the exact string
+ # "true" copies the real child pipeline. The heredoc delimiter is quoted
+ # ('NOOP'), so the no-op job text is written without shell expansion.
+ script:
+ - |
+ if [ "${RUN_RELIABILITY:-}" = "true" ]; then
+ echo "Label test:reliability detected — enabling reliability child pipeline"
+ cp .gitlab/reliability/pr-child.gitlab-ci.yml generated-reliability.yml
+ else
+ cat > generated-reliability.yml << 'NOOP'
+ skip-reliability:
+ image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest
+ tags: ["arch:amd64"]
+ script:
+ - echo "Label test:reliability not set — skipping"
+ rules:
+ - when: always
+ NOOP
+ fi
+ # The generated file is the job's only artifact and is kept for one day.
+ artifacts:
+ paths:
+ - generated-reliability.yml
+ expire_in: 1 day
+
+# Trigger job: starts the child pipeline described by generated-reliability.yml
+# (an artifact of generate-reliability-child-pipeline). With strategy: depend
+# this job waits for the child pipeline and reflects its result.
+run-reliability-tests:
+  stage: reliability
+  needs:
+    - job: generate-reliability-child-pipeline
+      artifacts: true
+    - job: prepare:start
+      artifacts: true
+  rules:
+    # Never on scheduled pipelines.
+    - if: '$CI_PIPELINE_SOURCE == "schedule"'
+      when: never
+    # Never when JDK_VERSION, DEBUG_LEVEL, HASH or DOWNSTREAM is set.
+    - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
+      when: never
+    - when: on_success
+  # Re-declared at job level — presumably so the values obtained from
+  # prepare:start are handed to the child pipeline; confirm.
+  variables:
+    DDPROF_COMMIT_BRANCH: "$DDPROF_COMMIT_BRANCH"
+    DDPROF_COMMIT_SHA: "$DDPROF_COMMIT_SHA"
+  trigger:
+    strategy: depend
+    forward:
+      pipeline_variables: true
+    include:
+      - artifact: generated-reliability.yml
+        job: generate-reliability-child-pipeline
+
include:
- local: .gitlab/common.yml
- local: .adms/python/gitlab.yaml
diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index 893322996..960383af6 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -17,13 +17,17 @@ variables:
rules:
- if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
when: never
- - if: '$CI_PIPELINE_SOURCE == "trigger" || $CI_PIPELINE_SOURCE == "pipeline"'
- when: on_success
+ - if: '$CI_PIPELINE_SOURCE == "schedule"'
+ when: never
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+ when: never
- if: '$CI_PIPELINE_SOURCE == "web"'
when: manual
allow_failure: true
- - if: '$CI_PIPELINE_SOURCE == "push"'
- when: manual
+ # Run automatically and non-blocking on any other source (push/trigger/api/
+ # etc.) — mirrors the integration-test rules. The before_script CANCELLED
+ # gate skips branches with no open PR.
+ - when: on_success
allow_failure: true
script: |
# setup the env
@@ -36,8 +40,8 @@ variables:
if [ -z "${CANDIDATE_VERSION}" ]; then echo "Missing candidate version. Skipping."; exit 0; fi
# fetch the common platform scripts
- git config --global url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf "https://github.com/DataDog/"
- git clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR}
+ git -c url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf="https://github.com/DataDog/" \
+ clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR}
# apply the specific step scripts
cp -r .gitlab/benchmarks/steps/* ${PLATFORM_DIR}/steps/
@@ -52,7 +56,6 @@ variables:
${PLATFORM_DIR}/steps/run-benchmarks.sh
${PLATFORM_DIR}/steps/analyze-results.sh
${PLATFORM_DIR}/steps/upload-results-to-s3.sh
- ${PLATFORM_DIR}/steps/post-pr-comment.sh
parallel:
matrix:
- RUN_MODE: ["cpu", "wall", "alloc", "memleak", "cpu,wall", "memleak,alloc", "cpu,wall,alloc,memleak"]
@@ -76,6 +79,36 @@ benchmarks-candidate-aarch64:
KUBERNETES_MEMORY_REQUEST: 200Gi
KUBERNETES_MEMORY_LIMIT: 200Gi
+# Posts the aggregated benchmark comparison to the PR by running
+# .gitlab/benchmarks/post-pr-comment.sh on the reports directory. Pulls
+# artifacts from prepare:start and both benchmarks-candidate-* jobs.
+# allow_failure keeps a failed comment from failing the pipeline.
+post-benchmarks-pr-comment:
+  extends: .retry-config
+  stage: benchmarks
+  image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
+  tags:
+    - "arch:arm64"
+  timeout: 5m
+  allow_failure: true
+  # OIDC token with audience dd-octo-sts, exposed as DDOCTOSTS_ID_TOKEN.
+  id_tokens:
+    DDOCTOSTS_ID_TOKEN:
+      aud: dd-octo-sts
+  needs:
+    - job: prepare:start
+      artifacts: true
+    - job: benchmarks-candidate-amd64
+      artifacts: true
+    - job: benchmarks-candidate-aarch64
+      artifacts: true
+  rules:
+    # Never when JDK_VERSION, DEBUG_LEVEL, HASH or DOWNSTREAM is set.
+    - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
+      when: never
+    # Never on scheduled or merge-request pipelines.
+    - if: '$CI_PIPELINE_SOURCE == "schedule"'
+      when: never
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+      when: never
+    # Every other source: run regardless of how the needed jobs ended, so
+    # whatever results exist are posted back to the PR.
+    - when: always
+  script:
+    - .gitlab/benchmarks/post-pr-comment.sh reports
+
publish-benchmark-gh-pages:
stage: benchmarks
tags: ["arch:arm64"]
diff --git a/.gitlab/benchmarks/post-pr-comment.sh b/.gitlab/benchmarks/post-pr-comment.sh
new file mode 100755
index 000000000..de410dff9
--- /dev/null
+++ b/.gitlab/benchmarks/post-pr-comment.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Post aggregated benchmark comparison results as a single PR comment.
+#
+# Expects all per-cell comparison-baseline-vs-candidate_*.md reports to be
+# present under REPORTS_DIR (default: reports/).
+#
+# Required env:
+# DDPROF_COMMIT_BRANCH – branch name used to locate the open PR
+# Optional env:
+# CI_PIPELINE_URL, DDPROF_COMMIT_SHA
+
+set -euo pipefail
+
+REPORTS_DIR="${1:-reports}"
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Aggregate all per-cell reports into a single comment body
+SECTIONS=""
+for md in "${REPORTS_DIR}"/comparison-baseline-vs-candidate_*.md; do
+ [ -f "${md}" ] || continue
+ label=$(basename "${md}" .md | sed 's/comparison-baseline-vs-candidate_//')
+ SECTIONS="${SECTIONS}
+${label}
+
+$(cat "${md}")
+
+
+"
+done
+
+if [ -z "${SECTIONS}" ]; then
+ echo "No benchmark reports found under ${REPORTS_DIR} — skipping comment"
+ exit 0
+fi
+
+BODY_FILE=$(mktemp)
+trap 'rm -f "${BODY_FILE}"' EXIT
+cat > "${BODY_FILE}" <&2
+ CRASH_MSG="Chaos harness crashed (RC=${RC})"
+ HS_ERR="${HERE}/../../hs_err.log"
+ if [ -f "${HS_ERR}" ]; then
+ SIG=$(grep -m1 '^siginfo:' "${HS_ERR}" 2>/dev/null | tr -d '\n' | cut -c1-120)
+ FRAME=$(grep -m1 'libjavaProfiler\|AsyncProfiler' "${HS_ERR}" 2>/dev/null | sed 's/^[[:space:]]*//' | tr -d '\n' | cut -c1-120)
+ [ -n "${SIG}" ] && CRASH_MSG="${CRASH_MSG};${SIG}"
+ [ -n "${FRAME}" ] && CRASH_MSG="${CRASH_MSG};${FRAME}"
+ fi
+ echo "FAIL:${CRASH_MSG}" >&2
exit 1
fi
diff --git a/.gitlab/reliability/post-pr-comment.sh b/.gitlab/reliability/post-pr-comment.sh
new file mode 100755
index 000000000..8be574a9c
--- /dev/null
+++ b/.gitlab/reliability/post-pr-comment.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# Post aggregated reliability + chaos test results as a single PR comment.
+#
+# Reads REASON_* variables written to build.env by the reliability/chaos jobs
+# and emits a ✅/❌ matrix with failure blocks.
+#
+# Required env:
+# DDPROF_COMMIT_BRANCH – branch name used to locate the open PR
+# Optional env:
+# CI_PIPELINE_URL
+
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# ── Collect failures from REASON_* env vars ────────────────────────────────────
+rel_fail=0; rel_failures=""
+chaos_fail=0; chaos_failures=""
+
+for key in $(compgen -v | grep -E '^REASON_.*X(jit|memory)$' | sort); do
+ reason="${!key}"
+ label="${key#REASON_}"
+ rel_fail=$((rel_fail + 1))
+ detail=$(printf '%s' "${reason//\`/}" | tr ';' '\n')
+ rel_failures="${rel_failures}
+❌ ${label//_/ }
+
+\`\`\`
+${detail}
+\`\`\`
+
+ "
+done
+
+for key in $(compgen -v | grep -E '^REASON_.*Xchaos$' | sort); do
+ reason="${!key}"
+ label="${key#REASON_}"
+ chaos_fail=$((chaos_fail + 1))
+ detail=$(printf '%s' "${reason//\`/}" | tr ';' '\n')
+ chaos_failures="${chaos_failures}
+❌ chaos: ${label//_/ }
+
+\`\`\`
+${detail}
+\`\`\`
+
+ "
+done
+
+# ── Assemble comment ────────────────────────────────────────────────────────────
+total_fail=$((rel_fail + chaos_fail))
+if [ "${total_fail}" -gt 0 ]; then
+ overall="❌ **${total_fail} failure(s) detected**"
+else
+ overall="✅ **All reliability & chaos checks passed**"
+fi
+
+BODY_FILE=$(mktemp)
+trap 'rm -f "${BODY_FILE}"' EXIT
+cat > "${BODY_FILE}" <err.log 1>out.log
+ - REASON=$(grep -m1 'FAIL:' err.log | cut -f2- -d':' | tr -d '\n') || true
+ - if [ -n "${REASON}" ]; then _key=$(printf 'REASON_%s_%s_%sX%s' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${VARIANT}" | tr '+' '_'); echo "${_key}=${REASON}" >> build.env; exit 1; fi
+ after_script:
+ - |
+ if [[ "$CI_JOB_STATUS" == "failed" ]]; then
+ _key=$(printf 'REASON_%s_%s_%sX%s' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${VARIANT}" | tr '+' '_')
+ grep -q "${_key}=" build.env 2>/dev/null || echo "${_key}=Unknown failure, perhaps timeout" >> build.env
+ fi
+ artifacts:
+ name: "results-${ARCH}"
+ when: always
+ paths:
+ - memwatch.log
+ - memwatch-trend.png
+ - hs_err.log
+ - err.log
+ - out.log
+ reports:
+ dotenv: build.env
+ expire_in: 1 day
+
+# amd64 instance of the .reliability_pr_job template.
+reliability-amd64:
+  extends: .reliability_pr_job
+  image: $BENCHMARK_IMAGE_AMD64
+  tags:
+    - "arch:amd64"
+  variables:
+    ARCH: amd64
+
+# aarch64 instance of the .reliability_pr_job template (arm64 runners).
+reliability-aarch64:
+  extends: .reliability_pr_job
+  image: $BENCHMARK_IMAGE_ARM64
+  tags:
+    - "arch:arm64"
+  variables:
+    ARCH: aarch64
+
+# ── Chaos ────────────────────────────────────────────────────────────────────
+# chaos_check.sh builds chaos.jar inline (via Gradle) when the artifact is
+# absent, and downloads ddprof from Maven snapshots when no local jar exists.
+
+# Template for the chaos jobs; the concrete jobs extending it supply ARCH,
+# the image and the runner tags. Each matrix cell runs chaos_check.sh with
+# stdout/stderr captured to out.log/err.log. The first 'FAIL:' line found in
+# err.log is written to build.env under a REASON_*Xchaos key and fails the job.
+# NOTE(review): because of `set +e`, a non-zero exit from chaos_check.sh that
+# leaves no 'FAIL:' line in err.log does not fail the job — confirm intended.
+.reliability_chaos_pr_job:
+  stage: test
+  timeout: 6h
+  needs:
+    - job: get-versions
+      artifacts: true
+  rules:
+    - when: on_success
+  variables:
+    RUNTIME: "120"
+  parallel:
+    matrix:
+      - CONFIG:
+          - "profiler"
+          - "profiler+tracer"
+        ALLOCATOR:
+          - "gmalloc"
+          - "jemalloc"
+          - "tcmalloc"
+        CHAOS_JDK:
+          - "21.0.3-tem"
+          - "25.0.3-tem"
+  script:
+    - set +e
+    - echo "runtime=${RUNTIME}, config=${CONFIG}, allocator=${ALLOCATOR}, arch=${ARCH}, jdk=${CHAOS_JDK}"
+    - CHAOS_JDK="${CHAOS_JDK}" .gitlab/reliability/chaos_check.sh "$RUNTIME" "$CONFIG" "$ALLOCATOR" 2>err.log 1>out.log
+    - REASON=$(grep -m1 'FAIL:' err.log | cut -f2- -d':' | tr -d '\n') || true
+    - if [ -n "${REASON}" ]; then _key=$(printf 'REASON_%s_%s_%s_%sXchaos' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${CHAOS_JDK//[.-]/_}" | tr '+' '_'); echo "${_key}=${REASON}" >> build.env; exit 1; fi
+  # If the job failed without the script having recorded a reason for this
+  # cell, record a generic one so the failure still appears in build.env.
+  after_script:
+    - |
+      if [[ "$CI_JOB_STATUS" == "failed" ]]; then
+        _key=$(printf 'REASON_%s_%s_%s_%sXchaos' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${CHAOS_JDK//[.-]/_}" | tr '+' '_')
+        grep -q "${_key}=" build.env 2>/dev/null || echo "${_key}=Unknown failure, perhaps timeout" >> build.env
+      fi
+  # Logs are uploaded on success and failure alike; build.env is published as
+  # a dotenv report.
+  artifacts:
+    name: "chaos-results-${ARCH}"
+    when: always
+    expire_in: 1 day
+    paths:
+      - hs_err.log
+      - err.log
+      - out.log
+    reports:
+      dotenv: build.env
+
+# amd64 instance of the .reliability_chaos_pr_job template.
+reliability-chaos-amd64:
+  extends: .reliability_chaos_pr_job
+  image: $BENCHMARK_IMAGE_AMD64
+  tags:
+    - "arch:amd64"
+  variables:
+    ARCH: amd64
+
+# aarch64 instance of the .reliability_chaos_pr_job template (arm64 runners).
+reliability-chaos-aarch64:
+  extends: .reliability_chaos_pr_job
+  image: $BENCHMARK_IMAGE_ARM64
+  tags:
+    - "arch:arm64"
+  variables:
+    ARCH: aarch64
+
+# ── PR comment ───────────────────────────────────────────────────────────────
+
+# Posts the reliability/chaos summary to the PR by running
+# .gitlab/reliability/post-pr-comment.sh. Pulls artifacts from all four
+# reliability and chaos jobs. `when: always` lets it run even after a needed
+# job failed; allow_failure keeps a failed comment from failing the pipeline.
+post-reliability-pr-comment:
+  extends: .retry-config
+  stage: notify
+  image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
+  tags:
+    - "arch:arm64"
+  timeout: 5m
+  allow_failure: true
+  # OIDC token with audience dd-octo-sts, exposed as DDOCTOSTS_ID_TOKEN.
+  id_tokens:
+    DDOCTOSTS_ID_TOKEN:
+      aud: dd-octo-sts
+  needs:
+    - job: reliability-amd64
+      artifacts: true
+    - job: reliability-aarch64
+      artifacts: true
+    - job: reliability-chaos-amd64
+      artifacts: true
+    - job: reliability-chaos-aarch64
+      artifacts: true
+  rules:
+    - when: always
+  script:
+    - .gitlab/reliability/post-pr-comment.sh
diff --git a/.gitlab/scripts/prepare.sh b/.gitlab/scripts/prepare.sh
index 1b803c0c0..38f841730 100755
--- a/.gitlab/scripts/prepare.sh
+++ b/.gitlab/scripts/prepare.sh
@@ -22,6 +22,14 @@ if [ "${CI_PIPELINE_SOURCE}" == "push" ] || [ "${CI_PIPELINE_SOURCE}" == "trigge
echo "CANCELLED=true" >> build.env
exit 0
fi
+ # Detect PR labels and export flags for downstream jobs
+ if command -v jq >/dev/null 2>&1; then
+ if echo "${API_RESPONSE}" | jq -e '[.[0].labels[].name] | any(. == "test:reliability")' >/dev/null 2>&1; then
+ echo "RUN_RELIABILITY=true" >> build.env
+ fi
+ elif echo "${API_RESPONSE}" | grep -q '"test:reliability"'; then
+ echo "RUN_RELIABILITY=true" >> build.env
+ fi
fi
fi
diff --git a/.gitlab/scripts/upsert-github-pr-comment.sh b/.gitlab/scripts/upsert-github-pr-comment.sh
new file mode 100755
index 000000000..c7a60a738
--- /dev/null
+++ b/.gitlab/scripts/upsert-github-pr-comment.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# Upsert a comment on the java-profiler GitHub PR for the current branch.
+#
+# Posts (or replaces) a single marker-tagged comment using a short-lived GitHub
+# token obtained via dd-octo-sts. No pr-commenter / benchmarking-platform clone
+# is required — only dd-octo-sts (present in dd-octo-sts-ci-base) plus curl/jq.
+#
+# Usage:
+#   upsert-github-pr-comment.sh <comment-id> <branch> <body-file>
+#
+# comment-id : unique slug used as an HTML marker to find/replace the comment
+# branch : head branch name used to locate the open PR
+# body-file : path to a file holding the markdown comment body
+#
+# Requires in CI: dd-octo-sts CLI + DDOCTOSTS_ID_TOKEN id_token, curl, jq.
+# Token policy async-profiler-build.ci grants issues:write + pull_requests:read.
+
+# Strict mode: stop on a failing command, an unset variable, or a failing
+# stage anywhere in a pipeline.
+set -o errexit -o nounset -o pipefail
+
+# Positional arguments — all mandatory; ${N:?msg} aborts with msg when the
+# argument is missing or empty.
+COMMENT_ID="${1:?comment-id required}"
+BRANCH="${2:?branch required}"
+BODY_FILE="${3:?body-file required}"
+
+# Target repository and the REST base URL derived from it.
+REPO="DataDog/java-profiler"
+API="https://api.github.com/repos/${REPO}"
+
+# log <message...> — write a prefixed diagnostic line to stderr.
+log() {
+  printf '%s\n' "[upsert-pr-comment] $*" >&2
+}
+
+# gh_api <method> <url> [data] — perform one GitHub API call with curl.
+# curl is told (-w) to append the HTTP status on a trailing line of its own,
+# which is then split off from the response body. For a status >= 400 the
+# status and body are logged and 1 is returned; otherwise only the body is
+# written to stdout. A failing curl invocation is logged and returns 1 too.
+# Reads the global TOKEN for the Authorization header.
+gh_api() {
+  local verb="$1" endpoint="$2" payload="${3:-}"
+  local -a curl_args=(
+    -sS
+    -X "${verb}"
+    -H "Authorization: Bearer ${TOKEN}"
+    -H "Accept: application/vnd.github+json"
+    -H "X-GitHub-Api-Version: 2022-11-28"
+    -H "User-Agent: java-profiler-ci"
+    -w $'\n%{http_code}'
+  )
+  # A request body is sent only when one was supplied.
+  if [ -n "${payload}" ]; then
+    curl_args+=(-d "${payload}")
+  fi
+  local raw http_status content
+  if ! raw=$(curl "${curl_args[@]}" "${endpoint}"); then
+    log "curl failed for ${verb} ${endpoint}"
+    return 1
+  fi
+  # Text after the last newline is the status; everything before it is the body.
+  http_status="${raw##*$'\n'}"
+  content="${raw%$'\n'*}"
+  if [ "${http_status}" -ge 400 ]; then
+    log "GitHub API ${verb} ${endpoint} -> HTTP ${http_status}"
+    log "Response: ${content}"
+    return 1
+  fi
+  printf '%s' "${content}"
+}
+
+# Main flow. Every "nothing to do" path exits 0 so a missing PR, token or body
+# never fails the calling job; a failing GitHub call aborts via `set -e`
+# after gh_api has logged the HTTP status and response.
+
+# Never comment for the default branches. BRANCH is already guaranteed
+# non-empty by its ${2:?...} assignment, so the -z test is purely defensive.
+if [ -z "${BRANCH}" ] || [ "${BRANCH}" = "main" ] || [ "${BRANCH}" = "master" ]; then
+  log "Skipping PR comment for branch: ${BRANCH:-}"
+  exit 0
+fi
+# -s is false for a missing file as well as for an empty one.
+if [ ! -s "${BODY_FILE}" ]; then
+  log "Empty body file (${BODY_FILE}) — nothing to post"
+  exit 0
+fi
+
+# 1. Obtain a GitHub token via dd-octo-sts (no stored secrets). Trim whitespace
+#    and validate the format, mirroring publish-gh-pages.sh — a token polluted
+#    with log noise/newlines produces a malformed header and a GitHub 403.
+TOKEN=$(dd-octo-sts token --scope "${REPO}" --policy async-profiler-build.ci 2>/tmp/octo-sts.err || true)
+TOKEN="${TOKEN//[$'\t\r\n ']/}"
+if [ -z "${TOKEN}" ]; then
+  log "Failed to obtain GitHub token via dd-octo-sts — skipping comment"
+  [ -s /tmp/octo-sts.err ] && log "dd-octo-sts: $(head -3 /tmp/octo-sts.err)"
+  exit 0
+fi
+if [[ ! "${TOKEN}" =~ ^(ghs_|ghp_|github_pat_|v1\.|[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.) ]]; then
+  log "dd-octo-sts returned an unexpected token format (first 8 chars: ${TOKEN:0:8}) — skipping"
+  exit 0
+fi
+
+# 2. Resolve the open PR for this branch. The branch name is percent-encoded
+#    first so characters such as '#', '&' or '+' cannot corrupt the query.
+BRANCH_URI=$(jq -rn --arg b "${BRANCH}" '$b | @uri')
+PR=$(gh_api GET "${API}/pulls?head=DataDog:${BRANCH_URI}&state=open&per_page=1" | jq -r '.[0].number // empty')
+if [ -z "${PR}" ]; then
+  log "No open PR found for branch ${BRANCH} — skipping comment"
+  exit 0
+fi
+
+# 3. Prepend a stable marker and build the JSON payload safely. The marker is
+#    an HTML comment (invisible once rendered) derived from COMMENT_ID. It must
+#    never be empty: jq's contains("") is true for every string, so an empty
+#    marker would make step 4 overwrite an unrelated comment.
+MARKER="<!-- ${COMMENT_ID} -->"
+BODY="${MARKER}"$'\n'"$(cat "${BODY_FILE}")"
+PAYLOAD=$(jq -n --arg body "${BODY}" '{body: $body}')
+
+# 4. Find an existing marker comment and PATCH it, otherwise POST a new one.
+#    The first match is selected inside jq rather than with `| head -n1`,
+#    which under pipefail can abort the script when head closes the pipe
+#    early. A null body is treated as "" so contains() cannot error.
+#    NOTE(review): only the first 100 comments of the PR are inspected.
+CID=$(gh_api GET "${API}/issues/${PR}/comments?per_page=100" \
+  | jq -r --arg m "${MARKER}" '[.[] | select((.body // "") | contains($m)) | .id][0] // empty')
+
+if [ -n "${CID}" ]; then
+  gh_api PATCH "${API}/issues/comments/${CID}" "${PAYLOAD}" >/dev/null
+  log "Updated comment ${CID} on PR #${PR}"
+else
+  gh_api POST "${API}/issues/${PR}/comments" "${PAYLOAD}" >/dev/null
+  log "Created comment on PR #${PR}"
+fi