diff --git a/ci-operator/config/openshift-online/rosa-e2e/openshift-online-rosa-e2e-main__ocm-fvt-rosa-hcp-production.yaml b/ci-operator/config/openshift-online/rosa-e2e/openshift-online-rosa-e2e-main__ocm-fvt-rosa-hcp-production.yaml
index 847dd02f9961b..debfc61a69f71 100644
--- a/ci-operator/config/openshift-online/rosa-e2e/openshift-online-rosa-e2e-main__ocm-fvt-rosa-hcp-production.yaml
+++ b/ci-operator/config/openshift-online/rosa-e2e/openshift-online-rosa-e2e-main__ocm-fvt-rosa-hcp-production.yaml
@@ -28,9 +28,15 @@ tests:
   cron: 0 3 * * 1
   nested_podman: true
   steps:
+    allow_best_effort_post_steps: true
     env:
       OCM_FVT_JOB_NAME: cs-rosa-hcp-ad-production-main
       OCM_FVT_OCM_ENV: production
+    post:
+    - ref: rosa-e2e-unsilence-alerts
+    pre:
+    - ref: rosa-e2e-record-start-time
+    - ref: rosa-e2e-silence-alerts
     test:
     - ref: rosa-e2e-ocm-fvt
   timeout: 5h0m0s
diff --git a/ci-operator/step-registry/rosa/e2e/silence-alerts/OWNERS b/ci-operator/step-registry/rosa/e2e/silence-alerts/OWNERS
new file mode 100644
index 0000000000000..91b6e6a61b26e
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/silence-alerts/OWNERS
@@ -0,0 +1,16 @@
+approvers:
+- bmeng
+- dustman9000
+- gdbranco
+- jfrazierredhat
+- lucasponce
+- ravitri
+- tiwillia
+reviewers:
+- bmeng
+- dustman9000
+- gdbranco
+- jfrazierredhat
+- lucasponce
+- ravitri
+- tiwillia
diff --git a/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-commands.sh b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-commands.sh
new file mode 100644
index 0000000000000..5f96659ce61a0
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-commands.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Creates Alertmanager silences on every RHOBS cell of the selected environment
+# and records one "CELL|SILENCE_ID" line per created silence in
+# ${SHARED_DIR}/silence-ids. Token and per-cell failures only print a warning so
+# that an unreachable RHOBS endpoint does not fail the job.
+set -o nounset
+set -o errexit
+set -o pipefail
+
+RHOBS_ENV="${RHOBS_ENV:-production}"
+SILENCE_MATCHER_NAME="${SILENCE_MATCHER_NAME:-_id}"
+SILENCE_MATCHER_VALUE="${SILENCE_MATCHER_VALUE:-cs-ci-.*}"
+SILENCE_DURATION_HOURS="${SILENCE_DURATION_HOURS:-6}"
+
+case "$RHOBS_ENV" in
+  production)
+    CELLS=(
+      "https://us-east-1-0.rhobs.api.openshift.com"
+      "https://us-east-1-1.rhobs.api.openshift.com"
+      "https://us-east-1-2.rhobs.api.openshift.com"
+      "https://us-west-2-0.rhobs.api.openshift.com"
+      "https://eu-west-1-0.rhobs.api.openshift.com"
+      "https://eu-central-1-0.rhobs.api.openshift.com"
+      "https://sa-east-1-0.rhobs.api.openshift.com"
+      "https://ap-northeast-1-0.rhobs.api.openshift.com"
+      "https://ap-southeast-2-0.rhobs.api.openshift.com"
+    )
+    ;;
+  staging)
+    CELLS=(
+      "https://us-east-1-0.rhobs.api.stage.openshift.com"
+      "https://us-west-2-0.rhobs.api.stage.openshift.com"
+    )
+    ;;
+  *)
+    echo "ERROR: RHOBS_ENV must be production or staging"
+    exit 1
+    ;;
+esac
+
+# client_id and client_secret are required (a missing file aborts the step via
+# errexit); only the issuer URL is optional and falls back to Red Hat SSO.
+CLIENT_ID=$(cat /usr/local/rhobs-oidc/client_id)
+CLIENT_SECRET=$(cat /usr/local/rhobs-oidc/client_secret)
+ISSUER_URL=$(cat /usr/local/rhobs-oidc/oidc_issuer_url 2>/dev/null || echo "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token")
+
+# Bound the token request so a stalled SSO endpoint cannot consume the step timeout.
+TOKEN=$(curl -sf --max-time 30 -X POST "$ISSUER_URL" \
+  -d "grant_type=client_credentials" \
+  -d "client_id=$CLIENT_ID" \
+  -d "client_secret=$CLIENT_SECRET" | python3 -c "import sys,json; print(json.load(sys.stdin)['access_token'])" 2>/dev/null) || {
+  echo "WARNING: Failed to get RHOBS token, skipping silence creation"
+  exit 0
+}
+
+START=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z")
+# The duration is passed as an argument rather than spliced into the Python source.
+END=$(python3 -c '
+import sys
+from datetime import datetime, timedelta, timezone
+end = datetime.now(timezone.utc) + timedelta(hours=float(sys.argv[1]))
+print(end.strftime("%Y-%m-%dT%H:%M:%S.000Z"))
+' "$SILENCE_DURATION_HOURS")
+
+JOB_URL="https://prow.ci.openshift.org/view/gs/test-platform-results/"
+if [[ -n "${PULL_NUMBER:-}" ]]; then
+  JOB_URL="${JOB_URL}pr-logs/pull/${REPO_OWNER:-}_${REPO_NAME:-}/${PULL_NUMBER}/${JOB_NAME:-unknown}/${BUILD_ID:-0}"
+else
+  JOB_URL="${JOB_URL}logs/${JOB_NAME:-unknown}/${BUILD_ID:-0}"
+fi
+
+COMMENT="ROSAENG-60057: Silencing ${SILENCE_MATCHER_NAME}=~${SILENCE_MATCHER_VALUE} for FVT job ${JOB_URL}"
+
+# Serialize the request body with json.dumps so that quotes or backslashes in the
+# matcher or comment (for example a regex such as cs-ci-\d+) cannot produce
+# invalid JSON. The body is identical for every cell, so it is built once.
+PAYLOAD=$(python3 -c '
+import json, sys
+name, value, start, end, comment = sys.argv[1:6]
+print(json.dumps({
+    "matchers": [{"name": name, "value": value, "isRegex": True, "isEqual": True}],
+    "startsAt": start,
+    "endsAt": end,
+    "createdBy": "rosa-ci-prow",
+    "comment": comment,
+}))
+' "$SILENCE_MATCHER_NAME" "$SILENCE_MATCHER_VALUE" "$START" "$END" "$COMMENT")
+
+echo "Creating silences on ${#CELLS[@]} ${RHOBS_ENV} RHOBS cells"
+echo " Matcher: ${SILENCE_MATCHER_NAME} =~ ${SILENCE_MATCHER_VALUE}"
+echo " Duration: ${SILENCE_DURATION_HOURS}h (${START} -> ${END})"
+
+# Start from an empty file so only silences created by this run are recorded.
+: > "${SHARED_DIR}/silence-ids"
+
+CREATED=0
+for CELL in "${CELLS[@]}"; do
+  SILENCE_URL="${CELL}/api/metrics/v1/hcp/am/api/v2/silences"
+
+  RESULT=$(curl -sf --max-time 10 \
+    -X POST \
+    -H "Authorization: Bearer $TOKEN" \
+    -H "Content-Type: application/json" \
+    "$SILENCE_URL" \
+    -d "$PAYLOAD" 2>/dev/null) || {
+    echo " WARNING: Failed to create silence on ${CELL}"
+    continue
+  }
+
+  # A response that is not a JSON object must not abort the script under
+  # errexit/pipefail; treat it as "no ID returned" and move on to the next cell.
+  SILENCE_ID=$(echo "$RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('silenceID',''))" 2>/dev/null) || SILENCE_ID=""
+
+  if [[ -n "$SILENCE_ID" ]]; then
+    echo " ${CELL}: ${SILENCE_ID}"
+    echo "${CELL}|${SILENCE_ID}" >> "${SHARED_DIR}/silence-ids"
+    CREATED=$((CREATED + 1))
+  else
+    echo " WARNING: No silence ID returned from ${CELL}"
+  fi
+done
+
+echo "Created ${CREATED}/${#CELLS[@]} silences"
diff --git a/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.metadata.json b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.metadata.json
new file mode 100644
index 0000000000000..f6bce973475bd
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.metadata.json
@@ -0,0 +1,23 @@
+{
+	"path": "rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.yaml",
+	"owners": {
+		"approvers": [
+			"bmeng",
+			"dustman9000",
+			"gdbranco",
+			"jfrazierredhat",
+			"lucasponce",
+			"ravitri",
+			"tiwillia"
+		],
+		"reviewers": [
+			"bmeng",
+			"dustman9000",
+			"gdbranco",
+			"jfrazierredhat",
+			"lucasponce",
+			"ravitri",
+			"tiwillia"
+		]
+	}
+}
\ No newline at end of file
diff --git a/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.yaml b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.yaml
new file mode 100644
index 0000000000000..eacdcd9f4a68a
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.yaml
@@ -0,0 +1,43 @@
+ref:
+  as: rosa-e2e-silence-alerts
+  from_image:
+    namespace: ocp
+    name: "4.18"
+    tag: cli
+  commands: rosa-e2e-silence-alerts-commands.sh
+  resources:
+    requests:
+      cpu: 10m
+      memory: 50Mi
+  timeout: 3m0s
+  grace_period: 30s
+  credentials:
+  - namespace: ci
+    name: rhobs-oidc-production
+    mount_path: /usr/local/rhobs-oidc
+  env:
+  - name: RHOBS_ENV
+    default: production
+    documentation: |-
+      RHOBS environment to create silences on.
+      Valid values: production, staging.
+  - name: SILENCE_MATCHER_NAME
+    default: _id
+    documentation: |-
+      Alert label name to match for silencing.
+  - name: SILENCE_MATCHER_VALUE
+    default: "cs-ci-.*"
+    documentation: |-
+      Regex pattern to match against the label. All alerts matching
+      this pattern will be silenced for the duration of the job.
+  - name: SILENCE_DURATION_HOURS
+    default: "6"
+    documentation: |-
+      Duration of the silence in hours. Should match or slightly exceed
+      the job timeout to ensure alerts stay silenced for the full run.
+  documentation: |-
+    Creates alertmanager silences on all RHOBS cells for the specified
+    environment, matching alerts by a regex pattern on a label (default:
+    _id =~ "cs-ci-.*"). Used to suppress alerts from FVT test clusters
+    during production CI runs. Silence IDs are saved to SHARED_DIR for
+    cleanup by the rosa-e2e-unsilence-alerts post-step.
diff --git a/ci-operator/step-registry/rosa/e2e/unsilence-alerts/OWNERS b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/OWNERS
new file mode 100644
index 0000000000000..91b6e6a61b26e
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/OWNERS
@@ -0,0 +1,16 @@
+approvers:
+- bmeng
+- dustman9000
+- gdbranco
+- jfrazierredhat
+- lucasponce
+- ravitri
+- tiwillia
+reviewers:
+- bmeng
+- dustman9000
+- gdbranco
+- jfrazierredhat
+- lucasponce
+- ravitri
+- tiwillia
diff --git a/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-commands.sh b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-commands.sh
new file mode 100644
index 0000000000000..c33b8a645d03b
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-commands.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Expires the Alertmanager silences listed in ${SHARED_DIR}/silence-ids, one
+# "CELL|SILENCE_ID" entry per line. Token and per-silence failures only print a
+# warning; a silence that is not expired here is left to expire naturally.
+set -o nounset
+set -o errexit
+set -o pipefail
+
+SILENCE_FILE="${SHARED_DIR}/silence-ids"
+
+# -s is false for a missing file as well as for an empty one.
+if [[ ! -s "$SILENCE_FILE" ]]; then
+  echo "No silences to expire (file missing or empty)"
+  exit 0
+fi
+
+CLIENT_ID=$(cat /usr/local/rhobs-oidc/client_id)
+CLIENT_SECRET=$(cat /usr/local/rhobs-oidc/client_secret)
+ISSUER_URL=$(cat /usr/local/rhobs-oidc/oidc_issuer_url 2>/dev/null || echo "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token")
+
+# Bound the token request so a stalled SSO endpoint cannot consume the step timeout.
+TOKEN=$(curl -sf --max-time 30 -X POST "$ISSUER_URL" \
+  -d "grant_type=client_credentials" \
+  -d "client_id=$CLIENT_ID" \
+  -d "client_secret=$CLIENT_SECRET" | python3 -c "import sys,json; print(json.load(sys.stdin)['access_token'])" 2>/dev/null) || {
+  echo "WARNING: Failed to get RHOBS token, silences will expire naturally"
+  exit 0
+}
+
+EXPIRED=0
+TOTAL=0
+
+# The "|| [[ -n ... ]]" clause still processes a final entry that lacks a trailing newline.
+while IFS='|' read -r CELL SILENCE_ID || [[ -n "$CELL" ]]; do
+  [[ -z "$CELL" || -z "$SILENCE_ID" ]] && continue
+  TOTAL=$((TOTAL + 1))
+
+  if curl -sf --max-time 10 \
+    -X DELETE \
+    -H "Authorization: Bearer $TOKEN" \
+    "${CELL}/api/metrics/v1/hcp/am/api/v2/silence/${SILENCE_ID}" 2>/dev/null; then
+    echo " Expired: ${CELL} ${SILENCE_ID}"
+    EXPIRED=$((EXPIRED + 1))
+  else
+    echo " WARNING: Failed to expire ${SILENCE_ID} on ${CELL}"
+  fi
+done < "$SILENCE_FILE"
+
+echo "Expired ${EXPIRED}/${TOTAL} silences"
diff --git a/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.metadata.json b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.metadata.json
new file mode 100644
index 0000000000000..453a6d3e5b8ef
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.metadata.json
@@ -0,0 +1,23 @@
+{
+	"path": "rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.yaml",
+	"owners": {
+		"approvers": [
+			"bmeng",
+			"dustman9000",
+			"gdbranco",
+			"jfrazierredhat",
+			"lucasponce",
+			"ravitri",
+			"tiwillia"
+		],
+		"reviewers": [
+			"bmeng",
+			"dustman9000",
+			"gdbranco",
+			"jfrazierredhat",
+			"lucasponce",
+			"ravitri",
+			"tiwillia"
+		]
+	}
+}
\ No newline at end of file
diff --git a/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.yaml b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.yaml
new file mode 100644
index 0000000000000..a6959534967b5
--- /dev/null
+++ b/ci-operator/step-registry/rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.yaml
@@ -0,0 +1,22 @@
+ref:
+  as: rosa-e2e-unsilence-alerts
+  best_effort: true
+  from_image:
+    namespace: ocp
+    name: "4.18"
+    tag: cli
+  commands: rosa-e2e-unsilence-alerts-commands.sh
+  resources:
+    requests:
+      cpu: 10m
+      memory: 50Mi
+  timeout: 3m0s
+  grace_period: 30s
+  credentials:
+  - namespace: ci
+    name: rhobs-oidc-production
+    mount_path: /usr/local/rhobs-oidc
+  documentation: |-
+    Expires alertmanager silences created by the rosa-e2e-silence-alerts
+    pre-step. Reads silence IDs from SHARED_DIR/silence-ids and DELETEs
+    each one via the RHOBS gateway API.