Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ tests:
cron: 0 3 * * 1
nested_podman: true
steps:
allow_best_effort_post_steps: true
env:
OCM_FVT_JOB_NAME: cs-rosa-hcp-ad-production-main
OCM_FVT_OCM_ENV: production
post:
- ref: rosa-e2e-unsilence-alerts
pre:
- ref: rosa-e2e-record-start-time
- ref: rosa-e2e-silence-alerts
Comment thread
coderabbitai[bot] marked this conversation as resolved.
test:
- ref: rosa-e2e-ocm-fvt
timeout: 5h0m0s
Expand Down
16 changes: 16 additions & 0 deletions ci-operator/step-registry/rosa/e2e/silence-alerts/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Owners of the rosa-e2e-silence-alerts step
# (ci-operator/step-registry/rosa/e2e/silence-alerts).
# The approvers and reviewers lists below are identical.
approvers:
- bmeng
- dustman9000
- gdbranco
- jfrazierredhat
- lucasponce
- ravitri
- tiwillia
reviewers:
- bmeng
- dustman9000
- gdbranco
- jfrazierredhat
- lucasponce
- ravitri
- tiwillia
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash
#
# Pre-step: creates Alertmanager silences on every RHOBS cell of the selected
# environment so that alerts matching a label regex stay muted during the job.
#
# Optional environment inputs:
#   RHOBS_ENV               production (default) or staging
#   SILENCE_MATCHER_NAME    alert label to match          (default: _id)
#   SILENCE_MATCHER_VALUE   regex applied to that label   (default: cs-ci-.*)
#   SILENCE_DURATION_HOURS  lifetime of the silence       (default: 6)
set -o errexit -o nounset -o pipefail

# Fill in defaults for anything the caller left unset or empty.
: "${RHOBS_ENV:=production}"
: "${SILENCE_MATCHER_NAME:=_id}"
: "${SILENCE_MATCHER_VALUE:=cs-ci-.*}"
: "${SILENCE_DURATION_HOURS:=6}"

# Pick the cell names and DNS suffix for the requested environment; anything
# other than production/staging is a configuration error.
if [[ "${RHOBS_ENV}" == "production" ]]; then
  cell_domain="rhobs.api.openshift.com"
  cell_names=(
    us-east-1-0
    us-east-1-1
    us-east-1-2
    us-west-2-0
    eu-west-1-0
    eu-central-1-0
    sa-east-1-0
    ap-northeast-1-0
    ap-southeast-2-0
  )
elif [[ "${RHOBS_ENV}" == "staging" ]]; then
  cell_domain="rhobs.api.stage.openshift.com"
  cell_names=(
    us-east-1-0
    us-west-2-0
  )
else
  echo "ERROR: RHOBS_ENV must be production or staging"
  exit 1
fi

# CELLS holds the base URL of every RHOBS cell that will receive a silence.
CELLS=()
for cell_name in "${cell_names[@]}"; do
  CELLS+=("https://${cell_name}.${cell_domain}")
done

# Obtain an OAuth2 client-credentials access token for the RHOBS gateway.
# Client id/secret come from the mounted credential files; the token endpoint
# falls back to the Red Hat SSO URL when the oidc_issuer_url file is unreadable.
CLIENT_ID=$(cat /usr/local/rhobs-oidc/client_id)
CLIENT_SECRET=$(cat /usr/local/rhobs-oidc/client_secret)
ISSUER_URL=$(cat /usr/local/rhobs-oidc/oidc_issuer_url 2>/dev/null || echo "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token")

# --data-urlencode: the body is application/x-www-form-urlencoded, so a
#   credential containing '&', '+', '=' or '%' must be percent-encoded or the
#   server sees a different value than the one stored in the secret.
# --max-time: bound the request so an unresponsive SSO endpoint cannot stall
#   the step.
# Token retrieval is best-effort: on any failure warn and exit 0 so the job
# continues without silences.
TOKEN=$(curl -sf --max-time 30 -X POST "$ISSUER_URL" \
  --data-urlencode "grant_type=client_credentials" \
  --data-urlencode "client_id=$CLIENT_ID" \
  --data-urlencode "client_secret=$CLIENT_SECRET" \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['access_token'])" 2>/dev/null) || {
  echo "WARNING: Failed to get RHOBS token, skipping silence creation"
  exit 0
}

# Compute the silence window from ONE clock reading so that END - START is
# exactly the requested duration. The duration is handed to Python as an
# argument (parsed with float()) rather than spliced into the Python source,
# so a malformed value fails with a clear error instead of being executed as
# code. The ":-6" fallback only matters if this section runs without the
# defaults set earlier in the script.
WINDOW=$(python3 -c '
import sys
from datetime import datetime, timedelta, timezone

fmt = "%Y-%m-%dT%H:%M:%S.000Z"
start = datetime.now(timezone.utc).replace(microsecond=0)
end = start + timedelta(hours=float(sys.argv[1]))
print(start.strftime(fmt), end.strftime(fmt))
' "${SILENCE_DURATION_HOURS:-6}")
START=${WINDOW%% *}
END=${WINDOW##* }

# Link back to the Prow job that requested the silence. Presubmit runs
# (PULL_NUMBER set) live under pr-logs/, everything else under logs/.
# JOB_NAME and BUILD_ID get the same fallbacks in both branches so an unset
# variable cannot abort the script under nounset.
JOB_URL="https://prow.ci.openshift.org/view/gs/test-platform-results/"
if [[ -n "${PULL_NUMBER:-}" ]]; then
  JOB_URL="${JOB_URL}pr-logs/pull/${REPO_OWNER:-}_${REPO_NAME:-}/${PULL_NUMBER}/${JOB_NAME:-unknown}/${BUILD_ID:-0}"
else
  JOB_URL="${JOB_URL}logs/${JOB_NAME:-unknown}/${BUILD_ID:-0}"
fi

# Free-text comment stored on each silence.
COMMENT="ROSAENG-60057: Silencing ${SILENCE_MATCHER_NAME}=~${SILENCE_MATCHER_VALUE} for FVT job ${JOB_URL}"

echo "Creating silences on ${#CELLS[@]} ${RHOBS_ENV} RHOBS cells"
echo " Matcher: ${SILENCE_MATCHER_NAME} =~ ${SILENCE_MATCHER_VALUE}"
echo " Duration: ${SILENCE_DURATION_HOURS}h (${START} -> ${END})"

#######################################
# Render the Alertmanager v2 silence request body as JSON.
# Serialising with json.dumps keeps the body valid when the matcher regex or
# the comment contains quotes or backslashes (e.g. "cs-ci-\d+").
# Arguments:
#   $1 - matcher label name
#   $2 - matcher regex
#   $3 - startsAt timestamp
#   $4 - endsAt timestamp
#   $5 - comment
# Outputs: a single JSON object on stdout
# Returns: non-zero if python3 fails or fewer than five arguments are given
#######################################
build_silence_payload() {
  python3 -c '
import json
import sys

name, value, starts_at, ends_at, comment = sys.argv[1:6]
print(json.dumps({
    "matchers": [{
        "name": name,
        "value": value,
        "isRegex": True,
        "isEqual": True,
    }],
    "startsAt": starts_at,
    "endsAt": ends_at,
    "createdBy": "rosa-ci-prow",
    "comment": comment,
}))
' "$@"
}

# Start from an empty record file; one "<cell>|<silence id>" line is appended
# per silence that is successfully created.
: > "${SHARED_DIR}/silence-ids"

# The request body is identical for every cell, so build it once.
PAYLOAD=$(build_silence_payload \
  "$SILENCE_MATCHER_NAME" "$SILENCE_MATCHER_VALUE" "$START" "$END" "$COMMENT")

CREATED=0
for CELL in "${CELLS[@]}"; do
  SILENCE_URL="${CELL}/api/metrics/v1/hcp/am/api/v2/silences"

  # Best-effort per cell: a failed request is logged and the loop moves on.
  RESULT=$(curl -sf --max-time 10 \
    -X POST \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json" \
    "$SILENCE_URL" \
    -d "$PAYLOAD" 2>/dev/null) || {
    echo " WARNING: Failed to create silence on ${CELL}"
    continue
  }

  # A response that is not the expected JSON object must not abort the whole
  # step under errexit/pipefail; treat it as "no ID returned" instead.
  SILENCE_ID=$(printf '%s' "$RESULT" \
    | python3 -c "import sys,json; print(json.load(sys.stdin).get('silenceID',''))" 2>/dev/null) \
    || SILENCE_ID=""

  if [[ -n "$SILENCE_ID" ]]; then
    echo " ${CELL}: ${SILENCE_ID}"
    echo "${CELL}|${SILENCE_ID}" >> "${SHARED_DIR}/silence-ids"
    CREATED=$((CREATED + 1))
  else
    echo " WARNING: No silence ID returned from ${CELL}"
  fi
done

echo "Created ${CREATED}/${#CELLS[@]} silences"
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"path": "rosa/e2e/silence-alerts/rosa-e2e-silence-alerts-ref.yaml",
"owners": {
"approvers": [
"bmeng",
"dustman9000",
"gdbranco",
"jfrazierredhat",
"lucasponce",
"ravitri",
"tiwillia"
],
"reviewers": [
"bmeng",
"dustman9000",
"gdbranco",
"jfrazierredhat",
"lucasponce",
"ravitri",
"tiwillia"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Step-registry reference for the pre-step that creates RHOBS Alertmanager
# silences (implemented by rosa-e2e-silence-alerts-commands.sh).
ref:
as: rosa-e2e-silence-alerts
# NOTE(review): the commands script invokes curl and python3 — confirm the
# image selected here ships both.
from_image:
namespace: ocp
name: "4.18"
tag: cli
commands: rosa-e2e-silence-alerts-commands.sh
resources:
requests:
cpu: 10m
memory: 50Mi
# The script contacts the cells one after another with curl --max-time 10 per
# cell, so this limit has to cover the token request plus every cell.
timeout: 3m0s
grace_period: 30s
# The commands script reads client_id, client_secret and (optionally)
# oidc_issuer_url from files under this mount path.
# NOTE(review): the production secret is mounted regardless of RHOBS_ENV —
# confirm it is also accepted by the staging cells, or mount a staging secret.
credentials:
- namespace: ci
name: rhobs-oidc-production
mount_path: /usr/local/rhobs-oidc
# Defaults below mirror the fallbacks hard-coded in the commands script.
env:
- name: RHOBS_ENV
default: production
documentation: |-
RHOBS environment to create silences on.
Valid values: production, staging.
- name: SILENCE_MATCHER_NAME
default: _id
documentation: |-
Alert label name to match for silencing.
- name: SILENCE_MATCHER_VALUE
default: "cs-ci-.*"
documentation: |-
Regex pattern to match against the label. All alerts matching
this pattern will be silenced for the duration of the job.
- name: SILENCE_DURATION_HOURS
default: "6"
documentation: |-
Duration of the silence in hours. Should match or slightly exceed
the job timeout to ensure alerts stay silenced for the full run.
documentation: |-
Creates alertmanager silences on all RHOBS cells for the specified
environment, matching alerts by a regex pattern on a label (default:
_id =~ "cs-ci-.*"). Used to suppress alerts from FVT test clusters
during production CI runs. Silence IDs are saved to SHARED_DIR for
cleanup by the rosa-e2e-unsilence-alerts post-step.
16 changes: 16 additions & 0 deletions ci-operator/step-registry/rosa/e2e/unsilence-alerts/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Owners of the rosa-e2e-unsilence-alerts step
# (ci-operator/step-registry/rosa/e2e/unsilence-alerts).
# The approvers and reviewers lists below are identical.
approvers:
- bmeng
- dustman9000
- gdbranco
- jfrazierredhat
- lucasponce
- ravitri
- tiwillia
reviewers:
- bmeng
- dustman9000
- gdbranco
- jfrazierredhat
- lucasponce
- ravitri
- tiwillia
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Post-step: expires the Alertmanager silences listed in
# ${SHARED_DIR}/silence-ids (one "<cell url>|<silence id>" pair per line).
set -o errexit -o nounset -o pipefail

SILENCE_FILE="${SHARED_DIR}/silence-ids"

# Proceed only when the record is a regular file with content; otherwise there
# is nothing to clean up and the step succeeds immediately.
if ! [[ -f "${SILENCE_FILE}" && -s "${SILENCE_FILE}" ]]; then
  echo "No silences to expire (file missing or empty)"
  exit 0
fi

# Obtain an OAuth2 client-credentials access token for the RHOBS gateway.
# Client id/secret come from the mounted credential files; the token endpoint
# falls back to the Red Hat SSO URL when the oidc_issuer_url file is unreadable.
CLIENT_ID=$(cat /usr/local/rhobs-oidc/client_id)
CLIENT_SECRET=$(cat /usr/local/rhobs-oidc/client_secret)
ISSUER_URL=$(cat /usr/local/rhobs-oidc/oidc_issuer_url 2>/dev/null || echo "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token")

# --data-urlencode: the body is application/x-www-form-urlencoded, so a
#   credential containing '&', '+', '=' or '%' must be percent-encoded or the
#   server sees a different value than the one stored in the secret.
# --max-time: bound the request so an unresponsive SSO endpoint cannot stall
#   the step.
# Cleanup is best-effort: without a token the silences are simply left to
# reach their own end time, and the step still succeeds.
TOKEN=$(curl -sf --max-time 30 -X POST "$ISSUER_URL" \
  --data-urlencode "grant_type=client_credentials" \
  --data-urlencode "client_id=$CLIENT_ID" \
  --data-urlencode "client_secret=$CLIENT_SECRET" \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['access_token'])" 2>/dev/null) || {
  echo "WARNING: Failed to get RHOBS token, silences will expire naturally"
  exit 0
}

# Walk the recorded "<cell url>|<silence id>" pairs and DELETE each silence
# through the cell's Alertmanager API. Failures are logged and skipped, so one
# unreachable cell does not prevent the others from being cleaned up.
TOTAL=0
EXPIRED=0

while IFS='|' read -r cell_url silence_id; do
  # Ignore blank or malformed lines (either field missing).
  if [[ -z "${cell_url}" || -z "${silence_id}" ]]; then
    continue
  fi
  TOTAL=$((TOTAL + 1))

  if ! curl -sf --max-time 10 \
    -X DELETE \
    -H "Authorization: Bearer $TOKEN" \
    "${cell_url}/api/metrics/v1/hcp/am/api/v2/silence/${silence_id}" 2>/dev/null; then
    echo " WARNING: Failed to expire ${silence_id} on ${cell_url}"
    continue
  fi

  echo " Expired: ${cell_url} ${silence_id}"
  EXPIRED=$((EXPIRED + 1))
done < "$SILENCE_FILE"

echo "Expired ${EXPIRED}/${TOTAL} silences"
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"path": "rosa/e2e/unsilence-alerts/rosa-e2e-unsilence-alerts-ref.yaml",
"owners": {
"approvers": [
"bmeng",
"dustman9000",
"gdbranco",
"jfrazierredhat",
"lucasponce",
"ravitri",
"tiwillia"
],
"reviewers": [
"bmeng",
"dustman9000",
"gdbranco",
"jfrazierredhat",
"lucasponce",
"ravitri",
"tiwillia"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Step-registry reference for the post-step that expires the silences created
# by rosa-e2e-silence-alerts (implemented by
# rosa-e2e-unsilence-alerts-commands.sh).
ref:
as: rosa-e2e-unsilence-alerts
# NOTE(review): the commands script invokes curl and python3 — confirm the
# image selected here ships both.
from_image:
namespace: ocp
name: "4.18"
tag: cli
commands: rosa-e2e-unsilence-alerts-commands.sh
resources:
requests:
cpu: 10m
memory: 50Mi
# The script issues one DELETE per recorded silence with curl --max-time 10,
# so this limit has to cover the token request plus every recorded silence.
timeout: 3m0s
grace_period: 30s
# The commands script reads client_id, client_secret and (optionally)
# oidc_issuer_url from files under this mount path.
credentials:
- namespace: ci
name: rhobs-oidc-production
mount_path: /usr/local/rhobs-oidc
documentation: |-
Expires alertmanager silences created by the rosa-e2e-silence-alerts
pre-step. Reads silence IDs from SHARED_DIR/silence-ids and DELETEs
each one via the RHOBS gateway API.