From 2f8bb96335d562ff15d3ea93d979fa978ce3cdfd Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Thu, 30 Apr 2026 11:23:51 +0200 Subject: [PATCH 1/8] =?UTF-8?q?adventure:=20=F0=9F=A7=AA=20Blind=20by=20De?= =?UTF-8?q?sign=20=E2=80=94=20=F0=9F=94=B4=20Expert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the OpenTelemetry meter provider, register the OpenFeature MetricsHook + TracesHook, author a ContextSpanHook that copies the merged evaluation context onto Tempo spans, then diagnose and roll back a misbehaving fractional rollout (vision_amplifier_v2) on the Grafana LGTM dashboard โ€” no redeploy. Replaces the placeholder expert.md stub with the full level doc, ships the Expert solution walkthrough, broken-state code (including the dashboard JSON and k6 loadgen), verify.sh, and devcontainer. Stacked on top of #43 (๐ŸŸก Intermediate). Review #42 then #43 first. This is the last PR in the series, so it closes the tracking issue. Closes #41 Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../devcontainer.json | 41 +++ .../docker-compose.yml | 69 +++++ .../post-create.sh | 31 ++ .../post-start.sh | 52 ++++ .../planned/00-blind-by-design/docs/expert.md | 292 ++++++++++++++++++ .../docs/solutions/expert.md | 280 +++++++++++++++++ .../.mvn/wrapper/maven-wrapper.properties | 1 + .../expert/.vscode/launch.json | 14 + .../expert/.vscode/tasks.json | 14 + .../expert/dashboards/feature-flags.json | 135 ++++++++ .../00-blind-by-design/expert/flags.json | 46 +++ .../expert/loadgen/k6/script.js | 63 ++++ .../planned/00-blind-by-design/expert/mvnw | 259 ++++++++++++++++ .../00-blind-by-design/expert/mvnw.cmd | 149 +++++++++ .../planned/00-blind-by-design/expert/pom.xml | 101 ++++++ .../openfeature/demo/java/demo/AuditHook.java | 53 ++++ .../demo/java/demo/Laboratory.java | 13 + .../demo/java/demo/OpenFeatureConfig.java | 74 +++++ .../demo/java/demo/OpenTelemetryConfig.java | 73 +++++ 
.../demo/java/demo/SpeciesInterceptor.java | 45 +++ .../dev/openfeature/demo/java/demo/Trial.java | 59 ++++ .../src/main/resources/application.properties | 11 + .../00-blind-by-design/expert/verify.sh | 227 ++++++++++++++ 23 files changed, 2102 insertions(+) create mode 100644 .devcontainer/00-blind-by-design_03-expert/devcontainer.json create mode 100644 .devcontainer/00-blind-by-design_03-expert/docker-compose.yml create mode 100755 .devcontainer/00-blind-by-design_03-expert/post-create.sh create mode 100755 .devcontainer/00-blind-by-design_03-expert/post-start.sh create mode 100644 adventures/planned/00-blind-by-design/docs/expert.md create mode 100644 adventures/planned/00-blind-by-design/docs/solutions/expert.md create mode 100644 adventures/planned/00-blind-by-design/expert/.mvn/wrapper/maven-wrapper.properties create mode 100644 adventures/planned/00-blind-by-design/expert/.vscode/launch.json create mode 100644 adventures/planned/00-blind-by-design/expert/.vscode/tasks.json create mode 100644 adventures/planned/00-blind-by-design/expert/dashboards/feature-flags.json create mode 100644 adventures/planned/00-blind-by-design/expert/flags.json create mode 100644 adventures/planned/00-blind-by-design/expert/loadgen/k6/script.js create mode 100755 adventures/planned/00-blind-by-design/expert/mvnw create mode 100644 adventures/planned/00-blind-by-design/expert/mvnw.cmd create mode 100644 adventures/planned/00-blind-by-design/expert/pom.xml create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/AuditHook.java create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Laboratory.java create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java create mode 100644 
adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/SpeciesInterceptor.java create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Trial.java create mode 100644 adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties create mode 100755 adventures/planned/00-blind-by-design/expert/verify.sh diff --git a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json new file mode 100644 index 00000000..44fb84e5 --- /dev/null +++ b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json @@ -0,0 +1,41 @@ +{ + "name": "๐Ÿงช Adventure 00 | ๐Ÿ”ด Expert (Phase 3 โ€” read the chart)", + "dockerComposeFile": "docker-compose.yml", + "service": "workspace", + "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}/adventures/planned/00-blind-by-design/expert", + "postCreateCommand": "bash /workspaces/${localWorkspaceFolderBasename}/.devcontainer/00-blind-by-design_03-expert/post-create.sh", + "postStartCommand": "bash /workspaces/${localWorkspaceFolderBasename}/.devcontainer/00-blind-by-design_03-expert/post-start.sh", + "customizations": { + "vscode": { + "extensions": [ + "vscjava.vscode-java-pack", + "redhat.vscode-yaml", + "ms-azuretools.vscode-docker" + ] + }, + "codespaces": { + "openFiles": [ + "adventures/planned/00-blind-by-design/docs/expert.md", + "adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java", + "adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java", + "adventures/planned/00-blind-by-design/expert/flags.json" + ] + } + }, + "forwardPorts": [8080, 3000, 4317, 4318, 9090, 3200, 8013, 8014, 8015, 8016], + "portsAttributes": { + "8080": { "label": "Spring Boot lab", "onAutoForward": "notify" }, + "3000": { "label": "Grafana", "onAutoForward": 
"notify" }, + "4317": { "label": "OTLP gRPC", "onAutoForward": "ignore" }, + "4318": { "label": "OTLP HTTP", "onAutoForward": "ignore" }, + "9090": { "label": "Prometheus", "onAutoForward": "ignore" }, + "3200": { "label": "Tempo HTTP API", "onAutoForward": "ignore" }, + "8013": { "label": "flagd gRPC eval", "onAutoForward": "ignore" }, + "8014": { "label": "flagd management/metrics", "onAutoForward": "ignore" }, + "8015": { "label": "flagd sync (IN_PROCESS)", "onAutoForward": "ignore" }, + "8016": { "label": "flagd OFREP", "onAutoForward": "ignore" } + }, + "otherPortsAttributes": { + "onAutoForward": "ignore" + } +} diff --git a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml new file mode 100644 index 00000000..842b3fce --- /dev/null +++ b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml @@ -0,0 +1,69 @@ +# Multi-container devcontainer for Expert. The lab itself runs in +# `workspace`; flagd, the Grafana LGTM stack, and the k6 loadgen run as +# sibling services. No Docker-in-Docker โ€” the devcontainer attaches to +# `workspace` and the rest of the stack is already up. +# +# Inside `workspace`, services are reachable by service name +# (flagd:8013, lgtm:4317, etc.). FLAGD_HOST and OTEL_EXPORTER_OTLP_ENDPOINT +# are pre-set so the participant does not have to hard-code hostnames. +# Codespaces also forwards each port to localhost on the host so verify.sh +# and curl can keep using localhost:NNNN unchanged. + +services: + workspace: + image: mcr.microsoft.com/devcontainers/java:1-21 + volumes: + - ../..:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}:cached + command: sleep infinity + environment: + - FLAGD_HOST=flagd + - FLAGD_PORT=8013 + - OTEL_EXPORTER_OTLP_ENDPOINT=http://lgtm:4317 + - OTEL_EXPORTER_OTLP_PROTOCOL=grpc + - OTEL_SERVICE_NAME=fun-with-flags-java-spring + # Trial country of registration. 
Read by OpenFeatureConfig via + # System.getenv("COUNTRY") and put on the global eval context. + - COUNTRY=de + + flagd: + image: ghcr.io/open-feature/flagd:latest + volumes: + - ../..:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}:ro + command: + - start + - --uri + - file:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}/adventures/planned/00-blind-by-design/expert/flags.json + ports: + - "8013:8013" + - "8014:8014" + - "8015:8015" + - "8016:8016" + + lgtm: + image: grafana/otel-lgtm:latest + ports: + - "3000:3000" # Grafana UI (admin / admin) + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "9090:9090" # Prometheus query API (verify.sh) + - "3200:3200" # Tempo HTTP API (verify.sh) + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - ../..:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}:ro + - ../../adventures/planned/00-blind-by-design/expert/dashboards:/otel-lgtm/grafana/dashboards:ro + + loadgen: + image: grafana/k6:latest + command: ["run", "--quiet", "/scripts/script.js"] + volumes: + - ../../adventures/planned/00-blind-by-design/expert/loadgen/k6:/scripts:ro + environment: + # The script idles while loadgen_active is "off". Flip it in flags.json + # to start hammering the lab. + - BASE_URL=http://workspace:8080 + - FLAGD_URL=http://flagd:8013 + restart: unless-stopped + depends_on: + - flagd diff --git a/.devcontainer/00-blind-by-design_03-expert/post-create.sh b/.devcontainer/00-blind-by-design_03-expert/post-create.sh new file mode 100755 index 00000000..ee115d49 --- /dev/null +++ b/.devcontainer/00-blind-by-design_03-expert/post-create.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -e + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" + +# shellcheck disable=SC1091 +source "$REPO_ROOT/lib/scripts/tracker.sh" +set_tracking_context "00-blind-by-design" "expert" +track_codespace_created + +# gum is used by the verify.sh / output.sh helpers +"$REPO_ROOT/lib/shared/init.sh" --version v0.17.0 # https://github.com/charmbracelet/gum/releases + +# jq is needed by verify.sh; the Java devcontainer image is debian-based. +if ! command -v jq >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends jq +fi + +CHALLENGE_DIR="$REPO_ROOT/adventures/planned/00-blind-by-design/expert" + +# Make the Maven wrapper executable so the participant can just `./mvnw ...` +if [[ -f "$CHALLENGE_DIR/mvnw" ]]; then + chmod +x "$CHALLENGE_DIR/mvnw" +fi + +echo "โœจ Pre-warming the Maven dependency cache so the first ./mvnw is fast..." +( cd "$CHALLENGE_DIR" && ./mvnw -q -DskipTests dependency:go-offline ) || \ + echo "โš ๏ธ Dependency pre-warm skipped (network or wrapper not ready yet)" + +echo "โœ… Phase 3 toolchain ready (gum + Java 21). flagd / lgtm / loadgen run as sibling devcontainer services." diff --git a/.devcontainer/00-blind-by-design_03-expert/post-start.sh b/.devcontainer/00-blind-by-design_03-expert/post-start.sh new file mode 100755 index 00000000..454d9b5e --- /dev/null +++ b/.devcontainer/00-blind-by-design_03-expert/post-start.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -e + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +CHALLENGE_DIR="$REPO_ROOT/adventures/planned/00-blind-by-design/expert" + +cat </dev/null 2>&1; then + code "$REPO_ROOT/adventures/planned/00-blind-by-design/docs/expert.md" \ + "$CHALLENGE_DIR/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java" \ + "$CHALLENGE_DIR/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java" \ + "$CHALLENGE_DIR/flags.json" \ + 2>/dev/null || true +fi diff --git a/adventures/planned/00-blind-by-design/docs/expert.md b/adventures/planned/00-blind-by-design/docs/expert.md new file mode 100644 index 00000000..4b5334f5 --- /dev/null +++ b/adventures/planned/00-blind-by-design/docs/expert.md @@ -0,0 +1,292 @@ +# 🔴 Expert: Phase 3 — read the chart + +Three sub-tasks: + +1. **Wire the OpenTelemetry meter provider** and register the OpenFeature `MetricsHook` so flag evaluations show up as Prometheus counters. +2. **Author a `ContextSpanHook`** of your own — a small `Hook` that copies the merged evaluation context (`species`, `country`, `dose`) onto the active OTel span as `feature_flag.context.<key>` so traces correlate variants with the context that drove them. +3. **Diagnose and roll back a misbehaving fractional rollout.** The `vision_amplifier_v2` flag is at 100% on; it's adding 200ms latency and a 10% HTTP 5xx rate. Identify it on the Grafana dashboard and roll it back via `flags.json` — no redeploy. + +Spans are already flowing into Tempo from the OpenFeature `TracesHook`, but the metrics half is dead — the `MeterProvider` has no exporter and the `MetricsHook` was never registered. The dashboard the operator wants to triage from is empty. The k6 loadgen is idle, waiting for a flag flip to turn it on. 
+ +The level passes when (a) `feature_flag_evaluation_requests_total` is non-zero in Prometheus, (b) Tempo spans for `fun-with-flags-java-spring` carry `feature_flag.context.*` attributes, (c) `vision_amplifier_v2` is rolled back to 100% off, and (d) the HTTP 5xx rate over the last minute is below 1%. + +## ๐Ÿงช The story (optional) + +The trial just went wide. Phase 3 of the new vision amplifier โ€” `vision_amplifier_v2` โ€” was approved for the full cohort yesterday morning. The promise was straightforward: subjects emerge with sharper eyesight than they walked in with. By mid-afternoon the audit log was screaming. Subjects were stabilising 200ms slower, and roughly one in ten of them was emerging **blind** โ€” containment failure recorded as an HTTP 500. The lab director pulled up the **Feature Flag Metrics** dashboard expecting to triage visually. The dashboard was dark. Someone had wired up traces but never finished the metrics half. There is no chart to read. The lab is studying eyesight and the lab itself cannot see. + +Your job, in order: **turn on the lights**, find the bad arm of the trial, and **halt enrolment** on the amplifier โ€” all without redeploying the lab. That last constraint is the whole point of feature flags: when a rollout starts misbehaving in production, you need an operational lever that does not take twenty minutes to pull. Save the file, watch the dose drop, watch the 5xx rate fall back to baseline, watch the next batch of subjects walk out seeing. + +## โฐ Deadline + +Coming Soon +> โ„น๏ธ You can still complete the challenge after this date, but points will only +> be awarded for submissions before the deadline. + +## ๐Ÿ“ Solution Walkthrough + +> โš ๏ธ **Spoiler Alert:** The following walkthrough contains the full solution +> to the challenge. We encourage you to try solving it on your own first. +> Consider coming back here only if you get stuck or want to check your +> approach. 
+ +If you get stuck, follow the +[step-by-step solution walkthrough](./solutions/expert.md). + +## ๐Ÿ’ฌ Join the discussion + +Share your solutions and questions in the +[challenge thread](https://community.open-ecosystem.com/c/open-ecosystem-challenges/) +in the Open Ecosystem Community. + +## ๐Ÿ—๏ธ Architecture + +Four containers and one Spring Boot process, all on a shared Docker network. + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” OTLP/gRPC :4317 โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Spring Boot โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ถ โ”‚ grafana/otel-lgtm โ”‚ +โ”‚ fun-with-flags- โ”‚ flag eval + HTTP โ”‚ - Grafana :3000 โ”‚ +โ”‚ java-spring โ”‚ โ”‚ - Prometheus :9090 โ”‚ +โ”‚ :8080 โ”‚ โ”‚ - Tempo :3200 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ฒโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ OpenFeature SDK :8013 โ”‚ scrape / pull + โ”‚ (RPC mode) โ”‚ +โ”Œโ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ flagd โ”‚ โ—€โ”€โ”€โ”€โ”€ poll loadgen flag โ”€โ”€โ”‚ k6 loadgen โ”‚ +โ”‚ :8013 (gRPC + HTTP โ”‚ โ”‚ HTTP GET /?userId=โ€ฆ โ”‚ +โ”‚ eval gateway)โ”‚ โ”‚ (the lab interceptor โ”‚ +โ”‚ :8014 management / โ”‚ โ”‚ sets userId as the โ”‚ +โ”‚ metrics โ”‚ โ”‚ targetingKey, which โ”‚ +โ”‚ :8015 sync stream โ”‚ โ”‚ is what fractional โ”‚ +โ”‚ :8016 OFREP โ”‚ โ”‚ rollouts bucket on) โ”‚ +โ”‚ flags.json mounted โ”‚ โ”‚ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +## ๐ŸŽฏ Objective + +By the end of this level, you should have: + +- The OpenTelemetry **meter provider** wired and the OpenFeature **`MetricsHook`** registered +- Verified: the 
**`SpeciesInterceptor`** carried over from Intermediate is wiring `?userId=` as the OpenFeature **`targetingKey`** on every request, so the `vision_amplifier_v2` fractional rollout buckets per subject rather than landing every request in the same bucket *(you don't write this — verify it via the dashboard's variant-distribution panel after turning on the loadgen in step 3c)* +- A **`ContextSpanHook`** of your own — a small `Hook` that copies the merged evaluation context (`species`, `country`, `dose`) onto the active span as `feature_flag.context.<key>` — registered alongside `TracesHook`/`MetricsHook` +- **At least one trace** for service `fun-with-flags-java-spring` visible in Tempo +- Spans tagged with **`feature_flag.context.dose=underdose`** searchable in Tempo and lining up with `feature_flag.variant=clouded` on the same span +- The **`feature_flag_evaluation_requests_total`** counter non-zero in Prometheus +- The **`vision_amplifier_v2`** fractional rollout flipped back to **100% off / 0% on** +- The HTTP 5xx rate over the last minute below **1%** + +## 📚 Concepts you'll touch + +If you came in fresh on OpenTelemetry SDK plumbing or flagd's fractional rule, read this section first. + +### OpenTelemetry **TracerProvider** vs **MeterProvider** + +Spans are per-request timing (one trace per HTTP call, with nested events), counters are aggregate population stats (rate of evaluations across all requests, distribution of variants). In this lab the trace half is wired and Tempo already shows spans; the metrics half is dead and the dashboard is dark — that's the gap you close. + +OTel ships two parallel pipelines, one for **traces** (spans, distributed timing) and one for **metrics** (counters, histograms). Each has its own provider, its own SDK, its own exporter. In this level the `TracerProvider` is already wired (spans are flowing into Tempo). The `MeterProvider` is not — that is your fix. 
Both providers register globally via `GlobalOpenTelemetry`, so once you wire the meter, the OpenFeature `MetricsHook` finds it without any further plumbing. + +### OpenFeature `TracesHook` and `MetricsHook` + +The OpenFeature OTel contrib library ships two hooks that turn every flag evaluation into telemetry: + +- **`TracesHook`** โ€” emits a span event (`feature_flag.evaluation`) on the active span with `feature_flag.key`, `feature_flag.variant`, and `feature_flag.reason` attributes. This is why flag evaluations show up nested inside HTTP request spans in Tempo. +- **`MetricsHook`** โ€” emits four counters per evaluation: `feature_flag_evaluation_requests_total`, `_success_total`, `_error_total`, and an active-count up/down counter. These power the dashboard panels. + +Both hooks need a global `OpenTelemetry` instance. The `TracesHook` works once you have a `TracerProvider`; the `MetricsHook` needs a `MeterProvider`. + +### Authoring your own hook to enrich spans with context + +The `AuditHook` carried over from Intermediate already records the same context attributes (species / country / dose) into a durable `[AUDIT]` log line โ€” that is the safety officer's tool, useful weeks later for forensic follow-up. What it does not give you is **real-time correlation in the dashboard**: log lines do not show up alongside `feature_flag.variant` on a Tempo span. So `TracesHook` is great at recording **what** happened (the variant, the reason), `AuditHook` records the audit-archive view, and there is still a gap โ€” the evaluation context attributes that drove the decision are not on the span. The two hooks stay; you add a third for the on-call's view. + +The OpenFeature `Hook` interface is the right place to fix that. The shape is roughly: + +```text +before(hookCtx) { + span = active OTel span + for each allowlisted key in merged eval context: + span.setAttribute("feature_flag.context." 
+ key, value) +} +``` + +The `before` hook receives a `HookContext` whose `getCtx()` returns the **merged** evaluation context (global + transaction + invocation), which is exactly what drove the flag's resolution โ€” so the attributes you copy off it line up with what the variant decision actually saw. Span attributes go on `Span.current()` because that is the active HTTP request span; the OpenFeature hook fires inside that span's scope. + +Register it next to `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. Now every flag evaluation tags its parent span with the context attributes the lab cares about. In Tempo: **Search โ†’ Service: fun-with-flags-java-spring โ†’ +Tag โ†’ `feature_flag.context.dose=underdose`** lights up exactly the requests where a tech mis-dosed, with the resolved variant on the same span event. + +The full implementation, including imports and a couple of subtle correctness notes, is in [solutions/expert.md](./solutions/expert.md). + +> โš ๏ธ **Allowlist, don't iterate.** Use a fixed allowlist for the same reason the `AuditHook` does โ€” see [Intermediate's PII note](./intermediate.md#3c-an-audithook) and the [OpenTelemetry security guidance](https://opentelemetry.io/docs/security/). + +### `flagd` `fractional` operation + `targetingKey` + +`fractional` is flagd's bucketing operation. Given a list of `[variant, percent]` pairs, it deterministically assigns each evaluation to one variant based on a hash of the **targeting key** on the evaluation context. Same key โ†’ same bucket โ†’ same variant, every request. Different keys spread across the percentages. **If no targeting key is set, every evaluation hashes the same way and the rollout collapses โ€” every request lands in the same bucket and the percentages do nothing.** + +You already wired this up in Intermediate. 
The **`SpeciesInterceptor`** you wrote there reads `?userId=...` from each request and constructs an `ImmutableContext(userId, attributes)` โ€” by SDK convention the first `String` argument to `ImmutableContext` **is** the OpenFeature `targetingKey`. Expert ships the same interceptor byte-for-byte; the lab is already serving fractional rollouts correctly without you touching it. (Intermediate didn't have a flag that used the targetingKey; this is where it pays off.) + +The k6 loadgen demonstrates this end-to-end: it generates a fresh random `userId` per request, which means the interceptor produces a different targeting key per request, which means the fractional rollout spreads across the percentages exactly as configured. The dashboard's variant-distribution panel reflects that split directly. + +## ๐Ÿง  What You'll Learn + +- How the OpenFeature OpenTelemetry hooks (`TracesHook` and `MetricsHook`) join + flag evaluations to the rest of an application's telemetry without a + separate ingestion path +- How to **author your own `Hook`** โ€” a tiny class that copies merged-eval-context + attributes onto the active OTel span โ€” to close the loop between *why* a + flag resolved the way it did and *what* the operator sees in Tempo +- How [`fractional`](https://flagd.dev/reference/custom-operations/fractional-operation/) + rollout in flagd buckets users by `targetingKey` โ€” same key, same bucket, every + request โ€” and how to read that bucketing off a dashboard +- How a **flag flip** is a faster operational lever than a redeploy when a + rollout is misbehaving โ€” the difference between a one-line config change and + a twenty-minute deployment + +## ๐Ÿงฐ Toolbox + +Your Codespace comes pre-configured with the following tools: + +- [`curl`](https://curl.se/): HTTP client for hitting the lab, flagd, and Prometheus +- [`./mvnw`](https://maven.apache.org/wrapper/): The Maven wrapper to build and run the Spring Boot lab +- A browser pointed at 
[`http://localhost:3000`](http://localhost:3000) for Grafana (admin / admin) +- [`jq`](https://jqlang.github.io/jq/): Pretty-print and filter JSON from `curl` + +flagd, the Grafana LGTM stack, and the k6 loadgen are **sibling devcontainer services** โ€” they come up automatically when the Codespace boots. There is no `docker compose up` step. Inside the workspace they are reachable as `flagd`, `lgtm`, and `loadgen`; on the host they are forwarded to the same `localhost:NNNN` ports that `verify.sh` and the docs assume. + +## โœ… How to Play + +### 1. Start Your Challenge + +> ๐Ÿ“– **First time?** Check out the [Getting Started Guide](../../start-a-challenge) +> for detailed instructions on forking, starting a Codespace, and waiting for +> infrastructure setup. + +Quick start: + +- Fork the repo +- Create a Codespace +- Select **"Adventure 00 | ๐Ÿ”ด Expert (Phase 3 โ€” read the chart)"** +- Wait ~2-3 minutes for the sibling containers (flagd, Grafana LGTM, k6 + loadgen) to come up. They are part of the devcontainer compose, so they + start automatically โ€” no `docker compose up` step. +- Once the IDE attaches to the workspace, start the Spring Boot lab. Click + **Run** on `Laboratory` in the Spring Boot Dashboard panel (or press + **F5** with `Laboratory.java` open), or run `./mvnw spring-boot:run` + from the integrated terminal. + +### 2. Access the UIs + +Open the **Ports** tab in the bottom panel and click through to: + +#### Spring Boot lab (Port `8080`) + +The application under test. Open `http://localhost:8080/` to get a vision_state reading +back. Add a `userId` query parameter (e.g. `?userId=subject-42`) to give the +fractional rollout a stable bucketing key. + +#### Grafana (Port `3000`) + +The single window into the LGTM stack. Login is `admin` / `admin` (skip the +"change your password" prompt). + +- **Dashboards โ†’ Fun With Flags โ€” Feature Flag Metrics** โ€” the dashboard the + director keeps reloading. Empty for now. 
+- **Explore โ†’ Tempo** โ€” search by service `fun-with-flags-java-spring` + to see flag evaluations as span events nested inside HTTP request spans. + Traces work even before you wire up metrics. + +#### Prometheus (Port `9090`) + +Exposed by the LGTM container. Useful for `curl`-driven debugging: +`curl 'http://localhost:9090/api/v1/query?query=feature_flag_evaluation_requests_total'`. + +#### Tempo (Port `3200`) + +Tempo's own HTTP API. The `verify.sh` script uses +`http://localhost:3200/api/search?tags=service.name=fun-with-flags-java-spring` +to assert traces are flowing. + +#### flagd + +flagd is on `:8013` (gRPC eval) โ€” same as Beginner; the other ports (`8014` management/metrics, `8015` sync, `8016` OFREP) aren't used in this level. + +#### OTLP receivers (Ports `4317` / `4318`) + +The Spring Boot app exports traces (and, after you finish the wiring, metrics) +to the LGTM stack on `4317` (gRPC) and `4318` (HTTP). + +### 3. Implement the Objective + +There are three sub-tasks, in order: + +#### 3a. Wire the OpenTelemetry meter provider + +Open +`adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. +The `@Bean` method already calls `AutoConfiguredOpenTelemetrySdk.builder()`, +which produces an `OpenTelemetry` instance with **both** a `SdkTracerProvider` +and a `SdkMeterProvider` โ€” but only the tracer provider has an exporter. +The meter provider is told `otel.metrics.exporter=none`, so any metrics it +records go nowhere. + +Flip `otel.metrics.exporter` to `otlp` so the SDK attaches an +`OtlpGrpcMetricExporter`. The cleanest way is to update both the default in +`OpenTelemetryConfig.java` and the value in +`src/main/resources/application.properties`. While you're there, set +`otel.metric.export.interval=10000` so the dashboard updates within ten +seconds of new traffic instead of waiting a minute. + +#### 3b. 
Register `MetricsHook(OpenTelemetry)` on the OpenFeature API + +Open `OpenFeatureConfig.java`. The `TracesHook` is already registered; +`MetricsHook` is not. `MetricsHook` needs the `OpenTelemetry` instance to grab +the meter provider, so inject the bean via constructor injection and add +`api.addHooks(new MetricsHook(openTelemetry));` next to the `TracesHook` call. Register your `ContextSpanHook` (see "Authoring your own hook to enrich spans with context" above) with another `api.addHooks(...)` call in the same place. + +If you compile and run after this step, the **Fun With Flags — Feature Flag +Metrics** dashboard in Grafana stays empty — there is no traffic. Move on. + +#### 3c. Turn on the loadgen, find the bad rollout, roll it back + +Edit `flags.json` in the expert directory and flip `loadgen_active`'s +`defaultVariant` from `"off"` to `"on"`. flagd watches the file and picks up +changes within a second. The k6 loadgen container has been polling +`loadgen_active` every two seconds — it will notice and start hammering +`http://workspace:8080/` with five virtual users (the workspace service name resolves inside the compose network). + +Now open the dashboard. When the loadgen turns on you should see latency creep up by around 200ms and the 5xx rate settle around 10%; if those don't move, the loadgen flag isn't actually live yet. + +That's the diagnosis: the fractional rollout for `vision_amplifier_v2` is +inverted. The flag definition currently reads: + +```json +"fractional": [ + ["off", 0], + ["on", 100] +] +``` + +Edit `flags.json` again — flip the percentages so `off` gets `100` and `on` +gets `0`. Save. Within one or two seconds flagd reloads. Because the +`SpeciesInterceptor` is wiring `?userId=` through to the OpenFeature +`targetingKey` on every request, and the loadgen generates a fresh `userId` +per request, the fractional rollout responds immediately — every subject +re-buckets against the new percentages and the population moves to the safe +variant. Watch the latency p99 panel collapse back to baseline and the 5xx +rate fall to zero. + +**No deploy. No rebuild. No restart of the lab.** + +### 4. 
Verify Your Solution + +Once the dashboard is healthy, run the verifier: + +```bash +adventures/planned/00-blind-by-design/expert/verify.sh +``` + +The script asserts the lab, flagd, and LGTM are reachable, that +`vision_amplifier_v2` evaluates to `false` for a probe user, that the +`feature_flag_evaluation_requests_total` Prometheus counter is non-zero, that +Tempo has at least one trace for `fun-with-flags-java-spring`, and that the +HTTP 5xx rate over the last minute is below 1%. + +If everything turns green, your solution is solid. ๐ŸŽ‰ diff --git a/adventures/planned/00-blind-by-design/docs/solutions/expert.md b/adventures/planned/00-blind-by-design/docs/solutions/expert.md new file mode 100644 index 00000000..7b414c4e --- /dev/null +++ b/adventures/planned/00-blind-by-design/docs/solutions/expert.md @@ -0,0 +1,280 @@ +# ๐Ÿ”ด Expert Solution Walkthrough: Phase 3 โ€” read the chart + +Four sub-tasks, in order: wire the meter provider, register `MetricsHook`, +write and register a `ContextSpanHook` of your own, roll the bad flag back. +We'll do them exactly that way. + +> โš ๏ธ **Spoiler Alert:** This walkthrough contains the full solution. Try +> solving it on your own first. 
+ +## 📋 Step 1: Read the objective + +> By the end of this level, you should have: +> +> - The OpenTelemetry meter provider wired and the OpenFeature `MetricsHook` registered +> - A `ContextSpanHook` of your own that copies the merged evaluation context +> (`species`, `country`, `dose`) onto the active span as `feature_flag.context.<key>` +> - At least one trace for service `fun-with-flags-java-spring` visible in Tempo +> - Spans tagged with `feature_flag.context.dose=underdose` searchable in Tempo +> - The `feature_flag_evaluation_requests_total` counter non-zero in Prometheus +> - The `vision_amplifier_v2` fractional rollout flipped back to 100% off / 0% on +> - HTTP 5xx rate over the last minute below 1% + +## 🔍 Step 2: Inspect what's already wired + +Traces work out of the box — the `TracesHook` is registered in +`OpenFeatureConfig.java` and the OTel SDK is exporting via OTLP/gRPC to the +LGTM container at `http://localhost:4317`. Open Grafana → Explore → Tempo → +search for `service.name=fun-with-flags-java-spring` and you should already +see traces. (If you don't, hit `curl http://localhost:8080/` a few times to +generate some.) + +The metrics half, however, is dead. Two reasons: + +1. `application.properties` has `otel.metrics.exporter=none`. The SDK creates + a `SdkMeterProvider` but no exporter is attached, so any counter it + records is dropped. +2. `OpenFeatureConfig.initProvider()` registers `TracesHook` but not + `MetricsHook`. Even if the meter provider could export, no one is + recording flag evaluations as metrics. + +One thing that **is** already wired and matters for this level: the +`SpeciesInterceptor` you wrote in Intermediate. Expert ships it byte-for-byte +unchanged. The relevant part for this level is the line you already wrote +that reads `?userId=…` from the query string and constructs +`new ImmutableContext(userId, attributes)` — by SDK convention, the first +`String` argument **is** the OpenFeature `targetingKey`. 
That is what makes +the `vision_amplifier_v2` fractional rollout actually bucket per subject; +without it, every evaluation would hash the same way and the percentages +would do nothing. (Intermediate didn't have a flag that used the +targetingKey, so the wiring sat dormant; this is where it pays off.) You +don't write any new code for this in Expert โ€” the rollback in Step 6 takes +effect immediately because the loadgen sends a fresh `userId` per request +into the interceptor you already shipped. + +## ๐Ÿ›  Step 3: Wire the meter provider + +Open `src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. +Change the default for `otel.metrics.exporter` from `"none"` to `"otlp"`, and +add a default for `otel.metric.export.interval` so the meter flushes every +ten seconds. The full method: + +```java +@Bean +public OpenTelemetry openTelemetry( + @Value("${otel.service.name:fun-with-flags-java-spring}") String serviceName, + @Value("${otel.exporter.otlp.endpoint:http://localhost:4317}") String otlpEndpoint, + @Value("${otel.exporter.otlp.protocol:grpc}") String otlpProtocol, + @Value("${otel.traces.exporter:otlp}") String tracesExporter, + @Value("${otel.metrics.exporter:otlp}") String metricsExporter, + @Value("${otel.logs.exporter:none}") String logsExporter, + @Value("${otel.metric.export.interval:10000}") String metricExportInterval) { + System.setProperty("otel.service.name", serviceName); + System.setProperty("otel.exporter.otlp.endpoint", otlpEndpoint); + System.setProperty("otel.exporter.otlp.protocol", otlpProtocol); + System.setProperty("otel.traces.exporter", tracesExporter); + System.setProperty("otel.metrics.exporter", metricsExporter); + System.setProperty("otel.logs.exporter", logsExporter); + System.setProperty("otel.metric.export.interval", metricExportInterval); + + autoConfigured = AutoConfiguredOpenTelemetrySdk.builder() + .setResultAsGlobal() + .build(); + return autoConfigured.getOpenTelemetrySdk(); +} +``` + +Then update 
`src/main/resources/application.properties` to match: + +```properties +spring.application.name=demo + +otel.exporter.otlp.endpoint=http://localhost:4317 +otel.exporter.otlp.protocol=grpc +otel.traces.exporter=otlp +otel.metrics.exporter=otlp +otel.logs.exporter=none +otel.service.name=fun-with-flags-java-spring +otel.metric.export.interval=10000 +``` + +> The autoconfigure module reads `otel.metrics.exporter` and, when set to +> `otlp`, attaches an `OtlpGrpcMetricExporter` to the `SdkMeterProvider`. The +> resulting `OpenTelemetry` bean now exposes a working `getMeterProvider()`. + +## ๐Ÿ›  Step 4: Register `MetricsHook` on the OpenFeature API + +Open `OpenFeatureConfig.java`. Inject the `OpenTelemetry` bean via +constructor injection and add `MetricsHook` next to the existing +`TracesHook` call: + +```java +import dev.openfeature.contrib.hooks.otel.MetricsHook; +import dev.openfeature.contrib.hooks.otel.TracesHook; +import io.opentelemetry.api.OpenTelemetry; + +@Configuration +public class OpenFeatureConfig implements WebMvcConfigurer { + + private final OpenTelemetry openTelemetry; + + public OpenFeatureConfig(OpenTelemetry openTelemetry) { + this.openTelemetry = openTelemetry; + } + + @PostConstruct + public void initProvider() { + OpenFeatureAPI api = OpenFeatureAPI.getInstance(); + FlagdOptions flagdOptions = FlagdOptions.builder() + .resolverType(Config.Resolver.RPC) + .build(); + api.setProviderAndWait(new FlagdProvider(flagdOptions)); + + HashMap attributes = new HashMap<>(); + attributes.put("country", new Value(Optional.ofNullable(System.getenv("COUNTRY")).orElse(""))); + api.setEvaluationContext(new ImmutableContext(attributes)); + + api.addHooks(new AuditHook()); // already wired in broken state + api.addHooks(new TracesHook()); // already wired in broken state + api.addHooks(new MetricsHook(openTelemetry)); // <-- you add this + api.addHooks(new ContextSpanHook()); // <-- you add this + } + + // addInterceptors(...) 
unchanged +} +``` + +### The `ContextSpanHook` + +A small `Hook` of your own, in a new file `ContextSpanHook.java`, that mirrors the merged evaluation context onto the active span. This is what lets Tempo show "this request had `dose=underdose` and got `variant=clouded`" on the same span. + +```java +package dev.openfeature.demo.java.demo; + +import dev.openfeature.sdk.EvaluationContext; +import dev.openfeature.sdk.Hook; +import dev.openfeature.sdk.HookContext; +import dev.openfeature.sdk.Value; +import io.opentelemetry.api.trace.Span; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +public class ContextSpanHook implements Hook { + + private static final List TRACKED = List.of("species", "country", "dose"); + + @Override + public Optional before(HookContext ctx, Map hints) { + Span span = Span.current(); + EvaluationContext ec = ctx.getCtx(); + for (String key : TRACKED) { + Value v = ec.getValue(key); + if (v != null && v.asString() != null) { + span.setAttribute("feature_flag.context." + key, v.asString()); + } + } + return Hook.super.before(ctx, hints); + } +} +``` + +Three notes worth calling out: + +- `HookContext.getCtx()` returns the **merged** evaluation context โ€” global + transaction + invocation, in that precedence order. So the hook reads whatever the SDK is about to use, regardless of which layer set the value. +- `Span.current()` returns the no-op span if there is no active OTel context (e.g. in tests without an instrumented HTTP server). `setAttribute` on the no-op span is a safe no-op, so the hook does not need defensive guards. +- **`TRACKED` is a fixed allowlist on purpose โ€” do not iterate.** The merged context typically also carries `targetingKey` (often a stable user id) and, in real apps, things like `email`, account ids, or device identifiers. 
If you replace the allowlist with `for (String key : ec.asMap().keySet())` you ship that PII straight into Tempo / Prometheus, where it is retained for days and is hard to redact after the fact. Pick the minimum set of keys that helps you correlate, document why each is safe for long-term storage, and add new keys deliberately. The OpenTelemetry [security & privacy guidance](https://opentelemetry.io/docs/security/) covers the broader principle. + +Restart the lab: + +```bash +./mvnw spring-boot:run +``` + +After it boots, hit `curl http://localhost:8080/` a few times. Wait ten to +fifteen seconds and check Prometheus: + +```bash +curl -s 'http://localhost:9090/api/v1/query?query=feature_flag_evaluation_requests_total' | jq +``` + +You should see entries with `feature_flag_key` labels for `vision_state`, +`vision_amplifier_v2`, and `loadgen_active`. The dashboard panels in Grafana +will start drawing within the next refresh interval. + +## 🛠 Step 5: Turn on the loadgen and read the chart + +Open `flags.json` and flip `loadgen_active`: + +```json +"loadgen_active": { + "state": "ENABLED", + "variants": { "off": false, "on": true }, + "defaultVariant": "on" +} +``` + +Save. The k6 loadgen polls flagd every two seconds and starts hammering. Now +open Grafana → **Dashboards → Fun With Flags — Feature Flag Metrics**. +You'll see: + +- **Evaluations per second** — three flag keys, all live +- **Variant distribution** — `vision_amplifier_v2` is heavily skewed toward `on` +- **HTTP latency** — sitting around 200ms, well above baseline +- **HTTP 5xx rate** — around 10% + +## 🛠 Step 6: Roll the rollout back + +The fractional bucket for `vision_amplifier_v2` is inverted. Edit `flags.json`: + +```diff + "vision_amplifier_v2": { + "state": "ENABLED", + "variants": { "off": false, "on": true }, + "defaultVariant": "off", + "targeting": { + "fractional": [ +- ["off", 0], +- ["on", 100] ++ ["off", 100], ++ ["on", 0] + ] + } + } +``` + +Save.
flagd reloads within a second. The k6 script generates a fresh +`userId` per request, so the next request is immediately bucketed into +`off`. The dashboard panels recover within seconds. + +## ✅ Step 7: Verify + +Run the verifier: + +```bash +adventures/planned/00-blind-by-design/expert/verify.sh +``` + +All eight checks should pass (lab reachable, flagd reachable, LGTM +reachable, `vision_amplifier_v2` rolled back, Prometheus has the metric +counter, Tempo has traces, Tempo spans carry the `feature_flag.context.*` +attribute, 5xx rate below threshold). The 5xx rate check tolerates a brief +tail of errors from before the rollback, but if you wait a minute it +settles to zero. + +## 🎓 What this exercise demonstrates + +- **Decoupling deployment from release.** Once the flag is in place, rolling + out and rolling back happen via a JSON edit, not a redeploy. That is the + same lever you would pull at 3am when the new pricing engine starts + erroring. +- **Stable bucketing via `targetingKey`.** The k6 script generates a fresh + `userId` per request *on purpose* — it lets us see the rollback take + effect immediately. In a real app, the `userId` is the logged-in user, so + the bucketing is sticky across the user's session and the rollback only + helps users who arrive *after* the flag flip. +- **Two halves of OTel observability.** Traces tell you about a specific + request; metrics tell you about the population. The OpenFeature OTel + hooks expose both for flag evaluations using the same OTel SDK the rest of + the app already exports through.
diff --git a/adventures/planned/00-blind-by-design/expert/.mvn/wrapper/maven-wrapper.properties b/adventures/planned/00-blind-by-design/expert/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 00000000..3ee7848f --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1 @@ +distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.15/apache-maven-3.9.15-bin.zip diff --git a/adventures/planned/00-blind-by-design/expert/.vscode/launch.json b/adventures/planned/00-blind-by-design/expert/.vscode/launch.json new file mode 100644 index 00000000..5c0005f5 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/.vscode/launch.json @@ -0,0 +1,14 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "java", + "name": "๐Ÿงช Run the Phase 3 Lab", + "request": "launch", + "mainClass": "dev.openfeature.demo.java.demo.Laboratory", + "projectName": "demo", + "console": "integratedTerminal", + "cwd": "${workspaceFolder}" + } + ] +} diff --git a/adventures/planned/00-blind-by-design/expert/.vscode/tasks.json b/adventures/planned/00-blind-by-design/expert/.vscode/tasks.json new file mode 100644 index 00000000..1d483f30 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/.vscode/tasks.json @@ -0,0 +1,14 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "๐Ÿงช Verify Solution", + "type": "shell", + "command": "./verify.sh", + "options": { "cwd": "${workspaceFolder}" }, + "problemMatcher": [], + "presentation": { "reveal": "always", "panel": "dedicated" }, + "group": { "kind": "test", "isDefault": true } + } + ] +} diff --git a/adventures/planned/00-blind-by-design/expert/dashboards/feature-flags.json b/adventures/planned/00-blind-by-design/expert/dashboards/feature-flags.json new file mode 100644 index 00000000..a293ce92 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/dashboards/feature-flags.json @@ -0,0 +1,135 @@ +{ + "annotations": 
{ + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 10 }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (feature_flag_key) (rate(feature_flag_evaluation_requests_total[1m]))", + "legendFormat": "{{feature_flag_key}}", + "refId": "A" + } + ], + "title": "Flag evaluations per second (by flag)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { "legend": { "displayMode": "table", "placement": "right" }, "pieType": "donut", "reduceOptions": { "calcs": ["lastNotNull"], "values": false } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (feature_flag_variant) (increase(feature_flag_evaluation_success_total[5m]))", + "legendFormat": "{{feature_flag_variant}}", + "refId": "A" + } + ], + "title": "Variant distribution (last 5m)", + "type": "piechart" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "options": { "legend": { 
"displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (feature_flag_key, error_type) (rate(feature_flag_evaluation_error_total[1m]))", + "legendFormat": "{{feature_flag_key}} ({{error_type}})", + "refId": "A" + } + ], + "title": "Evaluation errors per second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 4, + "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (service_name) (rate(feature_flag_evaluation_requests_total[1m]))", + "legendFormat": "{{service_name}}", + "refId": "A" + } + ], + "title": "Evaluations per service (rate)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Step 7 โ€” HTTP request latency p99 from OTel auto-instrumentation. 
Watch this rise when a slow rollout cohort gets bigger.", + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 5 }, "unit": "s" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 5, + "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum by (le, service_name) (rate(http_server_request_duration_seconds_bucket[1m])))", + "legendFormat": "p99 {{service_name}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.50, sum by (le, service_name) (rate(http_server_request_duration_seconds_bucket[1m])))", + "legendFormat": "p50 {{service_name}}", + "refId": "B" + } + ], + "title": "HTTP request latency (p50, p99)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Step 7 โ€” HTTP 5xx rate. 
Watch this jump when the new code path's error injection kicks in.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.1 }, { "color": "red", "value": 1 } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 6, + "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (service_name) (rate(http_server_request_duration_seconds_count{http_response_status_code=~\"5..\"}[1m]))", + "legendFormat": "{{service_name}}", + "refId": "A" + } + ], + "title": "HTTP 5xx per second", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["openfeature", "feature-flags"], + "templating": { "list": [] }, + "time": { "from": "now-15m", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Fun With Flags โ€” Feature Flag Metrics", + "uid": "fun-with-flags-metrics", + "version": 1, + "weekStart": "" +} diff --git a/adventures/planned/00-blind-by-design/expert/flags.json b/adventures/planned/00-blind-by-design/expert/flags.json new file mode 100644 index 00000000..4ccfb246 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/flags.json @@ -0,0 +1,46 @@ +{ + "flags": { + "vision_state": { + "state": "ENABLED", + "variants": { + "enhanced": "enhanced", + "sharp": "sharp", + "blurry": "blurry", + "clouded": "clouded" + }, + "defaultVariant": "blurry", + "targeting": { + "if": [ + { "===": [{ "var": "species" }, "zyklop"] }, + "enhanced", + { "in": [{ "var": "dose" }, ["underdose", "overdose"]] }, + "clouded", + { "===": [{ "var": "country" }, "de"] }, + "sharp" + ] + } + }, + "vision_amplifier_v2": { + "state": "ENABLED", + "variants": { + "off": false, + "on": true + }, + "defaultVariant": 
"off", + "targeting": { + "fractional": [ + ["off", 0], + ["on", 100] + ] + } + }, + "loadgen_active": { + "state": "ENABLED", + "variants": { + "off": false, + "on": true + }, + "defaultVariant": "off" + } + } +} diff --git a/adventures/planned/00-blind-by-design/expert/loadgen/k6/script.js b/adventures/planned/00-blind-by-design/expert/loadgen/k6/script.js new file mode 100644 index 00000000..bd648a77 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/loadgen/k6/script.js @@ -0,0 +1,63 @@ +// k6 script that hits the demo's GET / with random species values, but only +// when the OpenFeature flag `loadgen_active` is true. Flip the flag in the +// running flagd's flags.json (defaultVariant: "off" โ†’ "on") and the script +// starts hammering within seconds. Flip it back and it goes idle. +// +// The script targets one app instance via BASE_URL โ€” point it at :8080 of +// whichever folder you're running. FLAGD_URL is flagd's eval endpoint on +// :8013 (the gRPC port also serves HTTP/JSON via gRPC-Gateway, so a plain +// curl-style POST works against the same port the SDK uses). + +import http from 'k6/http'; +import { sleep } from 'k6'; + +export const options = { + vus: 5, // five virtual users; modest load, dashboard stays readable + duration: '24h', // run forever โ€” toggle the flag to start/stop traffic +}; + +const BASE_URL = __ENV.BASE_URL || 'http://host.docker.internal:8080'; +const FLAGD_URL = __ENV.FLAGD_URL || 'http://host.docker.internal:8013'; + +// Pool of subject species. Empty string means "no query parameter" โ€” exercises +// the country-fallback or default branch. The mix is deliberately uneven so the +// variant distribution panel in Grafana looks like real traffic, not a flat split. +const SPECIES = ['zyklop', 'zyklop', 'human', 'human', 'human', 'orc', 'elf', 'goblin', '']; + +// Generate a random user id per request. 
The Phase 3 `vision_amplifier_v2` flag +// uses a fractional rollout that buckets on the OpenFeature targetingKey, so +// without a stable per-request id every request would land in the same bucket. +function randomUserId() { + return `user-${Math.floor(Math.random() * 100000)}`; +} + +function isLoadgenActive() { + const res = http.post( + `${FLAGD_URL}/flagd.evaluation.v1.Service/ResolveBoolean`, + JSON.stringify({ flagKey: 'loadgen_active', context: {} }), + { headers: { 'Content-Type': 'application/json' }, timeout: '2s' }, + ); + if (res.status !== 200) return false; + try { + return JSON.parse(res.body).value === true; + } catch { + return false; + } +} + +export default function () { + if (!isLoadgenActive()) { + // Flag is off โ€” idle gently. Two seconds is short enough to feel responsive + // when the flag flips on, long enough not to thrash flagd. + sleep(2); + return; + } + + const species = SPECIES[Math.floor(Math.random() * SPECIES.length)]; + const userId = randomUserId(); + const params = [`userId=${userId}`]; + if (species) params.push(`species=${species}`); + const url = `${BASE_URL}/?${params.join('&')}`; + http.get(url, { tags: { species: species || 'default' } }); + sleep(0.1); +} diff --git a/adventures/planned/00-blind-by-design/expert/mvnw b/adventures/planned/00-blind-by-design/expert/mvnw new file mode 100755 index 00000000..9b14e061 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/mvnw @@ -0,0 +1,259 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Apache Maven Wrapper startup batch script, version 3.3.4 +# +# Optional ENV vars +# ----------------- +# JAVA_HOME - location of a JDK home dir, required when download maven via java source +# MVNW_REPOURL - repo url base for downloading maven distribution +# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output +# ---------------------------------------------------------------------------- + +set -euf +[ "${MVNW_VERBOSE-}" != debug ] || set -x + +# OS specific support. +native_path() { printf %s\\n "$1"; } +case "$(uname)" in +CYGWIN* | MINGW*) + [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" + native_path() { cygpath --path --windows "$1"; } + ;; +esac + +# set JAVACMD and JAVACCMD +set_java_home() { + # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched + if [ -n "${JAVA_HOME-}" ]; then + if [ -x "$JAVA_HOME/jre/sh/java" ]; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACCMD="$JAVA_HOME/jre/sh/javac" + else + JAVACMD="$JAVA_HOME/bin/java" + JAVACCMD="$JAVA_HOME/bin/javac" + + if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then + echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." 
>&2 + echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 + return 1 + fi + fi + else + JAVACMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v java + )" || : + JAVACCMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v javac + )" || : + + if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then + echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 + return 1 + fi + fi +} + +# hash string like Java String::hashCode +hash_string() { + str="${1:-}" h=0 + while [ -n "$str" ]; do + char="${str%"${str#?}"}" + h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) + str="${str#?}" + done + printf %x\\n $h +} + +verbose() { :; } +[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } + +die() { + printf %s\\n "$1" >&2 + exit 1 +} + +trim() { + # MWRAPPER-139: + # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. + # Needed for removing poorly interpreted newline sequences when running in more + # exotic environments such as mingw bash on Windows. 
+ printf "%s" "${1}" | tr -d '[:space:]' +} + +# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties +while IFS="=" read -r key value; do + case "${key-}" in + distributionUrl) distributionUrl=$(trim "${value-}") ;; + distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; + esac +done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" +[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" + +case "${distributionUrl##*/}" in +maven-mvnd-*bin.*) + MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ + case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in + *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; + :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; + :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; + :Linux*x86_64*) distributionPlatform=linux-amd64 ;; + *) + echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 + distributionPlatform=linux-amd64 + ;; + esac + distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" + ;; +maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;; +*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; +esac + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ +[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" +distributionUrlName="${distributionUrl##*/}" +distributionUrlNameMain="${distributionUrlName%.*}" +distributionUrlNameMain="${distributionUrlNameMain%-bin}" +MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" +MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" + +exec_maven() { + unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : + exec "$MAVEN_HOME/bin/$MVN_CMD" 
"$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" +} + +if [ -d "$MAVEN_HOME" ]; then + verbose "found existing MAVEN_HOME at $MAVEN_HOME" + exec_maven "$@" +fi + +case "${distributionUrl-}" in +*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; +*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; +esac + +# prepare tmp dir +if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then + clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } + trap clean HUP INT TERM EXIT +else + die "cannot create temp dir" +fi + +mkdir -p -- "${MAVEN_HOME%/*}" + +# Download and Install Apache Maven +verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +verbose "Downloading from: $distributionUrl" +verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +# select .zip or .tar.gz +if ! command -v unzip >/dev/null; then + distributionUrl="${distributionUrl%.zip}.tar.gz" + distributionUrlName="${distributionUrl##*/}" +fi + +# verbose opt +__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' +[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v + +# normalize http auth +case "${MVNW_PASSWORD:+has-password}" in +'') MVNW_USERNAME='' MVNW_PASSWORD='' ;; +has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; +esac + +if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then + verbose "Found wget ... using wget" + wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" +elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then + verbose "Found curl ... 
using curl" + curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" +elif set_java_home; then + verbose "Falling back to use Java to download" + javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" + targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" + cat >"$javaSource" <<-END + public class Downloader extends java.net.Authenticator + { + protected java.net.PasswordAuthentication getPasswordAuthentication() + { + return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); + } + public static void main( String[] args ) throws Exception + { + setDefault( new Downloader() ); + java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); + } + } + END + # For Cygwin/MinGW, switch paths to Windows format before running javac and java + verbose " - Compiling Downloader.java ..." + "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" + verbose " - Running Downloader.java ..." + "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" +fi + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +if [ -n "${distributionSha256Sum-}" ]; then + distributionSha256Result=false + if [ "$MVN_CMD" = mvnd.sh ]; then + echo "Checksum validation is not supported for maven-mvnd." >&2 + echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
>&2 + exit 1 + elif command -v sha256sum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + elif command -v shasum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + else + echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 + echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + fi + if [ $distributionSha256Result = false ]; then + echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 + echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2 + exit 1 + fi +fi + +# unzip and move +if command -v unzip >/dev/null; then + unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" +else + tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" +fi +printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" +mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" + +clean || : +exec_maven "$@" diff --git a/adventures/planned/00-blind-by-design/expert/mvnw.cmd b/adventures/planned/00-blind-by-design/expert/mvnw.cmd new file mode 100644 index 00000000..155e00b9 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/mvnw.cmd @@ -0,0 +1,149 @@ +<# : batch portion +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor 
license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. +@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Apache Maven Wrapper startup batch script, version 3.3.4 +@REM +@REM Optional ENV vars +@REM MVNW_REPOURL - repo url base for downloading maven distribution +@REM MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +@REM MVNW_VERBOSE - true: enable verbose log; others: silence the output +@REM ---------------------------------------------------------------------------- + +@IF "%__MVNW_ARG0_NAME__%"=="" (SET __MVNW_ARG0_NAME__=%~nx0) +@SET __MVNW_CMD__= +@SET __MVNW_ERROR__= +@SET __MVNW_PSMODULEP_SAVE=%PSModulePath% +@SET PSModulePath= +@FOR /F "usebackq tokens=1* delims==" %%A IN (`powershell -noprofile "& {$scriptDir='%~dp0'; $script='%__MVNW_ARG0_NAME__%'; icm -ScriptBlock ([Scriptblock]::Create((Get-Content -Raw '%~f0'))) -NoNewScope}"`) DO @( + IF "%%A"=="MVN_CMD" (set __MVNW_CMD__=%%B) ELSE IF "%%B"=="" (echo %%A) ELSE (echo %%A=%%B) +) +@SET PSModulePath=%__MVNW_PSMODULEP_SAVE% +@SET __MVNW_PSMODULEP_SAVE= +@SET __MVNW_ARG0_NAME__= +@SET MVNW_USERNAME= +@SET MVNW_PASSWORD= +@IF NOT "%__MVNW_CMD__%"=="" (%__MVNW_CMD__% %*) +@echo 
Cannot start maven from wrapper >&2 && exit /b 1 +@GOTO :EOF +: end batch / begin powershell #> + +$ErrorActionPreference = "Stop" +if ($env:MVNW_VERBOSE -eq "true") { + $VerbosePreference = "Continue" +} + +# calculate distributionUrl, requires .mvn/wrapper/maven-wrapper.properties +$distributionUrl = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionUrl +if (!$distributionUrl) { + Write-Error "cannot read distributionUrl property in $scriptDir/.mvn/wrapper/maven-wrapper.properties" +} + +switch -wildcard -casesensitive ( $($distributionUrl -replace '^.*/','') ) { + "maven-mvnd-*" { + $USE_MVND = $true + $distributionUrl = $distributionUrl -replace '-bin\.[^.]*$',"-windows-amd64.zip" + $MVN_CMD = "mvnd.cmd" + break + } + default { + $USE_MVND = $false + $MVN_CMD = $script -replace '^mvnw','mvn' + break + } +} + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ +if ($env:MVNW_REPOURL) { + # mvnd archives live under /maven/mvnd/, plain Maven under /org/apache/maven/ (same mapping as the POSIX mvnw script) + $MVNW_REPO_PATTERN = if ($USE_MVND) { "/maven/mvnd/" } else { "/org/apache/maven/" } + # build the regex as one string: in '^.*'+$x,'' the comma binds tighter than +, so the prefix was never stripped + $distributionUrl = "$env:MVNW_REPOURL$MVNW_REPO_PATTERN$($distributionUrl -replace "^.*$MVNW_REPO_PATTERN",'')" +} +$distributionUrlName = $distributionUrl -replace '^.*/','' +$distributionUrlNameMain = $distributionUrlName -replace '\.[^.]*$','' -replace '-bin$','' +$MAVEN_HOME_PARENT = "$HOME/.m2/wrapper/dists/$distributionUrlNameMain" +if ($env:MAVEN_USER_HOME) { + $MAVEN_HOME_PARENT = "$env:MAVEN_USER_HOME/wrapper/dists/$distributionUrlNameMain" +} +$MAVEN_HOME_NAME = ([System.Security.Cryptography.MD5]::Create().ComputeHash([byte[]][char[]]$distributionUrl) | ForEach-Object {$_.ToString("x2")}) -join '' +$MAVEN_HOME = "$MAVEN_HOME_PARENT/$MAVEN_HOME_NAME" + +if (Test-Path -Path "$MAVEN_HOME" -PathType Container) { + Write-Verbose "found existing MAVEN_HOME at $MAVEN_HOME" + Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" + exit $? +} + +if (! 
$distributionUrlNameMain -or ($distributionUrlName -eq $distributionUrlNameMain)) { + Write-Error "distributionUrl is not valid, must end with *-bin.zip, but found $distributionUrl" +} + +# prepare tmp dir +$TMP_DOWNLOAD_DIR_HOLDER = New-TemporaryFile +$TMP_DOWNLOAD_DIR = New-Item -Itemtype Directory -Path "$TMP_DOWNLOAD_DIR_HOLDER.dir" +$TMP_DOWNLOAD_DIR_HOLDER.Delete() | Out-Null +trap { + if ($TMP_DOWNLOAD_DIR.Exists) { + try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } + catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } + } +} + +New-Item -Itemtype Directory -Path "$MAVEN_HOME_PARENT" -Force | Out-Null + +# Download and Install Apache Maven +Write-Verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +Write-Verbose "Downloading from: $distributionUrl" +Write-Verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +$webclient = New-Object System.Net.WebClient +if ($env:MVNW_USERNAME -and $env:MVNW_PASSWORD) { + $webclient.Credentials = New-Object System.Net.NetworkCredential($env:MVNW_USERNAME, $env:MVNW_PASSWORD) +} +[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 +$webclient.DownloadFile($distributionUrl, "$TMP_DOWNLOAD_DIR/$distributionUrlName") | Out-Null + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +$distributionSha256Sum = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionSha256Sum +if ($distributionSha256Sum) { + if ($USE_MVND) { + Write-Error "Checksum validation is not supported for maven-mvnd. `nPlease disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
+ } + Import-Module $PSHOME\Modules\Microsoft.PowerShell.Utility -Function Get-FileHash + if ((Get-FileHash "$TMP_DOWNLOAD_DIR/$distributionUrlName" -Algorithm SHA256).Hash.ToLower() -ne $distributionSha256Sum) { + Write-Error "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised. If you updated your Maven version, you need to update the specified distributionSha256Sum property." + } +} + +# unzip and move +Expand-Archive "$TMP_DOWNLOAD_DIR/$distributionUrlName" -DestinationPath "$TMP_DOWNLOAD_DIR" | Out-Null +Rename-Item -Path "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" -NewName $MAVEN_HOME_NAME | Out-Null +try { + Move-Item -Path "$TMP_DOWNLOAD_DIR/$MAVEN_HOME_NAME" -Destination $MAVEN_HOME_PARENT | Out-Null +} catch { + if (! (Test-Path -Path "$MAVEN_HOME" -PathType Container)) { + Write-Error "fail to move MAVEN_HOME" + } +} finally { + try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } + catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } +} + +Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" diff --git a/adventures/planned/00-blind-by-design/expert/pom.xml b/adventures/planned/00-blind-by-design/expert/pom.xml new file mode 100644 index 00000000..69455a5b --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/pom.xml @@ -0,0 +1,101 @@ + + + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 4.0.6 + + + dev.openfeature.demo.java + demo + 0.0.1-SNAPSHOT + demo + Blind by Design - Expert: pharma trial dispenser + + 21 + 1.48.0 + 2.14.0 + + + + + io.opentelemetry + opentelemetry-bom + ${opentelemetry.version} + pom + import + + + io.opentelemetry.instrumentation + opentelemetry-instrumentation-bom + ${opentelemetry.instrumentation.version} + pom + import + + + + + + org.springframework.boot + spring-boot-starter-actuator + + + org.springframework.boot + spring-boot-starter-web + + + org.springframework.boot + spring-boot-starter-test + test + + + + + dev.openfeature + sdk + 
1.14.2 + + + dev.openfeature.contrib.providers + flagd + 0.11.8 + + + + + dev.openfeature.contrib.hooks + otel + 3.2.1 + + + + + io.opentelemetry + opentelemetry-api + + + io.opentelemetry + opentelemetry-sdk + + + io.opentelemetry + opentelemetry-exporter-otlp + + + io.opentelemetry + opentelemetry-sdk-extension-autoconfigure + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + + diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/AuditHook.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/AuditHook.java new file mode 100644 index 00000000..ad1ce2a4 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/AuditHook.java @@ -0,0 +1,53 @@ +package dev.openfeature.demo.java.demo; + +import dev.openfeature.sdk.EvaluationContext; +import dev.openfeature.sdk.FlagEvaluationDetails; +import dev.openfeature.sdk.Hook; +import dev.openfeature.sdk.HookContext; +import dev.openfeature.sdk.Value; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Map; + +/** + * Audit-log hook carried over from the Intermediate level. Writes one line + * per evaluation tagged {@code [AUDIT]}, with the cohort attributes the lab + * director cares about. Variants of {@code clouded} log at {@code WARN} so + * the safety officer can grep for improper-dosing follow-ups. + * + *

This is the durable, weeks-from-now archive view. The Phase 3 task adds + * a {@code ContextSpanHook} for real-time correlation in Tempo — both hooks + * stay registered; they just serve different downstreams.

+ */ +public class AuditHook implements Hook { + + private static final Logger LOG = LoggerFactory.getLogger(AuditHook.class); + + /** Allowlist of context attributes safe to drop into the audit log. */ + private static final List AUDITED = List.of("species", "country", "dose"); + + @Override + public void after(HookContext ctx, FlagEvaluationDetails details, Map hints) { + StringBuilder ctxLine = new StringBuilder(); + EvaluationContext ec = ctx.getCtx(); + for (String key : AUDITED) { + Value v = ec != null ? ec.getValue(key) : null; + ctxLine.append(' ').append(key).append('=').append(v != null ? v.asString() : "(absent)"); + } + String message = String.format("[AUDIT] flag=%s variant=%s reason=%s%s", + ctx.getFlagKey(), details.getVariant(), details.getReason(), ctxLine); + + if ("clouded".equals(details.getVariant())) { + LOG.warn("{} -- improper dosing or off-protocol cohort, follow-up required", message); + } else { + LOG.info("{}", message); + } + } + + @Override + public void error(HookContext ctx, Exception err, Map hints) { + LOG.warn("[AUDIT] flag evaluation error flag={} err={}", ctx.getFlagKey(), err.toString()); + } +} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Laboratory.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Laboratory.java new file mode 100644 index 00000000..33c27c39 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Laboratory.java @@ -0,0 +1,13 @@ +package dev.openfeature.demo.java.demo; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +@SpringBootApplication +public class Laboratory { + + public static void main(String[] args) { + SpringApplication.run(Laboratory.class, args); + } + +} diff --git 
a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java new file mode 100644 index 00000000..361a7005 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java @@ -0,0 +1,74 @@ +package dev.openfeature.demo.java.demo; + +import dev.openfeature.contrib.hooks.otel.TracesHook; +import dev.openfeature.contrib.providers.flagd.Config; +import dev.openfeature.contrib.providers.flagd.FlagdOptions; +import dev.openfeature.contrib.providers.flagd.FlagdProvider; +import dev.openfeature.sdk.ImmutableContext; +import dev.openfeature.sdk.OpenFeatureAPI; +import dev.openfeature.sdk.Value; +import jakarta.annotation.PostConstruct; +import org.springframework.context.annotation.Configuration; +import org.springframework.web.servlet.config.annotation.InterceptorRegistry; +import org.springframework.web.servlet.config.annotation.WebMvcConfigurer; + +import java.util.HashMap; +import java.util.Optional; + +/** + * Wires the OpenFeature client to a remote flagd container ({@code Resolver.RPC}, + * default host {@code localhost:8013}) and registers the cross-cutting hooks. + * + *

Half-wired on purpose: the {@link TracesHook} reads the current span from + * the global tracer provider, so flag evaluations show up in Tempo as soon as + * the OpenTelemetry SDK is initialized. The matching {@code MetricsHook} is NOT + * registered here — the meter provider is not exporting yet and the + * "Fun With Flags" dashboard panels in Grafana stay dark. Finishing the wiring + * is the participant's first task in this level.

+ */ +@Configuration +public class OpenFeatureConfig implements WebMvcConfigurer { + + @PostConstruct + public void initProvider() { + OpenFeatureAPI api = OpenFeatureAPI.getInstance(); + FlagdOptions flagdOptions = FlagdOptions.builder() + .resolverType(Config.Resolver.RPC) + .build(); + + api.setProviderAndWait(new FlagdProvider(flagdOptions)); + + String country = Optional.ofNullable(System.getenv("COUNTRY")).orElse(""); + HashMap attributes = new HashMap<>(); + attributes.put("country", new Value(country)); + ImmutableContext evaluationContext = new ImmutableContext(attributes); + api.setEvaluationContext(evaluationContext); + + api.addHooks(new AuditHook()); + api.addHooks(new TracesHook()); + // TODO Phase 3 task #1: register the matching MetricsHook here once + // the meter provider has been wired up in OpenTelemetryConfig. Without + // it the Grafana feature-flag dashboard cannot draw its panels. + // + // TODO Phase 3 task #2: write a small ContextSpanHook that copies the + // merged evaluation context attributes (species, country, dose) onto the + // active OpenTelemetry span โ€” for example as + // `feature_flag.context.` โ€” and register it here. Lets you search + // Tempo for `feature_flag.context.dose=underdose` and see, on the same + // span, which `feature_flag.variant` the lab recorded. Closes the + // loop between why an outcome happened and what the chart knew at + // the time. + // + // โš ๏ธ Use a fixed allowlist of keys; do NOT iterate over the whole + // evaluation context. The merged context routinely carries the + // OpenFeature targetingKey (often a user id) and, in real apps, things + // like email or account identifiers โ€” span attributes are retained + // for days in Tempo/Prometheus and are hard to redact after the fact. + // See https://opentelemetry.io/docs/security/ for the broader rule. 
+ } + + @Override + public void addInterceptors(InterceptorRegistry registry) { + registry.addInterceptor(new SpeciesInterceptor()); + } +} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java new file mode 100644 index 00000000..80f21a47 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java @@ -0,0 +1,73 @@ +package dev.openfeature.demo.java.demo; + +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; +import jakarta.annotation.PreDestroy; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +/** + * Half-wired OpenTelemetry SDK. + * + *

Traces ARE exported to the LGTM stack via OTLP/gRPC at + * {@code http://localhost:4317}. The {@code TracesHook} registered in + * {@link OpenFeatureConfig} attaches every flag evaluation as a span event + * inside the active HTTP request span — open Grafana → Explore → Tempo and + * search for service {@code fun-with-flags-java-spring} to see them.

+ * + *

Metrics are NOT exported yet. The autoconfigure module is told + * {@code otel.metrics.exporter=none}, which means the {@code SdkMeterProvider} + * either is not created or has no exporter attached, so the Grafana + * "Fun With Flags — Feature Flag Metrics" dashboard stays empty. To finish + * Phase 3 the participant must:

+ * + *
    + *
  1. Switch {@code otel.metrics.exporter} to {@code otlp} and set a + * reasonable {@code otel.metric.export.interval} so Mimir receives + * evaluation metrics.
  2. + *
  3. Register the matching + * {@code dev.openfeature.contrib.hooks.otel.MetricsHook} on the + * OpenFeature API in {@link OpenFeatureConfig#initProvider()}.
  4. + *
+ */ +@Configuration +public class OpenTelemetryConfig { + + private AutoConfiguredOpenTelemetrySdk autoConfigured; + + @Bean + public OpenTelemetry openTelemetry( + @Value("${otel.service.name:fun-with-flags-java-spring}") String serviceName, + @Value("${otel.exporter.otlp.endpoint:http://localhost:4317}") String otlpEndpoint, + @Value("${otel.exporter.otlp.protocol:grpc}") String otlpProtocol, + @Value("${otel.traces.exporter:otlp}") String tracesExporter, + // Phase 3 TODO: flip this to "otlp" so the meter provider exports. + @Value("${otel.metrics.exporter:none}") String metricsExporter, + @Value("${otel.logs.exporter:none}") String logsExporter) { + // Expose configured values via system properties so the SDK + // autoconfigure module picks them up regardless of how the app + // was launched. + System.setProperty("otel.service.name", serviceName); + System.setProperty("otel.exporter.otlp.endpoint", otlpEndpoint); + System.setProperty("otel.exporter.otlp.protocol", otlpProtocol); + System.setProperty("otel.traces.exporter", tracesExporter); + System.setProperty("otel.metrics.exporter", metricsExporter); + System.setProperty("otel.logs.exporter", logsExporter); + // Phase 3 TODO: once metrics are flipped on, surface a sensible + // export interval here, e.g. 10000 ms, so the dashboard updates + // within ten seconds of new traffic. 
+ + autoConfigured = AutoConfiguredOpenTelemetrySdk.builder() + .setResultAsGlobal() + .build(); + return autoConfigured.getOpenTelemetrySdk(); + } + + @PreDestroy + public void shutdown() { + if (autoConfigured != null) { + autoConfigured.getOpenTelemetrySdk().close(); + } + } +} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/SpeciesInterceptor.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/SpeciesInterceptor.java new file mode 100644 index 00000000..a1020ebe --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/SpeciesInterceptor.java @@ -0,0 +1,45 @@ +package dev.openfeature.demo.java.demo; + +import dev.openfeature.sdk.ImmutableContext; +import dev.openfeature.sdk.OpenFeatureAPI; +import dev.openfeature.sdk.ThreadLocalTransactionContextPropagator; +import dev.openfeature.sdk.Value; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import org.springframework.web.servlet.HandlerInterceptor; + +import java.util.HashMap; + +/** + * Per-request OpenFeature transaction context. Reads {@code species} (drives the + * species targeting branch on {@code vision_state}) and {@code userId} (used as + * the OpenFeature targetingKey, so the fractional rollout on + * {@code vision_amplifier_v2} is sticky per caller). + */ +public class SpeciesInterceptor implements HandlerInterceptor { + + @Override + public boolean preHandle(HttpServletRequest request, HttpServletResponse response, Object handler) throws Exception { + String species = request.getParameter("species"); + String userId = request.getParameter("userId"); + HashMap attributes = new HashMap<>(); + if (species != null) { + attributes.put("species", new Value(species)); + } + ImmutableContext evaluationContext = userId != null + ? 
new ImmutableContext(userId, attributes) + : new ImmutableContext(attributes); + OpenFeatureAPI.getInstance().setTransactionContext(evaluationContext); + return HandlerInterceptor.super.preHandle(request, response, handler); + } + + @Override + public void afterCompletion(HttpServletRequest request, HttpServletResponse response, Object handler, Exception ex) throws Exception { + OpenFeatureAPI.getInstance().setTransactionContext(new ImmutableContext()); + HandlerInterceptor.super.afterCompletion(request, response, handler, ex); + } + + static { + OpenFeatureAPI.getInstance().setTransactionContextPropagator(new ThreadLocalTransactionContextPropagator()); + } +} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Trial.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Trial.java new file mode 100644 index 00000000..f5c79d1d --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/Trial.java @@ -0,0 +1,59 @@ +package dev.openfeature.demo.java.demo; + +import dev.openfeature.sdk.Client; +import dev.openfeature.sdk.ImmutableContext; +import dev.openfeature.sdk.OpenFeatureAPI; +import dev.openfeature.sdk.Value; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +import java.util.HashMap; +import java.util.concurrent.ThreadLocalRandom; + +/** + * Phase 3 lab. Reads the {@code vision_amplifier_v2} flag and, when the + * fractional rollout puts the caller into the {@code on} bucket, executes the + * deliberately bad new formulation: 200ms slower, 10% chance of a 5xx. The + * baseline {@code vision_state} flag still drives the response body. + * + *

The {@code vision_state} evaluation also passes a {@code dose} attribute as + * invocation context — the fraction of clinical staff who under- or over-dose + * subjects shows up here. When the request does not supply one, 60% of subjects get + * {@code "standard"}, 30% get {@code "underdose"} and 10% get {@code "overdose"}; + * the latter two override the cohort targeting and yield {@code clouded}.

+ */ +@RestController +public class Trial { + + @GetMapping("/") + public ResponseEntity observeSubject(@RequestParam(required = false) String dose) { + Client client = OpenFeatureAPI.getInstance().getClient(); + boolean newAlgo = client.getBooleanValue("vision_amplifier_v2", false); + if (newAlgo) { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + if (ThreadLocalRandom.current().nextDouble() < 0.1) { + return ResponseEntity.status(500).body("simulated failure in vision_amplifier_v2"); + } + } + + String resolvedDose = (dose != null) ? dose : pickDose(); + HashMap invocationCtx = new HashMap<>(); + invocationCtx.put("dose", new Value(resolvedDose)); + + return ResponseEntity.ok( + client.getStringDetails("vision_state", "untreated", new ImmutableContext(invocationCtx))); + } + + private static String pickDose() { + double r = ThreadLocalRandom.current().nextDouble(); + if (r < 0.60) return "standard"; + if (r < 0.90) return "underdose"; + return "overdose"; + } +} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties b/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties new file mode 100644 index 00000000..186c82e1 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties @@ -0,0 +1,11 @@ +spring.application.name=demo + +# Phase 3 OpenTelemetry configuration (OTLP gRPC exporter to the local LGTM stack) +otel.exporter.otlp.endpoint=http://localhost:4317 +otel.exporter.otlp.protocol=grpc +otel.traces.exporter=otlp +# TODO Phase 3 task: flip this from "none" to "otlp" so flag-evaluation +# metrics start exporting to the LGTM stack. 
+otel.metrics.exporter=none +otel.logs.exporter=none +otel.service.name=fun-with-flags-java-spring diff --git a/adventures/planned/00-blind-by-design/expert/verify.sh b/adventures/planned/00-blind-by-design/expert/verify.sh new file mode 100755 index 00000000..e5b92de0 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/verify.sh @@ -0,0 +1,227 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Load shared libraries +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "$SCRIPT_DIR/../../../../lib/scripts/loader.sh" + +OBJECTIVE="By the end of this level, you should have: +- The OpenTelemetry meter provider wired and the OpenFeature MetricsHook registered +- At least one trace for service 'fun-with-flags-java-spring' visible in Tempo +- The 'feature_flag_evaluation_requests_total' counter non-zero in Prometheus +- The 'vision_amplifier_v2' fractional rollout flipped back to 100% off / 0% on +- HTTP 5xx rate over the last minute below 1%" + +DOCS_URL="https://dynatrace-oss.github.io/open-ecosystem-challenges/00-blind-by-design/expert" + +print_header \ + 'Adventure 00: Blind by Design' \ + '๐Ÿ”ด Expert: Phase 3 โ€” read the chart' \ + 'Verification' + +check_prerequisites curl jq + +print_sub_header "Running verification checks..." + +TESTS_PASSED=0 +TESTS_FAILED=0 +FAILED_CHECKS=() + +APP_URL="http://localhost:8080" +FLAGD_HTTP="http://localhost:8013" +PROMETHEUS_URL="http://localhost:9090" +TEMPO_URL="http://localhost:3200" +GRAFANA_URL="http://localhost:3000" + +# ---- 1. 
App reachable ------------------------------------------------------ +print_test_section "Checking lab reachability" +if curl -fsS --max-time 5 "$APP_URL/" >/dev/null 2>&1; then + print_info_indent "โœ“ Spring Boot lab reachable at $APP_URL" + TESTS_PASSED=$((TESTS_PASSED + 1)) +else + print_error_indent "Spring Boot lab is not reachable at $APP_URL" + print_hint "Start the app with: ./mvnw spring-boot:run" + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("app_reachable") +fi +print_new_line + +# ---- 2. flagd reachable --------------------------------------------------- +print_test_section "Checking flagd reachability" +if curl -fsS --max-time 5 -X POST "$FLAGD_HTTP/flagd.evaluation.v1.Service/ResolveBoolean" \ + -H 'Content-Type: application/json' \ + -d '{"flagKey":"loadgen_active","context":{}}' >/dev/null 2>&1; then + print_info_indent "โœ“ flagd HTTP eval API reachable at $FLAGD_HTTP" + TESTS_PASSED=$((TESTS_PASSED + 1)) +else + print_error_indent "flagd HTTP API is not reachable at $FLAGD_HTTP" + print_hint "flagd is a sibling devcontainer service. Reopen the Codespace if it is not running." + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("flagd_reachable") +fi +print_new_line + +# ---- 3. LGTM stack reachable --------------------------------------------- +print_test_section "Checking Grafana LGTM stack reachability" +if curl -fsS --max-time 5 "$GRAFANA_URL/api/health" >/dev/null 2>&1; then + print_info_indent "โœ“ Grafana reachable at $GRAFANA_URL" + TESTS_PASSED=$((TESTS_PASSED + 1)) +else + print_error_indent "Grafana is not reachable at $GRAFANA_URL" + print_hint "The LGTM stack is a sibling devcontainer service (lgtm). Reopen the Codespace if it is not running." + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("lgtm_reachable") +fi +print_new_line + +# ---- 4. 
vision_amplifier_v2 rolled back ----------------------------------- +print_test_section "Checking vision_amplifier_v2 rollback" +ROLLOUT_RESPONSE=$(curl -fsS --max-time 5 -X POST \ + "$FLAGD_HTTP/flagd.evaluation.v1.Service/ResolveBoolean" \ + -H 'Content-Type: application/json' \ + -d '{"flagKey":"vision_amplifier_v2","context":{"targetingKey":"verify-probe-user"}}' 2>/dev/null || echo "") + +if [[ -z "$ROLLOUT_RESPONSE" ]]; then + print_error_indent "Could not query vision_amplifier_v2 from flagd" + print_hint "Make sure the flagd container is running and flags.json has vision_amplifier_v2 defined." + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("vision_amplifier_v2_rollback") +else + ROLLOUT_VALUE=$(echo "$ROLLOUT_RESPONSE" | jq -r '.value // empty') + if [[ "$ROLLOUT_VALUE" == "false" ]]; then + print_info_indent "โœ“ vision_amplifier_v2 evaluates to false (rollout has been rolled back)" + TESTS_PASSED=$((TESTS_PASSED + 1)) + else + print_error_indent "vision_amplifier_v2 still resolves to '$ROLLOUT_VALUE' for the probe user" + print_hint "Edit flags.json: flip the fractional bucket so 'off' is 100 and 'on' is 0, save, and flagd will pick it up." + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("vision_amplifier_v2_rollback") + fi +fi +print_new_line + +# ---- 5. Prometheus has feature_flag_evaluation_requests_total ---------- +print_test_section "Checking feature_flag metrics in Prometheus" +PROM_QUERY='feature_flag_evaluation_requests_total' +PROM_RESPONSE=$(curl -fsS --max-time 5 -G "$PROMETHEUS_URL/api/v1/query" \ + --data-urlencode "query=$PROM_QUERY" 2>/dev/null || echo "") + +if [[ -z "$PROM_RESPONSE" ]]; then + print_error_indent "Could not query Prometheus at $PROMETHEUS_URL" + print_hint "The grafana/otel-lgtm container exposes Prometheus on port 9090. If port 9090 is not forwarded, the lgtm sibling container has not started โ€” reopen the Codespace." 
+  TESTS_FAILED=$((TESTS_FAILED + 1))
+  FAILED_CHECKS+=("prometheus_metrics")
+else
+  RESULT_COUNT=$(echo "$PROM_RESPONSE" | jq '.data.result | length // 0')
+  TOTAL=$(echo "$PROM_RESPONSE" | jq -r '[.data.result[]?.value[1] | tonumber] | add // 0')
+  # `add // 0` is a tiny safeguard if the array is empty.
+  if [[ "$RESULT_COUNT" -gt 0 ]] && awk -v v="$TOTAL" 'BEGIN { exit !(v+0 > 0) }'; then
+    print_info_indent "✓ feature_flag_evaluation_requests_total is non-zero (sum=$TOTAL)"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+  else
+    print_error_indent "feature_flag_evaluation_requests_total is missing or zero"
+    print_hint "Wire the OpenTelemetry meter provider AND register MetricsHook in OpenFeatureConfig.initProvider(). Then drive traffic by flipping loadgen_active to 'on'."
+    TESTS_FAILED=$((TESTS_FAILED + 1))
+    FAILED_CHECKS+=("prometheus_metrics")
+  fi
+fi
+print_new_line
+
+# ---- 6. Tempo has at least one trace for the service -------------------
+print_test_section "Checking traces in Tempo"
+TEMPO_RESPONSE=$(curl -fsS --max-time 5 -G "$TEMPO_URL/api/search" \
+  --data-urlencode 'tags=service.name=fun-with-flags-java-spring' \
+  --data-urlencode 'limit=20' 2>/dev/null || echo "")
+
+if [[ -z "$TEMPO_RESPONSE" ]]; then
+  print_error_indent "Could not query Tempo at $TEMPO_URL"
+  print_hint "The grafana/otel-lgtm container exposes Tempo on port 3200. If port 3200 is not forwarded, the lgtm sibling container has not started — reopen the Codespace."
+  TESTS_FAILED=$((TESTS_FAILED + 1))
+  FAILED_CHECKS+=("tempo_traces")
+else
+  TRACE_COUNT=$(echo "$TEMPO_RESPONSE" | jq '.traces | length // 0')
+  if [[ "$TRACE_COUNT" -gt 0 ]]; then
+    print_info_indent "✓ Tempo has $TRACE_COUNT trace(s) for service 'fun-with-flags-java-spring'"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+  else
+    print_error_indent "Tempo has no traces for service 'fun-with-flags-java-spring'"
+    print_hint "Send some traffic: curl $APP_URL/?userId=demo and wait a few seconds for the exporter to flush."
+    TESTS_FAILED=$((TESTS_FAILED + 1))
+    FAILED_CHECKS+=("tempo_traces")
+  fi
+fi
+print_new_line
+
+# ---- 6b. Tempo spans carry the dose context attribute ------------------
+# Generate a deterministic underdose request, give the exporter a moment to
+# flush, then query Tempo for spans with feature_flag.context.dose. If the
+# attribute is missing the participant has not registered the
+# ContextSpanHook (or it is not reading the merged eval context).
+print_test_section "Checking flag-context attributes on Tempo spans"
+curl -s --max-time 5 "$APP_URL/?dose=underdose" >/dev/null 2>&1 || true  # probe the same base URL the reachability check uses
+sleep 6 # OTel batch span processor flush window
+DOSE_TEMPO=$(curl -fsS --max-time 5 -G "$TEMPO_URL/api/search" \
+  --data-urlencode 'tags=feature_flag.context.dose=underdose' \
+  --data-urlencode 'limit=5' 2>/dev/null || echo "")
+
+if [[ -z "$DOSE_TEMPO" ]]; then
+  print_error_indent "Could not query Tempo for context attributes"
+  TESTS_FAILED=$((TESTS_FAILED + 1))
+  FAILED_CHECKS+=("tempo_context")
+else
+  DOSE_COUNT=$(echo "$DOSE_TEMPO" | jq '.traces | length // 0')
+  if [[ "$DOSE_COUNT" -gt 0 ]]; then
+    print_info_indent "✓ Tempo has $DOSE_COUNT span(s) tagged feature_flag.context.dose=underdose"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+  else
+    print_error_indent "No spans with feature_flag.context.dose=underdose found in Tempo"
+    print_hint "Did you register the ContextSpanHook that copies merged-eval-context attrs onto
Span.current()?"
+    TESTS_FAILED=$((TESTS_FAILED + 1))
+    FAILED_CHECKS+=("tempo_context")
+  fi
+fi
+print_new_line
+
+# ---- 7. HTTP 5xx rate under threshold ----------------------------------
+print_test_section "Checking HTTP 5xx error rate (last 1m)"
+ERROR_QUERY='sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[1m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count[1m])), 1e-9)'
+ERROR_RESPONSE=$(curl -fsS --max-time 5 -G "$PROMETHEUS_URL/api/v1/query" \
+  --data-urlencode "query=$ERROR_QUERY" 2>/dev/null || echo "")
+
+if [[ -z "$ERROR_RESPONSE" ]] || [[ "$(echo "$ERROR_RESPONSE" | jq -r '.data.result | length' 2>/dev/null)" == "0" ]]; then
+  # Fallback: try the older Spring metric name. An empty result set counts as a miss too, not only a failed curl; if the retry itself fails, keep the first answer.
+  ERROR_QUERY_ALT='sum(rate(http_server_requests_seconds_count{status=~"5.."}[1m])) / clamp_min(sum(rate(http_server_requests_seconds_count[1m])), 1e-9)'
+  ERROR_RESPONSE=$(curl -fsS --max-time 5 -G "$PROMETHEUS_URL/api/v1/query" \
+    --data-urlencode "query=$ERROR_QUERY_ALT" 2>/dev/null || echo "$ERROR_RESPONSE")
+fi
+
+if [[ -z "$ERROR_RESPONSE" ]]; then
+  print_error_indent "Could not query Prometheus for HTTP error rate"
+  TESTS_FAILED=$((TESTS_FAILED + 1))
+  FAILED_CHECKS+=("error_rate")
+else
+  ERROR_RATE=$(echo "$ERROR_RESPONSE" | jq -r '.data.result[0].value[1] // "0"')
+  # Treat NaN (no requests at all) as a pass — there's no traffic to fail on.
+  if [[ "$ERROR_RATE" == "NaN" ]]; then
+    print_info_indent "✓ No traffic in the last minute — error rate not meaningful (treated as pass)"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+  elif awk -v v="$ERROR_RATE" 'BEGIN { exit !(v+0 < 0.01) }'; then
+    PERCENT=$(awk -v v="$ERROR_RATE" 'BEGIN { printf "%.2f", v*100 }')
+    print_info_indent "✓ HTTP 5xx rate is ${PERCENT}% (< 1%)"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+  else
+    PERCENT=$(awk -v v="$ERROR_RATE" 'BEGIN { printf "%.2f", v*100 }')
+    print_error_indent "HTTP 5xx rate is ${PERCENT}% (>= 1%)"
+    print_hint "The 'on' bucket of vision_amplifier_v2 throws 5xx 10% of the time. Roll the rollout back to 100% off."
+ TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_CHECKS+=("error_rate") + fi +fi +print_new_line + +print_verification_summary "Phase 3 โ€” read the chart" "$DOCS_URL" "$OBJECTIVE" + +if [[ $TESTS_FAILED -ne 0 ]]; then + exit 1 +fi From 659887332028d1d395f92376494cfa2e8c9a0f1b Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Thu, 30 Apr 2026 15:15:48 +0200 Subject: [PATCH 2/8] review: address PR #42 feedback for Expert level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rename '๐Ÿงช The story (optional)' โ†’ '๐Ÿช The Backstory' - pin all docker images: flagd v0.15.4, otel-lgtm 0.26.0, k6 1.7.1 - devcontainer: drop flagd ports (8013/8014/8015/8016) from forwardPorts; the LGTM-stack ports (3000/9090/3200/4317/4318) and :8080 stay forwarded as before - drop the published flagd ports from docker-compose โ€” flagd reaches the lab on the docker-internal network as `flagd:8013` - drop the 'Solution Walkthrough' section and the inline solutions/expert.md cross-link (solutions are unpublished pre-deadline) - replace the verify-script blurb with the Adventure 03 template - 'Access the UIs / flagd' subsection: explain flagd is internal-only now that the ports aren't forwarded - verify.sh: lean on test_http_endpoint for the reachability check; point FLAGD_HTTP at flagd:8013 (docker network DNS) since the host no longer forwards :8013 Refs: PR #42 review by @KatharinaSick Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../devcontainer.json | 8 +--- .../docker-compose.yml | 14 +++---- .../planned/00-blind-by-design/docs/expert.md | 37 +++++++------------ .../00-blind-by-design/expert/verify.sh | 15 ++++---- 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json index 44fb84e5..70cc431c 100644 --- a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json +++ 
b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json @@ -22,18 +22,14 @@ ] } }, - "forwardPorts": [8080, 3000, 4317, 4318, 9090, 3200, 8013, 8014, 8015, 8016], + "forwardPorts": [8080, 3000, 4317, 4318, 9090, 3200], "portsAttributes": { "8080": { "label": "Spring Boot lab", "onAutoForward": "notify" }, "3000": { "label": "Grafana", "onAutoForward": "notify" }, "4317": { "label": "OTLP gRPC", "onAutoForward": "ignore" }, "4318": { "label": "OTLP HTTP", "onAutoForward": "ignore" }, "9090": { "label": "Prometheus", "onAutoForward": "ignore" }, - "3200": { "label": "Tempo HTTP API", "onAutoForward": "ignore" }, - "8013": { "label": "flagd gRPC eval", "onAutoForward": "ignore" }, - "8014": { "label": "flagd management/metrics", "onAutoForward": "ignore" }, - "8015": { "label": "flagd sync (IN_PROCESS)", "onAutoForward": "ignore" }, - "8016": { "label": "flagd OFREP", "onAutoForward": "ignore" } + "3200": { "label": "Tempo HTTP API", "onAutoForward": "ignore" } }, "otherPortsAttributes": { "onAutoForward": "ignore" diff --git a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml index 842b3fce..cc866c6b 100644 --- a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml +++ b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml @@ -26,21 +26,19 @@ services: - COUNTRY=de flagd: - image: ghcr.io/open-feature/flagd:latest + image: ghcr.io/open-feature/flagd:v0.15.4 volumes: - ../..:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}:ro command: - start - --uri - file:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}/adventures/planned/00-blind-by-design/expert/flags.json - ports: - - "8013:8013" - - "8014:8014" - - "8015:8015" - - "8016:8016" + # No `ports:` block โ€” the lab and loadgen reach flagd on the + # docker-internal network as `flagd:8013`. Forwarding the flagd + # ports onto the Codespace host is not needed to play the level. 
lgtm: - image: grafana/otel-lgtm:latest + image: grafana/otel-lgtm:0.26.0 ports: - "3000:3000" # Grafana UI (admin / admin) - "4317:4317" # OTLP gRPC @@ -55,7 +53,7 @@ services: - ../../adventures/planned/00-blind-by-design/expert/dashboards:/otel-lgtm/grafana/dashboards:ro loadgen: - image: grafana/k6:latest + image: grafana/k6:1.7.1 command: ["run", "--quiet", "/scripts/script.js"] volumes: - ../../adventures/planned/00-blind-by-design/expert/loadgen/k6:/scripts:ro diff --git a/adventures/planned/00-blind-by-design/docs/expert.md b/adventures/planned/00-blind-by-design/docs/expert.md index 4b5334f5..6002370e 100644 --- a/adventures/planned/00-blind-by-design/docs/expert.md +++ b/adventures/planned/00-blind-by-design/docs/expert.md @@ -10,7 +10,7 @@ Spans are already flowing into Tempo from the OpenFeature `TracesHook`, but the The level passes when (a) `feature_flag_evaluation_requests_total` is non-zero in Prometheus, (b) Tempo spans for `fun-with-flags-java-spring` carry `feature_flag.context.*` attributes, (c) `vision_amplifier_v2` is rolled back to 100% off, and (d) the HTTP 5xx rate over the last minute is below 1%. -## ๐Ÿงช The story (optional) +## ๐Ÿช The Backstory The trial just went wide. Phase 3 of the new vision amplifier โ€” `vision_amplifier_v2` โ€” was approved for the full cohort yesterday morning. The promise was straightforward: subjects emerge with sharper eyesight than they walked in with. By mid-afternoon the audit log was screaming. Subjects were stabilising 200ms slower, and roughly one in ten of them was emerging **blind** โ€” containment failure recorded as an HTTP 500. The lab director pulled up the **Feature Flag Metrics** dashboard expecting to triage visually. The dashboard was dark. Someone had wired up traces but never finished the metrics half. There is no chart to read. The lab is studying eyesight and the lab itself cannot see. 
@@ -22,16 +22,6 @@ Coming Soon > โ„น๏ธ You can still complete the challenge after this date, but points will only > be awarded for submissions before the deadline. -## ๐Ÿ“ Solution Walkthrough - -> โš ๏ธ **Spoiler Alert:** The following walkthrough contains the full solution -> to the challenge. We encourage you to try solving it on your own first. -> Consider coming back here only if you get stuck or want to check your -> approach. - -If you get stuck, follow the -[step-by-step solution walkthrough](./solutions/expert.md). - ## ๐Ÿ’ฌ Join the discussion Share your solutions and questions in the @@ -113,8 +103,6 @@ The `before` hook receives a `HookContext` whose `getCtx()` returns the **merged Register it next to `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. Now every flag evaluation tags its parent span with the context attributes the lab cares about. In Tempo: **Search โ†’ Service: fun-with-flags-java-spring โ†’ +Tag โ†’ `feature_flag.context.dose=underdose`** lights up exactly the requests where a tech mis-dosed, with the resolved variant on the same span event. -The full implementation, including imports and a couple of subtle correctness notes, is in [solutions/expert.md](./solutions/expert.md). - > โš ๏ธ **Allowlist, don't iterate.** Use a fixed allowlist for the same reason the `AuditHook` does โ€” see [Intermediate's PII note](./intermediate.md#3c-an-audithook) and the [OpenTelemetry security guidance](https://opentelemetry.io/docs/security/). ### `flagd` `fractional` operation + `targetingKey` @@ -149,7 +137,7 @@ Your Codespace comes pre-configured with the following tools: - A browser pointed at [`http://localhost:3000`](http://localhost:3000) for Grafana (admin / admin) - [`jq`](https://jqlang.github.io/jq/): Pretty-print and filter JSON from `curl` -flagd, the Grafana LGTM stack, and the k6 loadgen are **sibling devcontainer services** โ€” they come up automatically when the Codespace boots. There is no `docker compose up` step. 
Inside the workspace they are reachable as `flagd`, `lgtm`, and `loadgen`; on the host they are forwarded to the same `localhost:NNNN` ports that `verify.sh` and the docs assume. +flagd, the Grafana LGTM stack, and the k6 loadgen are **sibling devcontainer services** โ€” they come up automatically when the Codespace boots. There is no `docker compose up` step. Inside the workspace they are reachable as `flagd`, `lgtm`, and `loadgen`. The Grafana / Prometheus / Tempo / OTLP ports on `lgtm` are also forwarded onto the Codespace host so you can click them in the Ports tab; flagd stays on the docker-internal network only. ## โœ… How to Play @@ -206,7 +194,7 @@ to assert traces are flowing. #### flagd -flagd is on `:8013` (gRPC eval) โ€” same as Beginner; the other ports (`8014` management/metrics, `8015` sync, `8016` OFREP) aren't used in this level. +flagd runs on the docker-internal network only. The lab and the loadgen reach it as `flagd:8013`; you don't need to forward its ports onto the Codespace host to play this level. (`verify.sh` runs inside the workspace container so it can reach `flagd:8013` directly.) #### OTLP receivers (Ports `4317` / `4318`) @@ -277,16 +265,19 @@ rate fall to zero. ### 4. Verify Your Solution -Once the dashboard is healthy, run the verifier: +Once you think you've solved the challenge, run the verification script: ```bash -adventures/planned/00-blind-by-design/expert/verify.sh +./verify.sh ``` -The script asserts the lab, flagd, and LGTM are reachable, that -`vision_amplifier_v2` evaluates to `false` for a probe user, that the -`feature_flag_evaluation_requests_total` Prometheus counter is non-zero, that -Tempo has at least one trace for `fun-with-flags-java-spring`, and that the -HTTP 5xx rate over the last minute is below 1%. +**If the verification fails:** + +The script will tell you which checks failed. Fix the issues and run it again. + +**If the verification passes:** -If everything turns green, your solution is solid. ๐ŸŽ‰ +1. 
The script will check if your changes are committed and pushed. +2. Follow the on-screen instructions to commit your changes if needed. +3. Once everything is ready, the script will generate a **Certificate of Completion**. +4. **Copy this certificate** and paste it into the [challenge thread](https://community.open-ecosystem.com/c/open-ecosystem-challenges/) to claim your victory! ๐Ÿ† diff --git a/adventures/planned/00-blind-by-design/expert/verify.sh b/adventures/planned/00-blind-by-design/expert/verify.sh index e5b92de0..97c4bc90 100755 --- a/adventures/planned/00-blind-by-design/expert/verify.sh +++ b/adventures/planned/00-blind-by-design/expert/verify.sh @@ -29,20 +29,19 @@ TESTS_FAILED=0 FAILED_CHECKS=() APP_URL="http://localhost:8080" -FLAGD_HTTP="http://localhost:8013" +# flagd is on the docker-internal network only โ€” verify.sh runs from +# the workspace container's terminal, where the service name resolves. +FLAGD_HTTP="http://flagd:8013" PROMETHEUS_URL="http://localhost:9090" TEMPO_URL="http://localhost:3200" GRAFANA_URL="http://localhost:3000" # ---- 1. App reachable ------------------------------------------------------ +# Lean on test_http_endpoint from lib/scripts/http.sh โ€” handles connection +# failure and unexpected-content cases for us. print_test_section "Checking lab reachability" -if curl -fsS --max-time 5 "$APP_URL/" >/dev/null 2>&1; then - print_info_indent "โœ“ Spring Boot lab reachable at $APP_URL" - TESTS_PASSED=$((TESTS_PASSED + 1)) -else - print_error_indent "Spring Boot lab is not reachable at $APP_URL" - print_hint "Start the app with: ./mvnw spring-boot:run" - TESTS_FAILED=$((TESTS_FAILED + 1)) +if ! 
test_http_endpoint "$APP_URL/" "vision_state" \ + "Start the app with: ./mvnw spring-boot:run"; then FAILED_CHECKS+=("app_reachable") fi print_new_line From 8207aee4097839f3781c8e430ab5b83b95237787 Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Thu, 30 Apr 2026 16:08:35 +0200 Subject: [PATCH 3/8] expert: add Makefile, drop solution walkthrough Mirror @KatharinaSick's Beginner pattern (605dabc): a thin Makefile for discoverability + remove the solution doc since solutions are not meant to be published before the challenge launch. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../docs/solutions/expert.md | 280 ------------------ .../00-blind-by-design/expert/Makefile | 36 +++ 2 files changed, 36 insertions(+), 280 deletions(-) delete mode 100644 adventures/planned/00-blind-by-design/docs/solutions/expert.md create mode 100644 adventures/planned/00-blind-by-design/expert/Makefile diff --git a/adventures/planned/00-blind-by-design/docs/solutions/expert.md b/adventures/planned/00-blind-by-design/docs/solutions/expert.md deleted file mode 100644 index 7b414c4e..00000000 --- a/adventures/planned/00-blind-by-design/docs/solutions/expert.md +++ /dev/null @@ -1,280 +0,0 @@ -# ๐Ÿ”ด Expert Solution Walkthrough: Phase 3 โ€” read the chart - -Four sub-tasks, in order: wire the meter provider, register `MetricsHook`, -write and register a `ContextSpanHook` of your own, roll the bad flag back. -We'll do them exactly that way. - -> โš ๏ธ **Spoiler Alert:** This walkthrough contains the full solution. Try -> solving it on your own first. 
- -## ๐Ÿ“‹ Step 1: Read the objective - -> By the end of this level, you should have: -> -> - The OpenTelemetry meter provider wired and the OpenFeature `MetricsHook` registered -> - A `ContextSpanHook` of your own that copies the merged evaluation context -> (`species`, `country`, `dose`) onto the active span as `feature_flag.context.` -> - At least one trace for service `fun-with-flags-java-spring` visible in Tempo -> - Spans tagged with `feature_flag.context.dose=underdose` searchable in Tempo -> - The `feature_flag_evaluation_requests_total` counter non-zero in Prometheus -> - The `vision_amplifier_v2` fractional rollout flipped back to 100% off / 0% on -> - HTTP 5xx rate over the last minute below 1% - -## ๐Ÿ” Step 2: Inspect what's already wired - -Traces work out of the box โ€” the `TracesHook` is registered in -`OpenFeatureConfig.java` and the OTel SDK is exporting via OTLP/gRPC to the -LGTM container at `http://localhost:4317`. Open Grafana โ†’ Explore โ†’ Tempo โ†’ -search for `service.name=fun-with-flags-java-spring` and you should already -see traces. (If you don't, hit `curl http://localhost:8080/` a few times to -generate some.) - -The metrics half, however, is dead. Two reasons: - -1. `application.properties` has `otel.metrics.exporter=none`. The SDK creates - a `SdkMeterProvider` but no exporter is attached, so any counter it - records is dropped. -2. `OpenFeatureConfig.initProvider()` registers `TracesHook` but not - `MetricsHook`. Even if the meter provider could export, no one is - recording flag evaluations as metrics. - -One thing that **is** already wired and matters for this level: the -`SpeciesInterceptor` you wrote in Intermediate. Expert ships it byte-for-byte -unchanged. The relevant part for this level is the line you already wrote -that reads `?userId=โ€ฆ` from the query string and constructs -`new ImmutableContext(userId, attributes)` โ€” by SDK convention, the first -`String` argument **is** the OpenFeature `targetingKey`. 
That is what makes -the `vision_amplifier_v2` fractional rollout actually bucket per subject; -without it, every evaluation would hash the same way and the percentages -would do nothing. (Intermediate didn't have a flag that used the -targetingKey, so the wiring sat dormant; this is where it pays off.) You -don't write any new code for this in Expert โ€” the rollback in Step 6 takes -effect immediately because the loadgen sends a fresh `userId` per request -into the interceptor you already shipped. - -## ๐Ÿ›  Step 3: Wire the meter provider - -Open `src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. -Change the default for `otel.metrics.exporter` from `"none"` to `"otlp"`, and -add a default for `otel.metric.export.interval` so the meter flushes every -ten seconds. The full method: - -```java -@Bean -public OpenTelemetry openTelemetry( - @Value("${otel.service.name:fun-with-flags-java-spring}") String serviceName, - @Value("${otel.exporter.otlp.endpoint:http://localhost:4317}") String otlpEndpoint, - @Value("${otel.exporter.otlp.protocol:grpc}") String otlpProtocol, - @Value("${otel.traces.exporter:otlp}") String tracesExporter, - @Value("${otel.metrics.exporter:otlp}") String metricsExporter, - @Value("${otel.logs.exporter:none}") String logsExporter, - @Value("${otel.metric.export.interval:10000}") String metricExportInterval) { - System.setProperty("otel.service.name", serviceName); - System.setProperty("otel.exporter.otlp.endpoint", otlpEndpoint); - System.setProperty("otel.exporter.otlp.protocol", otlpProtocol); - System.setProperty("otel.traces.exporter", tracesExporter); - System.setProperty("otel.metrics.exporter", metricsExporter); - System.setProperty("otel.logs.exporter", logsExporter); - System.setProperty("otel.metric.export.interval", metricExportInterval); - - autoConfigured = AutoConfiguredOpenTelemetrySdk.builder() - .setResultAsGlobal() - .build(); - return autoConfigured.getOpenTelemetrySdk(); -} -``` - -Then update 
`src/main/resources/application.properties` to match: - -```properties -spring.application.name=demo - -otel.exporter.otlp.endpoint=http://localhost:4317 -otel.exporter.otlp.protocol=grpc -otel.traces.exporter=otlp -otel.metrics.exporter=otlp -otel.logs.exporter=none -otel.service.name=fun-with-flags-java-spring -otel.metric.export.interval=10000 -``` - -> The autoconfigure module reads `otel.metrics.exporter` and, when set to -> `otlp`, attaches an `OtlpGrpcMetricExporter` to the `SdkMeterProvider`. The -> resulting `OpenTelemetry` bean now exposes a working `getMeterProvider()`. - -## ๐Ÿ›  Step 4: Register `MetricsHook` on the OpenFeature API - -Open `OpenFeatureConfig.java`. Inject the `OpenTelemetry` bean via -constructor injection and add `MetricsHook` next to the existing -`TracesHook` call: - -```java -import dev.openfeature.contrib.hooks.otel.MetricsHook; -import dev.openfeature.contrib.hooks.otel.TracesHook; -import io.opentelemetry.api.OpenTelemetry; - -@Configuration -public class OpenFeatureConfig implements WebMvcConfigurer { - - private final OpenTelemetry openTelemetry; - - public OpenFeatureConfig(OpenTelemetry openTelemetry) { - this.openTelemetry = openTelemetry; - } - - @PostConstruct - public void initProvider() { - OpenFeatureAPI api = OpenFeatureAPI.getInstance(); - FlagdOptions flagdOptions = FlagdOptions.builder() - .resolverType(Config.Resolver.RPC) - .build(); - api.setProviderAndWait(new FlagdProvider(flagdOptions)); - - HashMap attributes = new HashMap<>(); - attributes.put("country", new Value(Optional.ofNullable(System.getenv("COUNTRY")).orElse(""))); - api.setEvaluationContext(new ImmutableContext(attributes)); - - api.addHooks(new AuditHook()); // already wired in broken state - api.addHooks(new TracesHook()); // already wired in broken state - api.addHooks(new MetricsHook(openTelemetry)); // <-- you add this - api.addHooks(new ContextSpanHook()); // <-- you add this - } - - // addInterceptors(...) 
unchanged -} -``` - -### The `ContextSpanHook` - -A small `Hook` of your own, in a new file `ContextSpanHook.java`, that mirrors the merged evaluation context onto the active span. This is what lets Tempo show "this request had `dose=underdose` and got `variant=clouded`" on the same span. - -```java -package dev.openfeature.demo.java.demo; - -import dev.openfeature.sdk.EvaluationContext; -import dev.openfeature.sdk.Hook; -import dev.openfeature.sdk.HookContext; -import dev.openfeature.sdk.Value; -import io.opentelemetry.api.trace.Span; - -import java.util.List; -import java.util.Map; -import java.util.Optional; - -public class ContextSpanHook implements Hook { - - private static final List TRACKED = List.of("species", "country", "dose"); - - @Override - public Optional before(HookContext ctx, Map hints) { - Span span = Span.current(); - EvaluationContext ec = ctx.getCtx(); - for (String key : TRACKED) { - Value v = ec.getValue(key); - if (v != null && v.asString() != null) { - span.setAttribute("feature_flag.context." + key, v.asString()); - } - } - return Hook.super.before(ctx, hints); - } -} -``` - -Three notes worth calling out: - -- `HookContext.getCtx()` returns the **merged** evaluation context โ€” global + transaction + invocation, in that precedence order. So the hook reads whatever the SDK is about to use, regardless of which layer set the value. -- `Span.current()` returns the no-op span if there is no active OTel context (e.g. in tests without an instrumented HTTP server). `setAttribute` on the no-op span is a safe no-op, so the hook does not need defensive guards. -- **`TRACKED` is a fixed allowlist on purpose โ€” do not iterate.** The merged context typically also carries `targetingKey` (often a stable user id) and, in real apps, things like `email`, account ids, or device identifiers. 
If you replace the allowlist with `for (String key : ec.asMap().keySet())` you ship that PII straight into Tempo / Prometheus, where it is retained for days and is hard to redact after the fact. Pick the minimum set of keys that helps you correlate, document why each is safe for long-term storage, and add new keys deliberately. The OpenTelemetry [security & privacy guidance](https://opentelemetry.io/docs/security/) covers the broader principle. - -Restart the lab: - -```bash -./mvnw spring-boot:run -``` - -After it boots, hit `curl http://localhost:8080/` a few times. Wait ten to -fifteen seconds and check Prometheus: - -```bash -curl -s 'http://localhost:9090/api/v1/query?query=feature_flag_evaluation_requests_total' | jq -``` - -You should see entries with `feature_flag_key` labels for `vision_state`, -`vision_amplifier_v2`, and `loadgen_active`. The dashboard panels in Grafana -will start drawing within the next refresh interval. - -## ๐Ÿ›  Step 5: Turn on the loadgen and read the chart - -Open `flags.json` and flip `loadgen_active`: - -```json -"loadgen_active": { - "state": "ENABLED", - "variants": { "off": false, "on": true }, - "defaultVariant": "on" -} -``` - -Save. The k6 loadgen polls flagd every two seconds and starts hammering. Now -open Grafana โ†’ **Dashboards โ†’ Fun With Flags โ€” Feature Flag Metrics**. -You'll see: - -- **Evaluations per second** โ€” three flag keys, all live -- **Variant distribution** โ€” `vision_amplifier_v2` is heavily skewed toward `on` -- **HTTP latency** โ€” sitting around 200ms, well above baseline -- **HTTP 5xx rate** โ€” around 10% - -## ๐Ÿ›  Step 6: Roll the rollout back - -The fractional bucket for `vision_amplifier_v2` is inverted. Edit `flags.json`: - -```diff - "vision_amplifier_v2": { - "state": "ENABLED", - "variants": { "off": false, "on": true }, - "defaultVariant": "off", - "targeting": { - "fractional": [ -- ["off", 0], -- ["on", 100] -+ ["off", 100], -+ ["on", 0] - ] - } - } -``` - -Save. 
flagd reloads within a second. The k6 script generates a fresh -`userId` per request, so the next request is immediately bucketed into -`off`. The dashboard panels recover within seconds. - -## โœ… Step 7: Verify - -Run the verifier: - -```bash -adventures/planned/00-blind-by-design/expert/verify.sh -``` - -All eight checks should pass (lab reachable, flagd reachable, LGTM -reachable, `vision_amplifier_v2` rolled back, Prometheus has the metric -counter, Tempo has traces, Tempo spans carry the `feature_flag.context.*` -attribute, 5xx rate below threshold). The 5xx rate check tolerates a brief -tail of errors from before the rollback, but if you wait a minute it -settles to zero. - -## ๐ŸŽ“ What this exercise demonstrates - -- **Decoupling deployment from release.** Once the flag is in place, rolling - out and rolling back happen via a JSON edit, not a redeploy. That is the - same lever you would pull at 3am when the new pricing engine starts - erroring. -- **Stable bucketing via `targetingKey`.** The k6 script generates a fresh - `userId` per request *on purpose* โ€” it lets us see the rollback take - effect immediately. In a real app, the `userId` is the logged-in user, so - the bucketing is sticky across the user's session and the rollback only - helps users who arrive *after* the flag flip. -- **Two halves of OTel observability.** Traces tell you about a specific - request; metrics tell you about the population. The OpenFeature OTel - hooks expose both for flag evaluations using the same OTel SDK the rest of - the app already exports through. 
diff --git a/adventures/planned/00-blind-by-design/expert/Makefile b/adventures/planned/00-blind-by-design/expert/Makefile new file mode 100644 index 00000000..fe0ae409 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/Makefile @@ -0,0 +1,36 @@ +# ============================================================================ +# Makefile for Blind by Design - Expert Level: Phase 3 โ€” read the chart +# ============================================================================ +# This Makefile provides convenient commands for running the Spring Boot lab +# alongside the Grafana LGTM stack and verifying your solution. +# ============================================================================ + +.PHONY: help lab probe verify + +# Default target - show help +help: + @echo "Blind by Design - Expert Level: Phase 3 โ€” read the chart" + @echo "" + @echo "Application:" + @echo " make lab - Start the Spring Boot lab on :8080" + @echo " make probe - Hit the lab as a sample subject and pretty-print the response" + @echo "" + @echo "Verification:" + @echo " make verify - Run verification checks (lab + flagd + LGTM + dashboard + 5xx-rate)" + +# ---------------------------------------------------------------------------- +# Application Targets +# ---------------------------------------------------------------------------- + +lab: + @./mvnw spring-boot:run + +probe: + @curl -s 'http://localhost:8080/?userId=subject-42' | jq + +# ---------------------------------------------------------------------------- +# Verification Targets +# ---------------------------------------------------------------------------- + +verify: + @./verify.sh From fb0b018353198de9dad7d96f382d4d2c236a874c Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Fri, 8 May 2026 09:03:57 +0200 Subject: [PATCH 4/8] review: apply PR #43 feedback to Expert level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the Intermediate cleanup (8bcf885) on 
Expert, plus picks up the targetingKey + PII discipline that was deferred from Intermediate.

- Rewrite Objective as 5 outcome-based bullets — drop the mechanism-heavy list, drop the parenthetical "verified: SpeciesInterceptor wires userId" note (now redundant: targetingKey lives at the implementation level here, not the objective level).
- Drop the "Concepts you'll touch" section. Its load-bearing content migrates inline to the per-step instructions: TracerProvider vs MeterProvider gloss into 4a, TracesHook/MetricsHook gloss into 4b, the ContextSpanHook authoring guide into a new 4c, and the fractional + targetingKey explanation into 4d.
- Add explicit step 4c "Author and register your own ContextSpanHook" — the ContextSpanHook was an objective bullet with no corresponding implementation step (analogous to the missing dose-passing step on Intermediate).
- Move the PII allowlist callout to step 4c — Expert is where it earns its place, since eval context is about to flow onto OTel spans that ship to SIEM-grade backends. The Intermediate cross-link goes away; the discipline lives here standalone.
- Lift "Start the Lab" out of step 1 into its own step 2 — mirrors the Intermediate / Beginner shape, so a player who clicks the Ports tab before reading further doesn't see a 502.
- Format Deadline + Community thread sections as Coming Soon callouts, matching Intermediate.
- Sync verify.sh OBJECTIVE block to the new outcome-based docs.

Addresses Katharina's review themes carried over from #43 (objective shape, Concepts vs Learn overlap, verifier exercises objective).
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../planned/00-blind-by-design/docs/expert.md | 166 +++++++----------- .../00-blind-by-design/expert/verify.sh | 12 +- 2 files changed, 65 insertions(+), 113 deletions(-) diff --git a/adventures/planned/00-blind-by-design/docs/expert.md b/adventures/planned/00-blind-by-design/docs/expert.md index 6002370e..5a696ecf 100644 --- a/adventures/planned/00-blind-by-design/docs/expert.md +++ b/adventures/planned/00-blind-by-design/docs/expert.md @@ -18,15 +18,11 @@ Your job, in order: **turn on the lights**, find the bad arm of the trial, and * ## ⏰ Deadline -Coming Soon -> ℹ️ You can still complete the challenge after this date, but points will only -> be awarded for submissions before the deadline. +> 🚧 **Coming Soon** — this level is in the planned bucket. Final deadline will be announced when the adventure goes live. ## 💬 Join the discussion -Share your solutions and questions in the -[challenge thread](https://community.open-ecosystem.com/c/open-ecosystem-challenges/) -in the Open Ecosystem Community. +> 🚧 **Coming Soon** — community thread will be linked here at launch. ## 🏗️ Architecture @@ -55,63 +51,13 @@ Four containers and one Spring Boot process, all on a shared Docker network. 
## 🎯 Objective -By the end of this level, you should have: +By the end of this level, the lab hits each of these observable outcomes: -- The OpenTelemetry **meter provider** wired and the OpenFeature **`MetricsHook`** registered -- Verified: the **`SpeciesInterceptor`** carried over from Intermediate is wiring `?userId=` as the OpenFeature **`targetingKey`** on every request, so the `vision_amplifier_v2` fractional rollout buckets per subject rather than landing every request in the same bucket *(you don't write this — verify it via the dashboard's variant-distribution panel after step 5)* -- A **`ContextSpanHook`** of your own — a small `Hook` that copies the merged evaluation context (`species`, `country`, `dose`) onto the active span as `feature_flag.context.<key>` — registered alongside `TracesHook`/`MetricsHook` -- **At least one trace** for service `fun-with-flags-java-spring` visible in Tempo -- Spans tagged with **`feature_flag.context.dose=underdose`** searchable in Tempo and lining up with `feature_flag.variant=clouded` on the same span -- The **`feature_flag_evaluation_requests_total`** counter non-zero in Prometheus -- The **`vision_amplifier_v2`** fractional rollout flipped back to **100% off / 0% on** -- The HTTP 5xx rate over the last minute below **1%** - -## 📚 Concepts you'll touch - -If you came in fresh on OpenTelemetry SDK plumbing or flagd's fractional rule, read this section first. - -### OpenTelemetry **TracerProvider** vs **MeterProvider** - -Spans are per-request timing (one trace per HTTP call, with nested events), counters are aggregate population stats (rate of evaluations across all requests, distribution of variants). In this lab the trace half is wired and Tempo already shows spans; the metrics half is dead and the dashboard is dark — that's the gap you close. - -OTel ships two parallel pipelines, one for **traces** (spans, distributed timing) and one for **metrics** (counters, histograms). 
Each has its own provider, its own SDK, its own exporter. In this level the `TracerProvider` is already wired (spans are flowing into Tempo). The `MeterProvider` is not โ€” that is your fix. Both providers register globally via `GlobalOpenTelemetry`, so once you wire the meter, the OpenFeature `MetricsHook` finds it without any further plumbing. - -### OpenFeature `TracesHook` and `MetricsHook` - -The OpenFeature OTel contrib library ships two hooks that turn every flag evaluation into telemetry: - -- **`TracesHook`** โ€” emits a span event (`feature_flag.evaluation`) on the active span with `feature_flag.key`, `feature_flag.variant`, and `feature_flag.reason` attributes. This is why flag evaluations show up nested inside HTTP request spans in Tempo. -- **`MetricsHook`** โ€” emits four counters per evaluation: `feature_flag_evaluation_requests_total`, `_success_total`, `_error_total`, and an active-count up/down counter. These power the dashboard panels. - -Both hooks need a global `OpenTelemetry` instance. The `TracesHook` works once you have a `TracerProvider`; the `MetricsHook` needs a `MeterProvider`. - -### Authoring your own hook to enrich spans with context - -The `AuditHook` carried over from Intermediate already records the same context attributes (species / country / dose) into a durable `[AUDIT]` log line โ€” that is the safety officer's tool, useful weeks later for forensic follow-up. What it does not give you is **real-time correlation in the dashboard**: log lines do not show up alongside `feature_flag.variant` on a Tempo span. So `TracesHook` is great at recording **what** happened (the variant, the reason), `AuditHook` records the audit-archive view, and there is still a gap โ€” the evaluation context attributes that drove the decision are not on the span. The two hooks stay; you add a third for the on-call's view. - -The OpenFeature `Hook` interface is the right place to fix that. 
The shape is roughly: - -```text -before(hookCtx) { - span = active OTel span - for each allowlisted key in merged eval context: - span.setAttribute("feature_flag.context." + key, value) -} -``` - -The `before` hook receives a `HookContext` whose `getCtx()` returns the **merged** evaluation context (global + transaction + invocation), which is exactly what drove the flag's resolution โ€” so the attributes you copy off it line up with what the variant decision actually saw. Span attributes go on `Span.current()` because that is the active HTTP request span; the OpenFeature hook fires inside that span's scope. - -Register it next to `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. Now every flag evaluation tags its parent span with the context attributes the lab cares about. In Tempo: **Search โ†’ Service: fun-with-flags-java-spring โ†’ +Tag โ†’ `feature_flag.context.dose=underdose`** lights up exactly the requests where a tech mis-dosed, with the resolved variant on the same span event. - -> โš ๏ธ **Allowlist, don't iterate.** Use a fixed allowlist for the same reason the `AuditHook` does โ€” see [Intermediate's PII note](./intermediate.md#3c-an-audithook) and the [OpenTelemetry security guidance](https://opentelemetry.io/docs/security/). - -### `flagd` `fractional` operation + `targetingKey` - -`fractional` is flagd's bucketing operation. Given a list of `[variant, percent]` pairs, it deterministically assigns each evaluation to one variant based on a hash of the **targeting key** on the evaluation context. Same key โ†’ same bucket โ†’ same variant, every request. Different keys spread across the percentages. **If no targeting key is set, every evaluation hashes the same way and the rollout collapses โ€” every request lands in the same bucket and the percentages do nothing.** - -You already wired this up in Intermediate. 
The **`SpeciesInterceptor`** you wrote there reads `?userId=...` from each request and constructs an `ImmutableContext(userId, attributes)` — by SDK convention the first `String` argument to `ImmutableContext` **is** the OpenFeature `targetingKey`. Expert ships the same interceptor byte-for-byte; the lab is already serving fractional rollouts correctly without you touching it. (Intermediate didn't have a flag that used the targetingKey; this is where it pays off.) - -The k6 loadgen demonstrates this end-to-end: it generates a fresh random `userId` per request, which means the interceptor produces a different targeting key per request, which means the fractional rollout spreads across the percentages exactly as configured. The dashboard's variant-distribution panel reflects that split directly. +- **Spans for `fun-with-flags-java-spring` are visible in Tempo** with `feature_flag.context.<key>` attributes — searching `feature_flag.context.dose=underdose` lights up the requests where a tech mis-dosed, with `feature_flag.variant=clouded` on the same span. +- **`feature_flag_evaluation_requests_total` is non-zero in Prometheus** — flag evaluations show up as counters, not just spans. +- **The Feature Flag Metrics dashboard renders.** Variant-distribution, error rate, latency p99 — all populated from the metric counters. +- **The `vision_amplifier_v2` rollout is rolled back to 100% off** — without redeploying the lab. +- **HTTP 5xx rate over the last minute drops below 1%.** The bad arm is contained. ## 🧠 What You'll Learn @@ -155,12 +101,19 @@ Quick start: - Wait ~2-3 minutes for the sibling containers (flagd, Grafana LGTM, k6 loadgen) to come up. They are part of the devcontainer compose, so they start automatically — no `docker compose up` step. -- Once the IDE attaches to the workspace, start the Spring Boot lab. 
Click - **Run** on `Laboratory` in the Spring Boot Dashboard panel (or press - **F5** with `Laboratory.java` open), or run `./mvnw spring-boot:run` - from the integrated terminal. -### 2. Access the UIs +### 2. Start the Lab + +The sibling containers (flagd, the LGTM stack, the k6 loadgen) are already up โ€” the Spring Boot lab itself isn't. Boot it before you click into the Ports tab so the forwarded `:8080` is actually serving. Either click **Run** on `Laboratory` in the Spring Boot Dashboard panel (or press **F5** with `Laboratory.java` open), or, from the terminal: + +```bash +cd adventures/planned/00-blind-by-design/expert +./mvnw spring-boot:run +``` + +Spans start flowing into Tempo on the first request โ€” the OpenTelemetry trace pipeline is already wired. The metrics half is dead (task 4a) so the Grafana dashboard panels stay empty until you fix it. + +### 3. Access the UIs Open the **Ports** tab in the bottom panel and click through to: @@ -201,49 +154,55 @@ flagd runs on the docker-internal network only. The lab and the loadgen reach it The Spring Boot app exports traces (and, after you finish the wiring, metrics) to the LGTM stack on `4317` (gRPC) and `4318` (HTTP). -### 3. Implement the Objective +### 4. Implement the Objective + +Four sub-tasks, in order: wire the meter provider, register the matching `MetricsHook`, write your own `ContextSpanHook` to enrich spans with the flag-decision context, then turn on the loadgen so you can find and roll back the misbehaving fractional rollout. + +#### 4a. Wire the OpenTelemetry meter provider + +OTel ships two parallel pipelines: **traces** (per-request spans, already flowing into Tempo) and **metrics** (aggregate counters, dead). Each has its own provider, its own SDK, its own exporter. The fix here is on the metrics side โ€” a `MeterProvider` is being created but its exporter is `none`, so any metrics it records go nowhere. 
Both providers register globally via `GlobalOpenTelemetry`, so once the meter is wired the `MetricsHook` (next step) finds it without any further plumbing. + +Open `adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. The `@Bean` method already calls `AutoConfiguredOpenTelemetrySdk.builder()`, which produces an `OpenTelemetry` instance with **both** a `SdkTracerProvider` and a `SdkMeterProvider` โ€” but only the tracer provider has an exporter. The meter provider is told `otel.metrics.exporter=none`. + +Flip `otel.metrics.exporter` to `otlp` so the SDK attaches an `OtlpGrpcMetricExporter`. The cleanest way is to update both the default in `OpenTelemetryConfig.java` and the value in `src/main/resources/application.properties`. While you're there, set `otel.metric.export.interval=10000` so the dashboard updates within ten seconds of new traffic instead of waiting a minute. -There are three sub-tasks, in order: +#### 4b. Register `MetricsHook` on the OpenFeature API -#### 3a. Wire the OpenTelemetry meter provider +The OpenFeature OTel contrib library ships two hooks that turn flag evaluations into telemetry: **`TracesHook`** emits a span event (`feature_flag.evaluation`) on the active span โ€” that's why flag evaluations show up nested inside HTTP request spans in Tempo. **`MetricsHook`** emits four counters per evaluation: `feature_flag_evaluation_requests_total`, `_success_total`, `_error_total`, plus an active-count up/down counter. These power the dashboard panels. -Open -`adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. -The `@Bean` method already calls `AutoConfiguredOpenTelemetrySdk.builder()`, -which produces an `OpenTelemetry` instance with **both** a `SdkTracerProvider` -and a `SdkMeterProvider` โ€” but only the tracer provider has an exporter. 
-The meter provider is told `otel.metrics.exporter=none`, so any metrics it -records go nowhere. +Open `OpenFeatureConfig.java`. `TracesHook` is already registered; `MetricsHook` is not. `MetricsHook` needs the `OpenTelemetry` instance to grab the meter provider, so inject the bean via constructor injection and call `api.addHooks(new MetricsHook(openTelemetry));` next to the `TracesHook` line. + +If you compile and run after this step, the **Fun With Flags โ€” Feature Flag Metrics** dashboard in Grafana stays empty โ€” there is no traffic to drive the counters. Move on. + +#### 4c. Author and register your own `ContextSpanHook` + +The two contrib hooks tell you *what* happened โ€” which flag, which variant, which reason. The `AuditHook` shipped with this level (carried over from Intermediate) writes the durable archive view to disk. What's missing is the **on-call's view in Tempo**: when a span shows `feature_flag.variant=clouded`, the operator can't see *why* without a separate hop into the audit log. Write a third hook that copies the merged eval context attributes onto the active OTel span โ€” same data the audit log records, but visible right next to the variant in the trace UI. + +The shape is roughly: + +```text +before(hookCtx) { + span = active OTel span + for each allowlisted key in merged eval context: + span.setAttribute("feature_flag.context." + key, value) +} +``` -Flip `otel.metrics.exporter` to `otlp` so the SDK attaches an -`OtlpGrpcMetricExporter`. The cleanest way is to update both the default in -`OpenTelemetryConfig.java` and the value in -`src/main/resources/application.properties`. While you're there, set -`otel.metric.export.interval=10000` so the dashboard updates within ten -seconds of new traffic instead of waiting a minute. 
+The `before` callback receives a `HookContext`, and `getCtx()` returns the **merged** evaluation context (global + transaction + invocation) โ€” exactly what drove the flag's resolution, so the attributes you copy off it line up with what the variant decision actually saw. Span attributes go on `Span.current()` because that's the active HTTP request span; the OpenFeature hook fires inside its scope. -#### 3b. Register `MetricsHook(OpenTelemetry)` on the OpenFeature API +Register it next to `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. Now in Tempo: **Search โ†’ Service: fun-with-flags-java-spring โ†’ +Tag โ†’ `feature_flag.context.dose=underdose`** lights up exactly the requests where a tech mis-dosed, with the resolved variant on the same span event. -Open `OpenFeatureConfig.java`. The `TracesHook` is already registered; -`MetricsHook` is not. `MetricsHook` needs the `OpenTelemetry` instance to grab -the meter provider, so inject the bean via constructor injection and -`api.addHooks(new MetricsHook(openTelemetry));` next to the `TracesHook` call. +> โš ๏ธ **Allowlist, don't iterate.** Use a fixed allowlist (`List.of("species", "country", "dose")`) โ€” never iterate the whole eval context. The merged context routinely carries the OpenFeature `targetingKey`, typically a stable user id that joins to email and account data in real apps. Span attributes are retained for days in Tempo and indexed at scale; once they ship, redacting after the fact is hard. Same discipline `AuditHook` already follows for the audit log, same reason. See [OpenTelemetry's security guidance](https://opentelemetry.io/docs/security/). -If you compile and run after this step, the **Fun With Flags โ€” Feature Flag -Metrics** dashboard in Grafana stays empty โ€” there is no traffic. Move on. +#### 4d. Turn on the loadgen, find the bad rollout, roll it back -#### 3c. 
Turn on the loadgen, find the bad rollout, roll it back +`fractional` is flagd's bucketing operation: given a list of `[variant, percent]` pairs, it deterministically assigns each evaluation to a variant based on a hash of the **`targetingKey`** on the eval context. Same key โ†’ same bucket โ†’ same variant, every request. Different keys spread across the percentages. **If no targeting key is set, every evaluation hashes the same way, every request lands in the same bucket, and the percentages do nothing.** The `SpeciesInterceptor` shipped with this level reads `?userId=` from each request and threads it through as the targetingKey โ€” the lab is already serving fractional rollouts correctly without you touching it. The k6 loadgen exploits this: it generates a fresh random `userId` per request, which means a different targetingKey per request, which means the fractional rollout spreads across the percentages exactly as configured. -Edit `flags.json` in the expert directory and flip `loadgen_active`'s -`defaultVariant` from `"off"` to `"on"`. flagd watches the file and picks up -changes within a second. The k6 loadgen container has been polling -`loadgen_active` every two seconds โ€” it will notice and start hammering -`http://workspace:8080/` with five virtual users (the workspace service name resolves inside the compose network). +Edit `flags.json` in the expert directory and flip `loadgen_active`'s `defaultVariant` from `"off"` to `"on"`. flagd watches the file and picks up changes within a second. The k6 loadgen container has been polling `loadgen_active` every two seconds โ€” it will notice and start hammering `http://workspace:8080/` with five virtual users (the workspace service name resolves inside the compose network). Now open the dashboard. When the loadgen turns on you should see latency creep up around 200ms and 5xx rate around 10%; if those don't move, the loadgen flag isn't actually live yet. 
-That's the diagnosis: the fractional rollout for `vision_amplifier_v2` is -inverted. The flag definition currently reads: +That's the diagnosis: the fractional rollout for `vision_amplifier_v2` is inverted. The flag definition currently reads: ```json "fractional": [ @@ -252,18 +211,11 @@ inverted. The flag definition currently reads: ] ``` -Edit `flags.json` again โ€” flip the percentages so `off` gets `100` and `on` -gets `0`. Save. Within one or two seconds flagd reloads. Because the -`SpeciesInterceptor` is wiring `?userId=` through to the OpenFeature -`targetingKey` on every request, and the loadgen generates a fresh `userId` -per request, the fractional rollout responds immediately โ€” every subject -re-buckets against the new percentages and the population moves to the safe -variant. Watch the latency p99 panel collapse back to baseline and the 5xx -rate fall to zero. +Edit `flags.json` again โ€” flip the percentages so `off` gets `100` and `on` gets `0`. Save. Within one or two seconds flagd reloads. Because the targetingKey is sticky per `userId` and the loadgen generates a fresh `userId` per request, every subject re-buckets against the new percentages and the population moves to the safe variant. Watch the latency p99 panel collapse back to baseline and the 5xx rate fall to zero. **No deploy. No rebuild. No restart of the lab.** -### 4. Verify Your Solution +### 5. 
Verify Your Solution Once you think you've solved the challenge, run the verification script: diff --git a/adventures/planned/00-blind-by-design/expert/verify.sh b/adventures/planned/00-blind-by-design/expert/verify.sh index 97c4bc90..027462dd 100755 --- a/adventures/planned/00-blind-by-design/expert/verify.sh +++ b/adventures/planned/00-blind-by-design/expert/verify.sh @@ -6,12 +6,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck disable=SC1091 source "$SCRIPT_DIR/../../../../lib/scripts/loader.sh" -OBJECTIVE="By the end of this level, you should have: -- The OpenTelemetry meter provider wired and the OpenFeature MetricsHook registered -- At least one trace for service 'fun-with-flags-java-spring' visible in Tempo -- The 'feature_flag_evaluation_requests_total' counter non-zero in Prometheus -- The 'vision_amplifier_v2' fractional rollout flipped back to 100% off / 0% on -- HTTP 5xx rate over the last minute below 1%" +OBJECTIVE="By the end of this level, the lab hits each of these observable outcomes: + +- Spans for 'fun-with-flags-java-spring' visible in Tempo with feature_flag.context.<key> attributes (searching 'feature_flag.context.dose=underdose' lights up the mis-dose requests) +- 'feature_flag_evaluation_requests_total' non-zero in Prometheus — flag evaluations show up as counters, not just spans +- The 'vision_amplifier_v2' rollout is rolled back to 100% off — without redeploying the lab +- HTTP 5xx rate over the last minute drops below 1%" DOCS_URL="https://dynatrace-oss.github.io/open-ecosystem-challenges/00-blind-by-design/expert" From 64638583fafa6940c3146093f86aecdb4d8556e8 Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Mon, 18 May 2026 15:35:36 +0200 Subject: [PATCH 5/8] docs(expert): drop cd command from Start the Lab snippet The devcontainer's workspaceFolder already opens at the expert directory, so `cd adventures/planned/00-blind-by-design/expert` is a no-op for any user on the intended path. 
Carries the same cleanup Katharina applied to Intermediate in a26ad06. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- adventures/planned/00-blind-by-design/docs/expert.md | 1 - 1 file changed, 1 deletion(-) diff --git a/adventures/planned/00-blind-by-design/docs/expert.md b/adventures/planned/00-blind-by-design/docs/expert.md index 5a696ecf..d0757547 100644 --- a/adventures/planned/00-blind-by-design/docs/expert.md +++ b/adventures/planned/00-blind-by-design/docs/expert.md @@ -107,7 +107,6 @@ Quick start: The sibling containers (flagd, the LGTM stack, the k6 loadgen) are already up — the Spring Boot lab itself isn't. Boot it before you click into the Ports tab so the forwarded `:8080` is actually serving. Either click **Run** on `Laboratory` in the Spring Boot Dashboard panel (or press **F5** with `Laboratory.java` open), or, from the terminal: ```bash -cd adventures/planned/00-blind-by-design/expert ./mvnw spring-boot:run ``` From 1248c53f3bb1ad210ab391e5641133858292c43f Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Mon, 18 May 2026 15:45:12 +0200 Subject: [PATCH 6/8] review: address PR feedback for Expert level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop "Phase 3 — " from the level title across all 8 sites (doc title, devcontainer name, verify.sh banner + summary, Makefile, post-start.sh banner, "Select Codespace config" line in step 1). Katharina suggested calling it just "Read the chart" — keeps the story flavor without the artificial sub-numbering. - Trim the doc intro from three paragraphs to one. The "Three sub-tasks" enumeration and the "Level passes when (a)/(b)/(c)/(d)" recap were duplicating the Objective section; the "Spans flowing / metrics dead" paragraph is the one that earns its place as the lead. - Rewrite 4a-4d to be directional instead of tutorial. 
Expert level shouldn't dictate every keystroke; point at the gap and the outcome, leave the keystrokes to the player. - 4a: drop the explicit `otel.metrics.exporter=otlp` instruction + the batch-interval recipe. Name the two files where the autoconfig defaults live; hint that the default batch interval will be a pain for live debugging. - 4b: drop the literal `api.addHooks(new MetricsHook(openTelemetry));` line. Name the gap (TracesHook registered, MetricsHook isn't) and the next step. - 4c: keep the pseudocode shape (directional already) but trim `Span.current()` mechanic and the "Search โ†’ Service โ†’" UI walkthrough; replace with "verifier searches Tempo for feature_flag.context.dose=underdose" as the smoke signal. - 4d: drop the inverted-fractional spoiler. Tell the player that the dashboard's variant-distribution panel surfaces the offender; the rollback itself is theirs to do. - Add a "Helpful Documentation" sub-section at the end of "How to Play" (matching the pattern Adventure 03 Expert uses). Five external references: OpenFeature OTel hooks, OTel SDK autoconfigure, OpenFeature Hooks concept, flagd fractional, OTel security guidance. Addresses Katharina's review comments on aepfli/open-ecosystem-challenges#1 (intro paragraphs, "this is now very much of a tutorial again โ€” applies to all steps", and the helpful-documentation suggestion). 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../devcontainer.json | 2 +- .../post-start.sh | 2 +- .../planned/00-blind-by-design/docs/expert.md | 53 ++++++------------- .../00-blind-by-design/expert/Makefile | 4 +- .../00-blind-by-design/expert/verify.sh | 4 +- 5 files changed, 23 insertions(+), 42 deletions(-) diff --git a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json index 70cc431c..3a9e14e2 100644 --- a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json +++ b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json @@ -1,5 +1,5 @@ { - "name": "๐Ÿงช Adventure 00 | ๐Ÿ”ด Expert (Phase 3 โ€” read the chart)", + "name": "๐Ÿงช Adventure 00 | ๐Ÿ”ด Expert (Read the chart)", "dockerComposeFile": "docker-compose.yml", "service": "workspace", "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}/adventures/planned/00-blind-by-design/expert", diff --git a/.devcontainer/00-blind-by-design_03-expert/post-start.sh b/.devcontainer/00-blind-by-design_03-expert/post-start.sh index 454d9b5e..88c02e43 100755 --- a/.devcontainer/00-blind-by-design_03-expert/post-start.sh +++ b/.devcontainer/00-blind-by-design_03-expert/post-start.sh @@ -6,7 +6,7 @@ CHALLENGE_DIR="$REPO_ROOT/adventures/planned/00-blind-by-design/expert" cat <` so traces correlate variants with the context that drove them. -3. **Diagnose and roll back a misbehaving fractional rollout.** The `vision_amplifier_v2` flag is at 100% on; it's adding 200ms latency and a 10% HTTP 5xx rate. Identify it on the Grafana dashboard and roll it back via `flags.json` โ€” no redeploy. +# ๐Ÿ”ด Expert: Read the chart Spans are already flowing into Tempo from the OpenFeature `TracesHook`, but the metrics half is dead โ€” the `MeterProvider` has no exporter and the `MetricsHook` was never registered. The dashboard the operator wants to triage from is empty. 
The k6 loadgen is idle, waiting for a flag flip to turn it on. -The level passes when (a) `feature_flag_evaluation_requests_total` is non-zero in Prometheus, (b) Tempo spans for `fun-with-flags-java-spring` carry `feature_flag.context.*` attributes, (c) `vision_amplifier_v2` is rolled back to 100% off, and (d) the HTTP 5xx rate over the last minute is below 1%. - ## ๐Ÿช The Backstory The trial just went wide. Phase 3 of the new vision amplifier โ€” `vision_amplifier_v2` โ€” was approved for the full cohort yesterday morning. The promise was straightforward: subjects emerge with sharper eyesight than they walked in with. By mid-afternoon the audit log was screaming. Subjects were stabilising 200ms slower, and roughly one in ten of them was emerging **blind** โ€” containment failure recorded as an HTTP 500. The lab director pulled up the **Feature Flag Metrics** dashboard expecting to triage visually. The dashboard was dark. Someone had wired up traces but never finished the metrics half. There is no chart to read. The lab is studying eyesight and the lab itself cannot see. @@ -97,7 +89,7 @@ Quick start: - Fork the repo - Create a Codespace -- Select **"Adventure 00 | ๐Ÿ”ด Expert (Phase 3 โ€” read the chart)"** +- Select **"Adventure 00 | ๐Ÿ”ด Expert (Read the chart)"** - Wait ~2-3 minutes for the sibling containers (flagd, Grafana LGTM, k6 loadgen) to come up. They are part of the devcontainer compose, so they start automatically โ€” no `docker compose up` step. @@ -159,23 +151,19 @@ Four sub-tasks, in order: wire the meter provider, register the matching `Metric #### 4a. Wire the OpenTelemetry meter provider -OTel ships two parallel pipelines: **traces** (per-request spans, already flowing into Tempo) and **metrics** (aggregate counters, dead). Each has its own provider, its own SDK, its own exporter. The fix here is on the metrics side โ€” a `MeterProvider` is being created but its exporter is `none`, so any metrics it records go nowhere. 
Both providers register globally via `GlobalOpenTelemetry`, so once the meter is wired the `MetricsHook` (next step) finds it without any further plumbing. +OTel ships two parallel pipelines: **traces** (per-request spans, already flowing into Tempo) and **metrics** (aggregate counters, dead). Each has its own provider, its own SDK, its own exporter. The metrics half is being built via the autoconfig SDK but told to export to `none` โ€” any metrics it records have nowhere to go. Both providers register globally via `GlobalOpenTelemetry`, so once the meter has a working exporter, the OpenFeature `MetricsHook` (next step) finds it without any further plumbing. -Open `adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java`. The `@Bean` method already calls `AutoConfiguredOpenTelemetrySdk.builder()`, which produces an `OpenTelemetry` instance with **both** a `SdkTracerProvider` and a `SdkMeterProvider` โ€” but only the tracer provider has an exporter. The meter provider is told `otel.metrics.exporter=none`. - -Flip `otel.metrics.exporter` to `otlp` so the SDK attaches an `OtlpGrpcMetricExporter`. The cleanest way is to update both the default in `OpenTelemetryConfig.java` and the value in `src/main/resources/application.properties`. While you're there, set `otel.metric.export.interval=10000` so the dashboard updates within ten seconds of new traffic instead of waiting a minute. +`OpenTelemetryConfig.java` and `application.properties` are where the autoconfig defaults live; the LGTM stack accepts OTLP on `:4317` (gRPC). After you wire it, watch how long the dashboard lags new traffic โ€” the SDK's default batch interval will make the next ten minutes harder than they need to be. #### 4b. 
Register `MetricsHook` on the OpenFeature API -The OpenFeature OTel contrib library ships two hooks that turn flag evaluations into telemetry: **`TracesHook`** emits a span event (`feature_flag.evaluation`) on the active span โ€” that's why flag evaluations show up nested inside HTTP request spans in Tempo. **`MetricsHook`** emits four counters per evaluation: `feature_flag_evaluation_requests_total`, `_success_total`, `_error_total`, plus an active-count up/down counter. These power the dashboard panels. - -Open `OpenFeatureConfig.java`. `TracesHook` is already registered; `MetricsHook` is not. `MetricsHook` needs the `OpenTelemetry` instance to grab the meter provider, so inject the bean via constructor injection and call `api.addHooks(new MetricsHook(openTelemetry));` next to the `TracesHook` line. +The OpenFeature OTel contrib library ships two hooks that turn flag evaluations into telemetry: **`TracesHook`** emits a span event on the active span (that's why flag evaluations show up nested inside HTTP request spans in Tempo); **`MetricsHook`** emits four counters per evaluation โ€” `feature_flag_evaluation_requests_total` and friends โ€” that power the dashboard panels. -If you compile and run after this step, the **Fun With Flags โ€” Feature Flag Metrics** dashboard in Grafana stays empty โ€” there is no traffic to drive the counters. Move on. +`OpenFeatureConfig.java` registers `TracesHook` but stops there. `MetricsHook` needs an `OpenTelemetry` handle to find the meter provider you just wired. Even once it's registered, the **Fun With Flags โ€” Feature Flag Metrics** dashboard stays empty until something drives traffic โ€” that's the next step. #### 4c. Author and register your own `ContextSpanHook` -The two contrib hooks tell you *what* happened โ€” which flag, which variant, which reason. The `AuditHook` shipped with this level (carried over from Intermediate) writes the durable archive view to disk. 
What's missing is the **on-call's view in Tempo**: when a span shows `feature_flag.variant=clouded`, the operator can't see *why* without a separate hop into the audit log. Write a third hook that copies the merged eval context attributes onto the active OTel span โ€” same data the audit log records, but visible right next to the variant in the trace UI. +The two contrib hooks tell you *what* happened โ€” which flag, which variant, which reason. The `AuditHook` shipped with this level (carried over from Intermediate) writes the durable archive view to disk. What's missing is the **on-call's view in Tempo**: when a span shows `feature_flag.variant=clouded`, the operator can't see *why* without a separate hop into the audit log. Write a third hook that copies the merged eval context attributes onto the active OTel span as `feature_flag.context.` โ€” same data the audit log records, but visible right next to the variant in the trace UI. The shape is roughly: @@ -187,32 +175,25 @@ before(hookCtx) { } ``` -The `before` callback receives a `HookContext`, and `getCtx()` returns the **merged** evaluation context (global + transaction + invocation) โ€” exactly what drove the flag's resolution, so the attributes you copy off it line up with what the variant decision actually saw. Span attributes go on `Span.current()` because that's the active HTTP request span; the OpenFeature hook fires inside its scope. - -Register it next to `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. Now in Tempo: **Search โ†’ Service: fun-with-flags-java-spring โ†’ +Tag โ†’ `feature_flag.context.dose=underdose`** lights up exactly the requests where a tech mis-dosed, with the resolved variant on the same span event. +The `before` callback receives a `HookContext`, and `getCtx()` returns the **merged** evaluation context (global + transaction + invocation) โ€” exactly what drove the flag's resolution. Span attributes go on the currently active span; the OpenFeature hook fires inside its scope. 
Register it alongside `TracesHook` / `MetricsHook` in `OpenFeatureConfig`. The verifier searches Tempo for `feature_flag.context.dose=underdose` once you're done โ€” that's the smoke signal. > โš ๏ธ **Allowlist, don't iterate.** Use a fixed allowlist (`List.of("species", "country", "dose")`) โ€” never iterate the whole eval context. The merged context routinely carries the OpenFeature `targetingKey`, typically a stable user id that joins to email and account data in real apps. Span attributes are retained for days in Tempo and indexed at scale; once they ship, redacting after the fact is hard. Same discipline `AuditHook` already follows for the audit log, same reason. See [OpenTelemetry's security guidance](https://opentelemetry.io/docs/security/). #### 4d. Turn on the loadgen, find the bad rollout, roll it back -`fractional` is flagd's bucketing operation: given a list of `[variant, percent]` pairs, it deterministically assigns each evaluation to a variant based on a hash of the **`targetingKey`** on the eval context. Same key โ†’ same bucket โ†’ same variant, every request. Different keys spread across the percentages. **If no targeting key is set, every evaluation hashes the same way, every request lands in the same bucket, and the percentages do nothing.** The `SpeciesInterceptor` shipped with this level reads `?userId=` from each request and threads it through as the targetingKey โ€” the lab is already serving fractional rollouts correctly without you touching it. The k6 loadgen exploits this: it generates a fresh random `userId` per request, which means a different targetingKey per request, which means the fractional rollout spreads across the percentages exactly as configured. +`fractional` is flagd's bucketing operation: given a list of `[variant, percent]` pairs, it deterministically assigns each evaluation to a variant based on a hash of the **`targetingKey`** on the eval context. Same key โ†’ same bucket โ†’ same variant. 
Different keys spread across the percentages. **If no targeting key is set, every evaluation hashes the same way, every request lands in the same bucket, and the percentages do nothing.** The `SpeciesInterceptor` shipped with this level reads `?userId=` and threads it through as the targetingKey โ€” the lab is already serving fractional rollouts correctly without you touching it. -Edit `flags.json` in the expert directory and flip `loadgen_active`'s `defaultVariant` from `"off"` to `"on"`. flagd watches the file and picks up changes within a second. The k6 loadgen container has been polling `loadgen_active` every two seconds โ€” it will notice and start hammering `http://workspace:8080/` with five virtual users (the workspace service name resolves inside the compose network). +`flags.json` in the expert directory has a `loadgen_active` flag (off) and the misbehaving `vision_amplifier_v2` flag. flagd watches the file and picks up changes within a second; the k6 loadgen polls `loadgen_active` every two seconds, so flipping it turns on five virtual users hammering the lab. When the loadgen turns on, latency p99 should climb around 200ms and the 5xx rate around 10% โ€” confirmation that something is firing. The dashboard's variant-distribution panel tells you which one. Roll the offender back via the flag definition, watch the dashboard recover. -Now open the dashboard. When the loadgen turns on you should see latency creep up around 200ms and 5xx rate around 10%; if those don't move, the loadgen flag isn't actually live yet. - -That's the diagnosis: the fractional rollout for `vision_amplifier_v2` is inverted. The flag definition currently reads: - -```json -"fractional": [ - ["off", 0], - ["on", 100] -] -``` +**No deploy. No rebuild. No restart of the lab.** -Edit `flags.json` again โ€” flip the percentages so `off` gets `100` and `on` gets `0`. Save. Within one or two seconds flagd reloads. 
Because the targetingKey is sticky per `userId` and the loadgen generates a fresh `userId` per request, every subject re-buckets against the new percentages and the population moves to the safe variant. Watch the latency p99 panel collapse back to baseline and the 5xx rate fall to zero. +#### Helpful Documentation -**No deploy. No rebuild. No restart of the lab.** +- [OpenFeature OTel contrib hooks (Java)](https://github.com/open-feature/java-sdk-contrib/tree/main/hooks/open-telemetry) โ€” where `TracesHook` and `MetricsHook` live, with constructor signatures +- [OpenTelemetry Java SDK autoconfigure](https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure) โ€” every `otel.*` property the autoconfig SDK reads, including the exporter and batch-interval knobs +- [OpenFeature Hooks concept](https://openfeature.dev/docs/reference/concepts/hooks) โ€” the `before` / `after` / `error` / `finallyAfter` lifecycle for authoring your own hook +- [flagd `fractional` operation](https://flagd.dev/reference/custom-operations/fractional-operation/) โ€” the bucketing rule and how it reads the targetingKey +- [OpenTelemetry security guidance](https://opentelemetry.io/docs/security/) โ€” why allowlists on span attributes matter at SIEM scale ### 5. 
Verify Your Solution diff --git a/adventures/planned/00-blind-by-design/expert/Makefile b/adventures/planned/00-blind-by-design/expert/Makefile index fe0ae409..eed6032a 100644 --- a/adventures/planned/00-blind-by-design/expert/Makefile +++ b/adventures/planned/00-blind-by-design/expert/Makefile @@ -1,5 +1,5 @@ # ============================================================================ -# Makefile for Blind by Design - Expert Level: Phase 3 โ€” read the chart +# Makefile for Blind by Design - Expert Level: Read the chart # ============================================================================ # This Makefile provides convenient commands for running the Spring Boot lab # alongside the Grafana LGTM stack and verifying your solution. @@ -9,7 +9,7 @@ # Default target - show help help: - @echo "Blind by Design - Expert Level: Phase 3 โ€” read the chart" + @echo "Blind by Design - Expert Level: Read the chart" @echo "" @echo "Application:" @echo " make lab - Start the Spring Boot lab on :8080" diff --git a/adventures/planned/00-blind-by-design/expert/verify.sh b/adventures/planned/00-blind-by-design/expert/verify.sh index 027462dd..c2fd01e6 100755 --- a/adventures/planned/00-blind-by-design/expert/verify.sh +++ b/adventures/planned/00-blind-by-design/expert/verify.sh @@ -17,7 +17,7 @@ DOCS_URL="https://dynatrace-oss.github.io/open-ecosystem-challenges/00-blind-by- print_header \ 'Adventure 00: Blind by Design' \ - '๐Ÿ”ด Expert: Phase 3 โ€” read the chart' \ + '๐Ÿ”ด Expert: Read the chart' \ 'Verification' check_prerequisites curl jq @@ -219,7 +219,7 @@ else fi print_new_line -print_verification_summary "Phase 3 โ€” read the chart" "$DOCS_URL" "$OBJECTIVE" +print_verification_summary "Read the chart" "$DOCS_URL" "$OBJECTIVE" if [[ $TESTS_FAILED -ne 0 ]]; then exit 1 From e2acdbfdf75a68093dcadc71ecffaf884644927b Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Mon, 18 May 2026 16:32:25 +0200 Subject: [PATCH 7/8] fix(expert): switch Expert to OTel Java Agent + 
verifier hostname/jq fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caught three real broken-state bugs while doing a fresh-codespace run of the Expert level: 1. The trace pipeline never worked. pom.xml imported the OTel instrumentation BOM but did not pull in any instrumentation artifact, so Spring WebMVC never produced server spans. TracesHook and ContextSpanHook both attach to Span.current() — without an active span, every setAttribute call silently disappears, Tempo stays empty, and the doc lead ("spans flowing into Tempo from TracesHook") was not actually true. 2. The verifier could not reach the LGTM stack (Grafana/Prometheus/Tempo). It used http://localhost:NNNN URLs, but verify.sh runs inside the workspace container, where localhost points to the workspace itself. The LGTM service is a sibling compose service, reachable only by service name on the docker-internal network. 3. The vision_amplifier_v2 rollback check was buggy: `jq -r '.value // empty'` treats jq-false as missing because `//` is the alternative operator. So a successfully rolled-back flag (.value=false) was printed as '' and the check failed. Worked by accident in the broken state where the flag resolved to true. Trace-pipeline fix uses the OpenTelemetry Java Agent rather than a Spring Boot starter — the starter at the BOM-pinned 2.14.0 does not support Spring Boot 4 (NoClassDefFoundError on RestClientCustomizer); bumping the BOM is doable but the agent keeps the level focused on OpenFeature hooks rather than OTel SDK plumbing. Concrete changes: - pom.xml: drop opentelemetry-sdk, opentelemetry-exporter-otlp, opentelemetry-sdk-extension-autoconfigure (agent provides all three). Keep opentelemetry-api for the Hook type signatures. Add spring-boot-maven-plugin -javaagent:${OTEL_JAVAAGENT_JAR} so only the forked lab JVM is agent-attached, not Maven itself.
- Delete OpenTelemetryConfig.java entirely — the agent registers the global SDK before main() runs and AutoConfiguredOpenTelemetrySdk.setResultAsGlobal() would just collide with it. - OpenFeatureConfig.java: docstring + TODO comments reflect the new GlobalOpenTelemetry.get() pattern; players fetch the agent-installed OTel handle for MetricsHook rather than constructor-injecting it from a bean that no longer exists. - New otel.properties next to pom.xml: what the player edits to flip the metrics exporter. Pointed at by OTEL_JAVAAGENT_CONFIGURATION_FILE in docker-compose.yml. Same lesson as before (turn on metrics), new mechanic. - application.properties: strip all otel.* lines + add a comment explaining the agent does not read Spring's Environment. - docker-compose.yml: set OTEL_JAVAAGENT_JAR + OTEL_JAVAAGENT_CONFIGURATION_FILE in the workspace env; drop the manual OTEL_* vars that OpenTelemetryConfig used to bridge. - post-create.sh: download opentelemetry-javaagent.jar v2.27.0 into $REPO_ROOT/tools/, idempotent on re-run. - .vscode/launch.json: add vmArgs so F5/Spring Boot Dashboard launches also get the agent. Also rename to "Run the Lab" (Phase 3 was dropped from level title earlier). - devcontainer.json + post-start.sh openFiles: point at otel.properties instead of the deleted OpenTelemetryConfig.java. - .gitignore: add target/ + tools/ so the agent jar and Maven build output stop showing as untracked. - docs/expert.md step 4a + Helpful Documentation: reframe around editing otel.properties and link the agent config reference. - verify.sh: PROMETHEUS_URL / TEMPO_URL / GRAFANA_URL use the lgtm service name. AuditHook hint references the literal '[AUDIT]' format. jq fix for the rollback check (.value instead of .value // empty). Hints clarified to mention service-name vs localhost reachability.
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Simon Schrottner --- .../devcontainer.json | 2 +- .../docker-compose.yml | 10 ++- .../post-create.sh | 15 ++++ .../post-start.sh | 2 +- .gitignore | 6 ++ .../planned/00-blind-by-design/docs/expert.md | 10 +-- .../expert/.vscode/launch.json | 5 +- .../00-blind-by-design/expert/otel.properties | 26 +++++++ .../planned/00-blind-by-design/expert/pom.xml | 29 ++++---- .../demo/java/demo/OpenFeatureConfig.java | 28 ++++--- .../demo/java/demo/OpenTelemetryConfig.java | 73 ------------------- .../src/main/resources/application.properties | 13 +--- .../00-blind-by-design/expert/verify.sh | 24 +++--- 13 files changed, 116 insertions(+), 127 deletions(-) create mode 100644 adventures/planned/00-blind-by-design/expert/otel.properties delete mode 100644 adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java diff --git a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json index 3a9e14e2..e01594be 100644 --- a/.devcontainer/00-blind-by-design_03-expert/devcontainer.json +++ b/.devcontainer/00-blind-by-design_03-expert/devcontainer.json @@ -16,7 +16,7 @@ "codespaces": { "openFiles": [ "adventures/planned/00-blind-by-design/docs/expert.md", - "adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java", + "adventures/planned/00-blind-by-design/expert/otel.properties", "adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java", "adventures/planned/00-blind-by-design/expert/flags.json" ] diff --git a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml index cc866c6b..51633621 100644 --- a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml +++ 
b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml @@ -18,9 +18,13 @@ services: environment: - FLAGD_HOST=flagd - FLAGD_PORT=8013 - - OTEL_EXPORTER_OTLP_ENDPOINT=http://lgtm:4317 - - OTEL_EXPORTER_OTLP_PROTOCOL=grpc - - OTEL_SERVICE_NAME=fun-with-flags-java-spring + # OpenTelemetry Java Agent. post-create.sh downloads the jar; the + # spring-boot-maven-plugin reads OTEL_JAVAAGENT_JAR for its + # <jvmArguments> so only the forked lab JVM is agent-attached (not Maven itself). + # Agent config lives in expert/otel.properties — that's the file + # players edit to toggle the metrics exporter. + - OTEL_JAVAAGENT_JAR=/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}/tools/opentelemetry-javaagent.jar + - OTEL_JAVAAGENT_CONFIGURATION_FILE=/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}/adventures/planned/00-blind-by-design/expert/otel.properties # Trial country of registration. Read by OpenFeatureConfig via # System.getenv("COUNTRY") and put on the global eval context. - COUNTRY=de diff --git a/.devcontainer/00-blind-by-design_03-expert/post-create.sh b/.devcontainer/00-blind-by-design_03-expert/post-create.sh index ee115d49..d2269dfe 100755 --- a/.devcontainer/00-blind-by-design_03-expert/post-create.sh +++ b/.devcontainer/00-blind-by-design_03-expert/post-create.sh @@ -24,6 +24,21 @@ if [[ -f "$CHALLENGE_DIR/mvnw" ]]; then chmod +x "$CHALLENGE_DIR/mvnw" fi +# Download the OpenTelemetry Java Agent. The Spring Boot Maven Plugin +# attaches it via -javaagent (see expert/pom.xml). One jar per Codespace +# — skip if already present so re-runs are cheap. +OTEL_AGENT_VERSION="v2.27.0" +OTEL_AGENT_DIR="$REPO_ROOT/tools" +OTEL_AGENT_JAR="$OTEL_AGENT_DIR/opentelemetry-javaagent.jar" +mkdir -p "$OTEL_AGENT_DIR" +if [[ ! -f "$OTEL_AGENT_JAR" ]]; then + echo "⬇️ Downloading OpenTelemetry Java Agent $OTEL_AGENT_VERSION..."
+ curl -fsSL \ + -o "$OTEL_AGENT_JAR" \ + "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/$OTEL_AGENT_VERSION/opentelemetry-javaagent.jar" \ + || echo "โš ๏ธ Failed to fetch the OpenTelemetry Java Agent โ€” traces and metrics will not flow until the jar is present at $OTEL_AGENT_JAR" +fi + echo "โœจ Pre-warming the Maven dependency cache so the first ./mvnw is fast..." ( cd "$CHALLENGE_DIR" && ./mvnw -q -DskipTests dependency:go-offline ) || \ echo "โš ๏ธ Dependency pre-warm skipped (network or wrapper not ready yet)" diff --git a/.devcontainer/00-blind-by-design_03-expert/post-start.sh b/.devcontainer/00-blind-by-design_03-expert/post-start.sh index 88c02e43..5641b129 100755 --- a/.devcontainer/00-blind-by-design_03-expert/post-start.sh +++ b/.devcontainer/00-blind-by-design_03-expert/post-start.sh @@ -45,7 +45,7 @@ track_codespace_initialized # editor uses internally and works against either the web or desktop client. if command -v code >/dev/null 2>&1; then code "$REPO_ROOT/adventures/planned/00-blind-by-design/docs/expert.md" \ - "$CHALLENGE_DIR/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java" \ + "$CHALLENGE_DIR/otel.properties" \ "$CHALLENGE_DIR/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java" \ "$CHALLENGE_DIR/flags.json" \ 2>/dev/null || true diff --git a/.gitignore b/.gitignore index 95ad82f3..ad234894 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,11 @@ venv/ .pytest_cache/ .mypy_cache/ +# Maven build artifacts +target/ + +# OpenTelemetry Java Agent (downloaded per-Codespace by post-create.sh) +tools/ + # Custom ignores/includes .prompts diff --git a/adventures/planned/00-blind-by-design/docs/expert.md b/adventures/planned/00-blind-by-design/docs/expert.md index e2d7c5dd..b0f9f53a 100644 --- a/adventures/planned/00-blind-by-design/docs/expert.md +++ b/adventures/planned/00-blind-by-design/docs/expert.md @@ -149,17 +149,17 @@ to the LGTM stack on `4317` (gRPC) and 
`4318` (HTTP). Four sub-tasks, in order: wire the meter provider, register the matching `MetricsHook`, write your own `ContextSpanHook` to enrich spans with the flag-decision context, then turn on the loadgen so you can find and roll back the misbehaving fractional rollout. -#### 4a. Wire the OpenTelemetry meter provider +#### 4a. Turn on the metrics exporter -OTel ships two parallel pipelines: **traces** (per-request spans, already flowing into Tempo) and **metrics** (aggregate counters, dead). Each has its own provider, its own SDK, its own exporter. The metrics half is being built via the autoconfig SDK but told to export to `none` โ€” any metrics it records have nowhere to go. Both providers register globally via `GlobalOpenTelemetry`, so once the meter has a working exporter, the OpenFeature `MetricsHook` (next step) finds it without any further plumbing. +OTel ships two parallel pipelines: **traces** (per-request spans, already flowing into Tempo) and **metrics** (aggregate counters, dead). The OpenTelemetry Java Agent attached to the lab JVM has both pipelines plumbed and pointed at the LGTM stack, but its config says `otel.metrics.exporter=none` โ€” anything the meter records goes nowhere. Flip the exporter on and the OpenFeature `MetricsHook` (next step) finds the working meter provider through `GlobalOpenTelemetry` without any further plumbing. -`OpenTelemetryConfig.java` and `application.properties` are where the autoconfig defaults live; the LGTM stack accepts OTLP on `:4317` (gRPC). After you wire it, watch how long the dashboard lags new traffic โ€” the SDK's default batch interval will make the next ten minutes harder than they need to be. +`otel.properties` (next to `pom.xml`) is what the agent reads on startup. While you're there, look at the export interval โ€” the agent's default makes the next ten minutes harder than they need to be. #### 4b. 
Register `MetricsHook` on the OpenFeature API The OpenFeature OTel contrib library ships two hooks that turn flag evaluations into telemetry: **`TracesHook`** emits a span event on the active span (that's why flag evaluations show up nested inside HTTP request spans in Tempo); **`MetricsHook`** emits four counters per evaluation โ€” `feature_flag_evaluation_requests_total` and friends โ€” that power the dashboard panels. -`OpenFeatureConfig.java` registers `TracesHook` but stops there. `MetricsHook` needs an `OpenTelemetry` handle to find the meter provider you just wired. Even once it's registered, the **Fun With Flags โ€” Feature Flag Metrics** dashboard stays empty until something drives traffic โ€” that's the next step. +`OpenFeatureConfig.java` registers `TracesHook` but stops there. `MetricsHook` needs an `OpenTelemetry` handle to find the meter provider โ€” the agent installs one globally at JVM start, so `GlobalOpenTelemetry.get()` is the way to reach it. Even once `MetricsHook` is registered, the **Fun With Flags โ€” Feature Flag Metrics** dashboard stays empty until something drives traffic โ€” that's the next step. #### 4c. 
Author and register your own `ContextSpanHook` @@ -190,7 +190,7 @@ The `before` callback receives a `HookContext`, and `getCtx()` returns the **mer #### Helpful Documentation - [OpenFeature OTel contrib hooks (Java)](https://github.com/open-feature/java-sdk-contrib/tree/main/hooks/open-telemetry) โ€” where `TracesHook` and `MetricsHook` live, with constructor signatures -- [OpenTelemetry Java SDK autoconfigure](https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure) โ€” every `otel.*` property the autoconfig SDK reads, including the exporter and batch-interval knobs +- [OpenTelemetry Java Agent โ€” agent configuration](https://opentelemetry.io/docs/zero-code/java/agent/configuration/) โ€” every `otel.*` key the agent honors, including exporter and batch-interval knobs - [OpenFeature Hooks concept](https://openfeature.dev/docs/reference/concepts/hooks) โ€” the `before` / `after` / `error` / `finallyAfter` lifecycle for authoring your own hook - [flagd `fractional` operation](https://flagd.dev/reference/custom-operations/fractional-operation/) โ€” the bucketing rule and how it reads the targetingKey - [OpenTelemetry security guidance](https://opentelemetry.io/docs/security/) โ€” why allowlists on span attributes matter at SIEM scale diff --git a/adventures/planned/00-blind-by-design/expert/.vscode/launch.json b/adventures/planned/00-blind-by-design/expert/.vscode/launch.json index 5c0005f5..f3151f97 100644 --- a/adventures/planned/00-blind-by-design/expert/.vscode/launch.json +++ b/adventures/planned/00-blind-by-design/expert/.vscode/launch.json @@ -3,12 +3,13 @@ "configurations": [ { "type": "java", - "name": "๐Ÿงช Run the Phase 3 Lab", + "name": "๐Ÿงช Run the Lab", "request": "launch", "mainClass": "dev.openfeature.demo.java.demo.Laboratory", "projectName": "demo", "console": "integratedTerminal", - "cwd": "${workspaceFolder}" + "cwd": "${workspaceFolder}", + "vmArgs": "-javaagent:${env:OTEL_JAVAAGENT_JAR}" } ] } diff --git 
a/adventures/planned/00-blind-by-design/expert/otel.properties b/adventures/planned/00-blind-by-design/expert/otel.properties new file mode 100644 index 00000000..694e36b0 --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/otel.properties @@ -0,0 +1,26 @@ +# OpenTelemetry Java Agent configuration. +# +# Read by the agent on JVM startup (via OTEL_JAVAAGENT_CONFIGURATION_FILE +# in docker-compose.yml). Edit + restart the lab to apply. +# +# Reference: https://opentelemetry.io/docs/zero-code/java/agent/configuration/ + +otel.service.name=fun-with-flags-java-spring +otel.exporter.otlp.endpoint=http://lgtm:4317 +otel.exporter.otlp.protocol=grpc + +# Traces flow into Tempo. Auto-instrumentation wraps every HTTP request +# in a server span, so the OpenFeature TracesHook + ContextSpanHook +# (once you write it) have an active span to attach to. +otel.traces.exporter=otlp + +# TODO Phase 3 task: flip from "none" to "otlp" so flag-evaluation +# metrics start exporting to the LGTM stack and the Feature Flag Metrics +# dashboard renders. +otel.metrics.exporter=none + +# Once metrics are on, set a short export interval so the dashboard +# updates within ten seconds of new traffic instead of waiting a minute. 
+otel.metric.export.interval=10000 + +otel.logs.exporter=none diff --git a/adventures/planned/00-blind-by-design/expert/pom.xml b/adventures/planned/00-blind-by-design/expert/pom.xml index 69455a5b..67515fad 100644 --- a/adventures/planned/00-blind-by-design/expert/pom.xml +++ b/adventures/planned/00-blind-by-design/expert/pom.xml @@ -70,23 +70,17 @@ 3.2.1 - + io.opentelemetry opentelemetry-api - - io.opentelemetry - opentelemetry-sdk - - - io.opentelemetry - opentelemetry-exporter-otlp - - - io.opentelemetry - opentelemetry-sdk-extension-autoconfigure - @@ -94,6 +88,15 @@ org.springframework.boot spring-boot-maven-plugin + + + -javaagent:${env.OTEL_JAVAAGENT_JAR} + diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java index 361a7005..1c341895 100644 --- a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java +++ b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenFeatureConfig.java @@ -16,15 +16,19 @@ import java.util.Optional; /** - * Wires the OpenFeature client to a remote flagd container ({@code Resolver.RPC}, - * default host {@code localhost:8013}) and registers the cross-cutting hooks. + * Wires the OpenFeature client to a remote flagd container ({@code Resolver.RPC}) + * and registers the cross-cutting hooks. * - *

Half-wired on purpose: the {@link TracesHook} reads the current span from - * the global tracer provider, so flag evaluations show up in Tempo as soon as - * the OpenTelemetry SDK is initialized. The matching {@code MetricsHook} is NOT - * registered here โ€” the meter provider is not exporting yet and the - * "Fun With Flags" dashboard panels in Grafana stay dark. Finishing the wiring - * is the participant's first task in this level.

+ *

OpenTelemetry SDK setup is provided by the OpenTelemetry Java Agent + * (attached via {@code -javaagent} โ€” see {@code pom.xml} and {@code otel.properties}). + * The agent installs the global {@link io.opentelemetry.api.OpenTelemetry} instance + * before {@code main()} runs, so {@link io.opentelemetry.api.GlobalOpenTelemetry#get()} + * returns a working SDK throughout this class.

+ * + *

Half-wired on purpose: the {@link TracesHook} is registered, so flag + * evaluations show up as span events in Tempo. The matching + * {@code MetricsHook} is NOT registered โ€” until it is, the "Fun With Flags" + * dashboard panels in Grafana stay dark.

*/ @Configuration public class OpenFeatureConfig implements WebMvcConfigurer { @@ -46,9 +50,11 @@ public void initProvider() { api.addHooks(new AuditHook()); api.addHooks(new TracesHook()); - // TODO Phase 3 task #1: register the matching MetricsHook here once - // the meter provider has been wired up in OpenTelemetryConfig. Without - // it the Grafana feature-flag dashboard cannot draw its panels. + // TODO Phase 3 task #1: register the matching MetricsHook here. Grab + // the OTel handle the agent installed via GlobalOpenTelemetry.get() + // โ€” the agent already wired the SDK and exporter before main() ran, + // but the metrics pipeline stays inert until you also turn on the + // metrics exporter in otel.properties (next to pom.xml). // // TODO Phase 3 task #2: write a small ContextSpanHook that copies the // merged evaluation context attributes (species, country, dose) onto the diff --git a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java b/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java deleted file mode 100644 index 80f21a47..00000000 --- a/adventures/planned/00-blind-by-design/expert/src/main/java/dev/openfeature/demo/java/demo/OpenTelemetryConfig.java +++ /dev/null @@ -1,73 +0,0 @@ -package dev.openfeature.demo.java.demo; - -import io.opentelemetry.api.OpenTelemetry; -import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; -import jakarta.annotation.PreDestroy; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; - -/** - * Half-wired OpenTelemetry SDK. - * - *

Traces ARE exported to the LGTM stack via OTLP/gRPC at - * {@code http://localhost:4317}. The {@code TracesHook} registered in - * {@link OpenFeatureConfig} attaches every flag evaluation as a span event - * inside the active HTTP request span โ€” open Grafana โ†’ Explore โ†’ Tempo and - * search for service {@code fun-with-flags-java-spring} to see them.

- * - *

Metrics are NOT exported yet. The autoconfigure module is told - * {@code otel.metrics.exporter=none}, which means the {@code SdkMeterProvider} - * either is not created or has no exporter attached, so the Grafana - * "Fun With Flags โ€” Feature Flag Metrics" dashboard stays empty. To finish - * Phase 3 the participant must:

- * - *
    - *
  1. Switch {@code otel.metrics.exporter} to {@code otlp} and set a - * reasonable {@code otel.metric.export.interval} so Mimir receives - * evaluation metrics.
  2. - *
  3. Register the matching - * {@code dev.openfeature.contrib.hooks.otel.MetricsHook} on the - * OpenFeature API in {@link OpenFeatureConfig#initProvider()}.
  4. - *
- */ -@Configuration -public class OpenTelemetryConfig { - - private AutoConfiguredOpenTelemetrySdk autoConfigured; - - @Bean - public OpenTelemetry openTelemetry( - @Value("${otel.service.name:fun-with-flags-java-spring}") String serviceName, - @Value("${otel.exporter.otlp.endpoint:http://localhost:4317}") String otlpEndpoint, - @Value("${otel.exporter.otlp.protocol:grpc}") String otlpProtocol, - @Value("${otel.traces.exporter:otlp}") String tracesExporter, - // Phase 3 TODO: flip this to "otlp" so the meter provider exports. - @Value("${otel.metrics.exporter:none}") String metricsExporter, - @Value("${otel.logs.exporter:none}") String logsExporter) { - // Expose configured values via system properties so the SDK - // autoconfigure module picks them up regardless of how the app - // was launched. - System.setProperty("otel.service.name", serviceName); - System.setProperty("otel.exporter.otlp.endpoint", otlpEndpoint); - System.setProperty("otel.exporter.otlp.protocol", otlpProtocol); - System.setProperty("otel.traces.exporter", tracesExporter); - System.setProperty("otel.metrics.exporter", metricsExporter); - System.setProperty("otel.logs.exporter", logsExporter); - // Phase 3 TODO: once metrics are flipped on, surface a sensible - // export interval here, e.g. 10000 ms, so the dashboard updates - // within ten seconds of new traffic. 
- - autoConfigured = AutoConfiguredOpenTelemetrySdk.builder() - .setResultAsGlobal() - .build(); - return autoConfigured.getOpenTelemetrySdk(); - } - - @PreDestroy - public void shutdown() { - if (autoConfigured != null) { - autoConfigured.getOpenTelemetrySdk().close(); - } - } -} diff --git a/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties b/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties index 186c82e1..e46379db 100644 --- a/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties +++ b/adventures/planned/00-blind-by-design/expert/src/main/resources/application.properties @@ -1,11 +1,6 @@ spring.application.name=demo -# Phase 3 OpenTelemetry configuration (OTLP gRPC exporter to the local LGTM stack) -otel.exporter.otlp.endpoint=http://localhost:4317 -otel.exporter.otlp.protocol=grpc -otel.traces.exporter=otlp -# TODO Phase 3 task: flip this from "none" to "otlp" so flag-evaluation -# metrics start exporting to the LGTM stack. -otel.metrics.exporter=none -otel.logs.exporter=none -otel.service.name=fun-with-flags-java-spring +# OpenTelemetry SDK setup is provided by the OpenTelemetry Java Agent, +# which reads its configuration from otel.properties (next to pom.xml). +# Spring's Environment is not on the agent's lookup path, so do NOT put +# otel.* keys in this file โ€” they will be silently ignored. diff --git a/adventures/planned/00-blind-by-design/expert/verify.sh b/adventures/planned/00-blind-by-design/expert/verify.sh index c2fd01e6..4a45a914 100755 --- a/adventures/planned/00-blind-by-design/expert/verify.sh +++ b/adventures/planned/00-blind-by-design/expert/verify.sh @@ -29,12 +29,16 @@ TESTS_FAILED=0 FAILED_CHECKS=() APP_URL="http://localhost:8080" -# flagd is on the docker-internal network only โ€” verify.sh runs from -# the workspace container's terminal, where the service name resolves. +# verify.sh runs from inside the workspace container. 
The lab is in the +# same container, so localhost:8080 works — but flagd and the LGTM stack +# are sibling compose services, reachable only by service name on the +# docker-internal network. Codespaces forwards the host ports onto the +# developer's laptop (so the browser sees localhost:3000), but those +# forwards don't loop back into the workspace container. FLAGD_HTTP="http://flagd:8013" -PROMETHEUS_URL="http://localhost:9090" -TEMPO_URL="http://localhost:3200" -GRAFANA_URL="http://localhost:3000" +PROMETHEUS_URL="http://lgtm:9090" +TEMPO_URL="http://lgtm:3200" +GRAFANA_URL="http://lgtm:3000" # ---- 1. App reachable ------------------------------------------------------ # Lean on test_http_endpoint from lib/scripts/http.sh — handles connection @@ -68,7 +72,7 @@ if curl -fsS --max-time 5 "$GRAFANA_URL/api/health" >/dev/null 2>&1; then TESTS_PASSED=$((TESTS_PASSED + 1)) else print_error_indent "Grafana is not reachable at $GRAFANA_URL" - print_hint "The LGTM stack is a sibling devcontainer service (lgtm). Reopen the Codespace if it is not running." + print_hint "The LGTM stack is a sibling compose service named 'lgtm'. From the workspace container use lgtm:3000 (not localhost). If it's still unreachable, the sibling container has not started — reopen the Codespace." TESTS_FAILED=$((TESTS_FAILED + 1)) FAILED_CHECKS+=("lgtm_reachable") fi @@ -87,7 +91,9 @@ if [[ -z "$ROLLOUT_RESPONSE" ]]; then TESTS_FAILED=$((TESTS_FAILED + 1)) FAILED_CHECKS+=("vision_amplifier_v2_rollback") else - ROLLOUT_VALUE=$(echo "$ROLLOUT_RESPONSE" | jq -r '.value // empty') + # NB: do not use `.value // empty` — `//` treats jq-false as missing, + # so a successfully rolled-back flag (.value=false) would print as ''. 
+ ROLLOUT_VALUE=$(echo "$ROLLOUT_RESPONSE" | jq -r '.value') if [[ "$ROLLOUT_VALUE" == "false" ]]; then print_info_indent "✓ vision_amplifier_v2 evaluates to false (rollout has been rolled back)" TESTS_PASSED=$((TESTS_PASSED + 1)) @@ -108,7 +114,7 @@ PROM_RESPONSE=$(curl -fsS --max-time 5 -G "$PROMETHEUS_URL/api/v1/query" \ if [[ -z "$PROM_RESPONSE" ]]; then print_error_indent "Could not query Prometheus at $PROMETHEUS_URL" - print_hint "The grafana/otel-lgtm container exposes Prometheus on port 9090. If port 9090 is not forwarded, the lgtm sibling container has not started — reopen the Codespace." + print_hint "Prometheus runs inside the lgtm sibling compose service on port 9090 (reachable as lgtm:9090 from the workspace container). If it's still unreachable, the lgtm container has not started — reopen the Codespace." TESTS_FAILED=$((TESTS_FAILED + 1)) FAILED_CHECKS+=("prometheus_metrics") else @@ -135,7 +141,7 @@ TEMPO_RESPONSE=$(curl -fsS --max-time 5 -G "$TEMPO_URL/api/search" \ if [[ -z "$TEMPO_RESPONSE" ]]; then print_error_indent "Could not query Tempo at $TEMPO_URL" - print_hint "The grafana/otel-lgtm container exposes Tempo on port 3200. If port 9090 is not forwarded, the lgtm sibling container has not started — reopen the Codespace." + print_hint "Tempo runs inside the lgtm sibling compose service on port 3200 (reachable as lgtm:3200 from the workspace container). If it's still unreachable, the lgtm container has not started — reopen the Codespace." 
TESTS_FAILED=$((TESTS_FAILED + 1)) FAILED_CHECKS+=("tempo_traces") else From 84e82e2ab19d4cc6362b97e5d541e1b1e08eeaf2 Mon Sep 17 00:00:00 2001 From: Simon Schrottner Date: Mon, 18 May 2026 16:42:52 +0200 Subject: [PATCH 8/8] fix(expert): provision the Feature Flag Metrics dashboard correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grafana never showed the Fun With Flags dashboard because the docker-compose mount was wrong: - Mounted to /otel-lgtm/grafana/dashboards (Grafana's legacy default), which otel-lgtm does not scan. - No provisioning YAML pointing at the dashboard directory at all, so even if the path had been right, Grafana wouldn't have known to load anything from it. otel-lgtm 0.26.0 reads dashboard providers from /otel-lgtm/grafana/conf/provisioning/dashboards/*.yaml and loads dashboards from whatever path each provider references. Add the provisioning YAML next to the dashboard JSON, mount both to the right paths. Existing players need to either Rebuild Container in their Codespace or `docker compose up -d --force-recreate lgtm` — volume mount changes do not apply hot. 
Signed-off-by: Simon Schrottner --- .../00-blind-by-design_03-expert/docker-compose.yml | 6 +++++- .../expert/dashboards/provisioning.yaml | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 adventures/planned/00-blind-by-design/expert/dashboards/provisioning.yaml diff --git a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml index 51633621..bbe9a14e 100644 --- a/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml +++ b/.devcontainer/00-blind-by-design_03-expert/docker-compose.yml @@ -54,7 +54,11 @@ services: - GF_SECURITY_ADMIN_PASSWORD=admin volumes: - ../..:/workspaces/${localWorkspaceFolderBasename:-open-ecosystem-challenges}:ro - - ../../adventures/planned/00-blind-by-design/expert/dashboards:/otel-lgtm/grafana/dashboards:ro + # Dashboard auto-provisioning: otel-lgtm scans /otel-lgtm/grafana/conf/provisioning/dashboards/ + # for provider YAMLs, then loads dashboard JSONs from whatever path + # the provider references. We point at .../custom for both. + - ../../adventures/planned/00-blind-by-design/expert/dashboards/provisioning.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/fun-with-flags.yaml:ro + - ../../adventures/planned/00-blind-by-design/expert/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/custom:ro loadgen: image: grafana/k6:1.7.1 diff --git a/adventures/planned/00-blind-by-design/expert/dashboards/provisioning.yaml b/adventures/planned/00-blind-by-design/expert/dashboards/provisioning.yaml new file mode 100644 index 00000000..5f587efb --- /dev/null +++ b/adventures/planned/00-blind-by-design/expert/dashboards/provisioning.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: 'Fun With Flags' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /otel-lgtm/grafana/conf/provisioning/dashboards/custom + foldersFromFilesStructure: false