diff --git a/dev-tools/cluster/Dockerfile b/dev-tools/cluster/Dockerfile new file mode 100644 index 0000000000..c33f1dfac8 --- /dev/null +++ b/dev-tools/cluster/Dockerfile @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Runtime image for a local Storm cluster, built FROM the locally compiled +# distribution so it runs *your* code, not a release from Docker Hub. +# +# The build context must be the directory that holds the distribution tarball, +# i.e. storm-dist/binary/final-package/target. Build with: +# +# docker build -f dev-tools/cluster/Dockerfile \ +# --build-arg STORM_VERSION=3.0.0-SNAPSHOT \ +# -t storm-local:3.0.0-SNAPSHOT \ +# storm-dist/binary/final-package/target +# +# or simply `docker compose up --build` from this directory. +FROM eclipse-temurin:21-jre-jammy + +ARG STORM_VERSION=3.0.0-SNAPSHOT + +# The `storm` CLI is a Python 3 script; daemons shell out to `ps` (procps). +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + procps \ + && rm -rf /var/lib/apt/lists/* + +ENV STORM_HOME=/opt/storm +ENV PATH=${STORM_HOME}/bin:${PATH} + +# ADD auto-extracts the tarball; then move the versioned dir to a stable path. +ADD apache-storm-${STORM_VERSION}.tar.gz /opt/ +RUN mv /opt/apache-storm-${STORM_VERSION} ${STORM_HOME} \ + && groupadd -r storm && useradd -r -g storm -d ${STORM_HOME} storm \ + && mkdir -p /data/storm \ + && chown -R storm:storm ${STORM_HOME} /data/storm + +USER storm +WORKDIR ${STORM_HOME} + +# 6627 nimbus thrift | 8080 UI | 8000 logviewer | 6700-6703 worker slots +EXPOSE 6627 8080 8000 6700 6701 6702 6703 + +# Default daemon; overridden per-service in docker-compose.yml. +CMD ["storm", "nimbus"] diff --git a/dev-tools/cluster/Dockerfile.dockerignore b/dev-tools/cluster/Dockerfile.dockerignore new file mode 100644 index 0000000000..a12edc5157 --- /dev/null +++ b/dev-tools/cluster/Dockerfile.dockerignore @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BuildKit honours .dockerignore next to the Dockerfile. Patterns +# are relative to the build context (storm-dist/binary/final-package/target), +# so we exclude everything and re-include only the tarball the Dockerfile ADDs. +# This keeps the build context to ~the tarball instead of the whole target dir +# (which also holds the .zip and the extracted distribution). +* +!apache-storm-*.tar.gz diff --git a/dev-tools/cluster/FileReadWordCountTopo-cluster.yaml b/dev-tools/cluster/FileReadWordCountTopo-cluster.yaml new file mode 100644 index 0000000000..9c25c465a3 --- /dev/null +++ b/dev-tools/cluster/FileReadWordCountTopo-cluster.yaml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Config for FileReadWordCountTopo on the local docker cluster. +spout.count: 1 +splitter.count: 2 +counter.count: 2 +input.file: "/topology/randomwords.txt" + +# Two workers force the spout and bolts onto separate worker processes (and, +# with two supervisors, separate containers), so inter-worker tuple traffic +# crosses the network. +topology.workers: 2 + +# Enable the EWMA jitter estimators (RFC 3550) in TaskMetrics so the +# __execute-jitter / __process-jitter / __complete-jitter metrics are emitted. +topology.stats.ewma.enable: true +topology.stats.ewma.smoothing.factor: 0.0625 diff --git a/dev-tools/cluster/README.md b/dev-tools/cluster/README.md new file mode 100644 index 0000000000..3350e2602c --- /dev/null +++ b/dev-tools/cluster/README.md @@ -0,0 +1,265 @@ +# Local Storm cluster (cluster mode) with Docker Compose + +Brings up a real, distributed Storm cluster on your machine — **dev ZooKeeper + +Nimbus + two Supervisors + UI** plus an observability stack (**Pushgateway + +Prometheus + Grafana**). + +Two supervisors with **2 worker slots each (4 slots total)** are intentional: a +topology submitted with `topology.workers >= 2` lands one worker per supervisor +container, so its inter-worker tuple traffic actually crosses the network — the +only path where tuple serialization happens. The +slot count lives in `storm.yaml` (`supervisor.slots.ports`); raise it there if +you want more workers. + +## Architecture + +All containers share one Docker bridge network (`storm`) and resolve each other +by service name. Host-published ports are shown in `()`. The metrics plane is +detailed under [Metrics & reports](#metrics--reports-prometheus--grafana). + +```text + host: docker compose exec nimbus storm jar ... + | submit topology + ========================|=========== docker network: storm ============== + v + +-----------+ +---------------+ +-----------+ + | ZooKeeper |<---->| Nimbus :6627 |<---->| UI :8080 | + +-----------+ +-------+-------+ +-----------+ + | assign workers + +---------------+----------------+ + v v + +--------------------+ tuples +--------------------+ + | supervisor1 |<============>| supervisor2 | + | worker :6700 | (network hop)| worker :6700 | + +--------------------+ +--------------------+ + + metrics: Nimbus --> Pushgateway and workers --> graphite-exporter, + both scraped by Prometheus :9090 --> Grafana :3000 +``` + +## Layout + +| File | Purpose | +|------|---------| +| `Dockerfile` | Runtime image `FROM eclipse-temurin:21-jre`, unpacks the built dist into `/opt/storm`. | +| `Dockerfile.dockerignore` | Keeps the build context to just the dist tarball. | +| `storm.yaml` | Cluster config (ZK + Nimbus seeds + slots), bind-mounted into every daemon. | +| `docker-compose.yml` | dev ZooKeeper, Nimbus, supervisor1, supervisor2, UI, Pushgateway, graphite-exporter, Prometheus, Grafana. | +| `FileReadWordCountTopo-cluster.yaml` | Sample topology config for the smoke test below. | +| `storm-client.yaml` | Client config to submit topologies from the host (e.g. from IntelliJ). | +| `build-image.sh` | One command: rebuild the dist from current source (lib **and** lib-worker) and the Docker image. | +| `prepare-extlib.sh` | Builds the Prometheus reporter + deps into `extlib-daemon/` (mounted on Nimbus). | +| `netsim.sh` | Inject network delay/jitter/loss between worker hosts (tc/netem) to test the network path. | +| `prometheus/prometheus.yml` | Prometheus scrape config (Pushgateway + graphite-exporter). | +| `graphite/graphite-mapping.yml` | Maps Storm metrics-v2 Graphite names into labelled Prometheus series. | +| `grafana/` | Provisioned datasource + the **Storm Cluster** and **Storm Metrics v2** dashboards. | + +## Prerequisites + +> **Platform:** Linux or macOS (or Windows via WSL2). The helper scripts are +> bash and call `mvn` (not `mvn.cmd`), and `netsim.sh` relies on Linux +> `tc`/`netem`. Native Windows is not supported yet. + +1. Build the distribution **and** the Docker image from the current source — one command: + + ```bash + dev-tools/cluster/build-image.sh + ``` + + It rebuilds `storm-client-bin` + `final-package` (so both the daemon `lib` and + the worker `lib-worker` classpaths reflect your code), then builds the + `storm-local` image. Building only `final-package -am` is **not** enough: it + leaves `lib-worker` (the worker classpath) stale, so workers run old code. + +2. Stage the Prometheus reporter (jar + runtime deps) onto Nimbus's classpath: + + ```bash + cd dev-tools/cluster + ./prepare-extlib.sh # fills extlib-daemon/ (git-ignored build artifacts) + ``` + +The Storm version is taken from the repo root `pom.xml` (`project.version`). +`build-image.sh` reads it and writes `dev-tools/cluster/.env`; the compose file +references it as `${STORM_VERSION}` (image tag, build arg, and the storm-perf jar +path), so everything tracks the pom automatically. To pin a different version, +run with `STORM_VERSION=x.y.z` or edit `.env`. + +## Run + +```bash +cd dev-tools/cluster +docker compose up --build -d # build the image and start everything +docker compose ps # all services Up, zookeeper healthy +docker compose logs -f nimbus # follow a daemon +``` + +| Service | URL | Notes | +|---------|-----|-------| +| Storm UI | http://localhost:8080 | topologies, workers, capacity | +| Grafana | http://localhost:3000 | login `admin` / `admin`; **Storm Cluster** + **Storm Metrics v2** dashboards | +| Prometheus | http://localhost:9090 | raw queries / targets | +| Nimbus Thrift | localhost:6627 | submit topologies from the host | + +Tear down — **use `-v`** so the metrics are deleted too: + +```bash +docker compose down -v +``` + +Prometheus and Grafana store their data on disk in the named volumes +`prometheus-data` / `grafana-data` (Prometheus retention is capped at +`--storage.tsdb.retention.time=2h` to keep them small). A plain `docker compose +down` keeps the containers' networks gone but **leaves those volumes on disk**; +`down -v` is what deletes them. The datasource and dashboards are re-provisioned +from files on the next `up`, so wiping the volumes loses only metrics history +and ad-hoc Grafana UI edits. + +## Smoke test: submit a topology + +The Nimbus container has the `storm-perf` jar, a sample input file and the +config mounted under `/topology`. Submit the word-count topology (runs ~120s): + +```bash +docker compose exec -d nimbus \ + storm jar /topology/storm-perf.jar \ + org.apache.storm.perf.FileReadWordCountTopo 120 /topology/topo.yaml +``` + +Watch it in the UI, or via REST: + +```bash +curl -s http://localhost:8080/api/v1/topology/summary | python3 -m json.tool +``` + +`FileReadWordCountTopo-cluster.yaml` sets `topology.workers: 2`, so the two +workers land on `supervisor1` and `supervisor2` — verify with the topology page +(Worker Resources) that the two workers sit on different hosts. + +It is a 3-stage pipeline; spreading it across two workers makes at least one +edge a network hop (where tuple serialization happens): + +```text + FileReadSpout --shuffle (network hop)--> SplitSentenceBolt --fieldsGrouping--> CountBolt + (emits text lines) (emits words) (counts) +``` + +## Simulating network latency and jitter + +Inter-worker traffic between containers is near-instant (~0.05 ms), which hides +the network cost. `netsim.sh` adds realistic +latency/jitter/loss to the worker hosts with Linux `tc`/`netem`. The Storm image +has no `tc`, so the script injects the qdisc from a throwaway helper container +sharing each supervisor's network namespace — no image rebuild needed. + +```bash +./netsim.sh add 50 10 0 # 50 ms delay, 10 ms jitter, 0% loss on each supervisor +./netsim.sh ping # verify: worker<->worker RTT jumps to ~100 ms (2x egress) +./netsim.sh show # inspect the active qdisc +./netsim.sh clear # remove shaping +``` + +netem shapes **all** egress from each supervisor (inter-worker tuples *and* +heartbeats to Nimbus/ZK), so keep the delay moderate (≤ ~150 ms) or heartbeats +may time out. With both supervisors delayed by `D`, worker round-trip latency is +~`2*D`. + +> **Why the script sets a huge queue `limit`.** netem's default queue is only +> 1000 packets. Under a high-throughput perf topology that buffer overflows at +> the added delay and drops tuples even with `loss 0%`, which collapses TCP and +> back-pressures the spout to **zero throughput** (you'll see `transferred 0`). +> `netsim.sh` therefore sets `limit 1000000` (override as the 4th arg) so the +> queue can hold `rate * delay` without dropping. If you ever apply `tc netem` +> by hand, remember to add a large `limit`. + +## Metrics & reports (Prometheus + Grafana) + +Two metric paths feed Prometheus, both push-based (so ephemeral workers need no +scrape targets), and Grafana auto-loads a dashboard for each: + +```text + Nimbus --push--> Pushgateway:9091 -----------------scrape-------------+ + v + supervisor1 worker --+ Prometheus:9090 --> Grafana:3000 + +-- graphite:9109 --> graphite-exporter:9108 --scrape--+ | + supervisor2 worker --+ +--> "Storm Cluster" + +--> "Storm Metrics v2" +``` + + +1. **Cluster summary** — *Storm Cluster* dashboard + `Nimbus → Pushgateway → Prometheus`. Nimbus runs Storm's + `PrometheusPreparableReporter` (enabled via `-c` overrides in + `docker-compose.yml`, jars from `extlib-daemon/`) and pushes cluster-summary + metrics every 10s. Prometheus scrapes the Pushgateway (`honor_labels` keeps + `job="nimbus"`). +2. **Metrics v2 (per-worker/topology)** — *Storm Metrics v2* dashboard + `workers → graphite-exporter → Prometheus`. Every worker runs the + `GraphiteStormReporter` (configured in `storm.yaml` under + `topology.metrics.reporters`) and emits its full Dropwizard metric set in + Graphite plaintext to the graphite-exporter, which `graphite-mapping.yml` + turns into labelled `storm_worker{...}` / `storm_topology{...}` series. + +The pushed series are cluster-level (not per-topology): `summary_cluster_num_supervisors`, +`summary_cluster_num_topologies`, `summary_cluster_num_total_workers`, +`summary_cluster_num_total_used_workers`, `nimbus_total_cpu`, +`nimbus_available_cpu_non_negative`, `nimbus_total_memory`, and the +`summary_topologies_assigned_*` histograms. Quick check: + +```bash +curl -s 'http://localhost:9090/api/v1/query?query=summary_cluster_num_total_workers' +``` + +### Storm Metrics v2 dashboard + +Metrics v2 are emitted **per task** (`org.apache.storm.metrics2.TaskMetrics`), so +the dashboard is filtered by a chained `topology → host → component → task` +variable set, and every series carries `topology_id`, `host`, `component`, +`task`, `port` labels. + +`graphite-mapping.yml` models `TaskMetrics` explicitly into clean metrics. Each +is per `(component, task)`; the `key` label is the metric key — the **own output +stream** for emit/transfer, or the **`sourceComponent:sourceStream`** for the +input metrics (execute/ack/fail/latency): + +| Prometheus metric | TaskMetrics source | type | +|---|---|---| +| `storm_emit_rate` / `storm_emit_total` | `__emit-count` (`.m1_rate` / `.count`) | RateCounter | +| `storm_transfer_rate` / `storm_transfer_total` | `__transfer-count` | RateCounter | +| `storm_execute_rate` / `storm_execute_total` | `__execute-count` | RateCounter | +| `storm_ack_rate` / `storm_ack_total` | `__ack-count` | RateCounter | +| `storm_fail_rate` / `storm_fail_total` | `__fail-count` | RateCounter | +| `storm_execute_latency_ms` | `__execute-latency` | RollingAverageGauge (ms) | +| `storm_process_latency_ms` | `__process-latency` | RollingAverageGauge (ms) | +| `storm_complete_latency_ms` | `__complete-latency` (spout) | RollingAverageGauge (ms) | +| `storm_execute_jitter_ms` | `__execute-jitter` | EwmaGauge (ms) | +| `storm_process_jitter_ms` | `__process-jitter` | EwmaGauge (ms) | +| `storm_complete_jitter_ms` | `__complete-jitter` (spout) | EwmaGauge (ms) | +| `storm_capacity` | `__capacity` (over all streams) | RollingAverageGauge (0..1) | + +Counts/rates are **sampling-scaled** (`topology.stats.sample.rate`), so they +estimate true values; `.m1_rate` is tuples/s averaged over 1 minute. The +**jitter** metrics are RFC 3550 EWMA latency-variation estimators and only flow +when `topology.stats.ewma.enable: true` (set in `storm.yaml`) — pair them with +`netsim.sh` to see network jitter propagate into per-task latency variation. + +Everything else falls through to generic series, still fully queryable: +- `storm_worker{metric=...}` — `__skipped-*`, `__backpressure-last-overflow-count`, + `__send-iconnection-*`, `doHeartbeat-calls`. +- `storm_topology{component="__system"}` — per-worker JVM (`task=-1`): + `memory.heap.*`, `memory.non-heap.*`, `memory.pools.*`, `GC.*.{count,time}`, + `threads.*`. + +List everything currently flowing: + +```bash +curl -s http://localhost:9090/api/v1/label/metric/values | python3 -m json.tool +``` + +## Notes + +- The bundled `storm dev-zookeeper` is single-node and for development only; it + does not snapshot. Swap in a real ZooKeeper for anything beyond local testing. +- Heaps are kept small in `storm.yaml` so the whole cluster fits on a laptop. + Bump `worker.childopts` / `*.childopts` for heavier topologies. +- To run a different topology, mount its jar into the `nimbus` service (see the + `volumes:` of that service) and `storm jar` it the same way. diff --git a/dev-tools/cluster/build-image.sh b/dev-tools/cluster/build-image.sh new file mode 100755 index 0000000000..7b339e715e --- /dev/null +++ b/dev-tools/cluster/build-image.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# One command to (re)build the storm-local Docker image from the CURRENT source. +# +# Why this exists: the binary distribution puts the daemon classpath in `lib` +# (from final-package's Maven deps) but the WORKER classpath in `lib-worker` +# (copied via a fileSet from the storm-client-bin module's target dir). Building +# only `final-package -am` refreshes `lib` but leaves `lib-worker` stale, so +# workers keep running old code. This script rebuilds BOTH storm-client-bin and +# final-package (with -am to refresh their Maven deps), so the tarball -- and the +# image built from it -- reflect the current source on every classpath. +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${HERE}/../.." && pwd)" + +# Single source of truth: the project version in the repo root pom.xml. (The +# pom's own child, not the version.) Override with STORM_VERSION. +POM_VERSION="$(python3 -c "import xml.etree.ElementTree as ET; ns={'m':'http://maven.apache.org/POM/4.0.0'}; print(ET.parse('${REPO_ROOT}/pom.xml').getroot().find('m:version', ns).text)")" +VERSION="${STORM_VERSION:-${POM_VERSION}}" +IMAGE="${IMAGE:-storm-local:${VERSION}}" +TARBALL="${REPO_ROOT}/storm-dist/binary/final-package/target/apache-storm-${VERSION}.tar.gz" + +# Keep docker-compose's .env in sync so the image tag / build arg / jar path match. +printf '# Auto-generated by build-image.sh from the repo root pom.xml.\nSTORM_VERSION=%s\n' "${VERSION}" > "${HERE}/.env" +echo "==> Storm version: ${VERSION} (from pom.xml; wrote ${HERE}/.env)" + +cd "${REPO_ROOT}" + +echo "==> [1/2] Building distribution (storm-client-bin + final-package, current source)" +mvn -q clean install -DskipTests -Pdist -Dgpg.skip=true \ + -pl storm-dist/binary/storm-client-bin,storm-dist/binary/final-package -am + +echo "==> [2/2] Building Docker image ${IMAGE} from the fresh tarball" +docker build -f "${HERE}/Dockerfile" \ + --build-arg "STORM_VERSION=${VERSION}" \ + -t "${IMAGE}" \ + "${REPO_ROOT}/storm-dist/binary/final-package/target" + +echo +echo "Done. Image ${IMAGE} is up to date with the current source." +echo "Next: cd ${HERE} && docker compose up -d (recreates containers with the new image)" diff --git a/dev-tools/cluster/docker-compose.yml b/dev-tools/cluster/docker-compose.yml new file mode 100644 index 0000000000..421fc9161e --- /dev/null +++ b/dev-tools/cluster/docker-compose.yml @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A local Storm cluster in cluster (distributed) mode: dev ZooKeeper + Nimbus + +# two Supervisors + UI, plus a Pushgateway + Prometheus + Grafana stack for +# metrics. Built from the locally compiled distribution so it runs your code. +# Prereq: run ./prepare-extlib.sh once. Start with: docker compose up --build + +# Build the Storm image once; every storm service reuses it. +# STORM_VERSION must match the project version in the repo root pom.xml. It is +# read from .env (written by build-image.sh from the pom); the default below is +# the fallback when .env is absent. +x-storm: &storm + image: storm-local:${STORM_VERSION:-3.0.0-SNAPSHOT} + build: + context: ../../storm-dist/binary/final-package/target + dockerfile: ../../../../dev-tools/cluster/Dockerfile + args: + STORM_VERSION: "${STORM_VERSION:-3.0.0-SNAPSHOT}" + volumes: + - ./storm.yaml:/opt/storm/conf/storm.yaml:ro + # Sample input read by FileReadSpout. It runs on the workers, so the file + # must exist on every supervisor host, not just where you submit from. + - ../../examples/storm-perf/src/main/sampledata/randomwords.txt:/topology/randomwords.txt:ro + restart: on-failure + networks: + - storm + +services: + # Storm's built-in development ZooKeeper (single node, in-memory-ish, under + # dev.zookeeper.path). + zookeeper: + <<: *storm + hostname: zookeeper + command: storm dev-zookeeper + healthcheck: + test: ["CMD-SHELL", "python3 -c 'import socket; socket.create_connection((\"localhost\", 2181), 2)'"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + + nimbus: + <<: *storm + hostname: nimbus + # Cluster-summary metrics live on the Nimbus leader, so the Prometheus + # Pushgateway reporter is enabled here (and only here) via -c overrides -- + # keeping storm.yaml clean and avoiding multiple daemons clobbering the same + # Pushgateway group. The reporter jars come from the extlib-daemon mount. + command: + - storm + - nimbus + - -c + - 'storm.daemon.metrics.reporter.plugins=["org.apache.storm.metrics.prometheus.PrometheusPreparableReporter"]' + - -c + - storm.daemon.metrics.reporter.interval.secs=10 + - -c + - storm.daemon.metrics.reporter.plugin.prometheus.endpoint=pushgateway:9091 + - -c + - storm.daemon.metrics.reporter.plugin.prometheus.job=nimbus + ports: + - "127.0.0.1:6627:6627" # Thrift, for submitting topologies from the host + volumes: + - ./storm.yaml:/opt/storm/conf/storm.yaml:ro + # Prometheus Pushgateway reporter + its runtime deps (see prepare-extlib.sh). + - ./extlib-daemon:/opt/storm/extlib-daemon:ro + # Storm-perf jar and sample. + - ../../examples/storm-perf/target/storm-perf-${STORM_VERSION:-3.0.0-SNAPSHOT}.jar:/topology/storm-perf.jar:ro + - ../../examples/storm-perf/src/main/sampledata/randomwords.txt:/topology/randomwords.txt:ro + # This is only an example. Customize your cluster conf per benchmark. + - ./FileReadWordCountTopo-cluster.yaml:/topology/topo.yaml:ro + depends_on: + zookeeper: + condition: service_healthy + pushgateway: + condition: service_started + + supervisor1: + <<: *storm + hostname: supervisor1 + command: storm supervisor + depends_on: + - nimbus + + supervisor2: + <<: *storm + hostname: supervisor2 + command: storm supervisor + depends_on: + - nimbus + + ui: + <<: *storm + hostname: ui + command: storm ui + ports: + - "127.0.0.1:8080:8080" # Storm UI http://localhost:8080 + depends_on: + - nimbus + + # Observability: Storm cluster metrics -> Pushgateway -> Prometheus -> Grafana + + # Nimbus pushes cluster-summary metrics here every 10s; Prometheus scrapes it. + pushgateway: + image: prom/pushgateway:v1.10.0 + hostname: pushgateway + networks: + - storm + + # Receives Storm metrics-v2 in Graphite plaintext (port 9109) from every worker + # and exposes them to Prometheus on :9108. Unmapped names still appear (auto + # named), so ALL v2 metrics are available; the mapping just tidies the common + # families into labelled storm_worker / storm_topology series. + graphite-exporter: + image: prom/graphite-exporter:v0.16.0 + hostname: graphite-exporter + command: + - "--graphite.mapping-config=/etc/graphite/graphite-mapping.yml" + volumes: + - ./graphite/graphite-mapping.yml:/etc/graphite/graphite-mapping.yml:ro + networks: + - storm + + prometheus: + image: prom/prometheus:v3.1.0 + hostname: prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + # Short retention keeps the on-disk TSDB small for a dev cluster. + - "--storage.tsdb.retention.time=2h" + # TSDB on a named volume (on disk). It is deleted by `docker compose down -v`. + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + ports: + - "127.0.0.1:9090:9090" # Prometheus UI http://localhost:9090 + depends_on: + - pushgateway + networks: + - storm + + grafana: + image: grafana/grafana:11.5.1 + hostname: grafana + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_USERS_DEFAULT_THEME: dark + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/etc/grafana/dashboards:ro + # Grafana DB/state on a named volume (on disk). Deleted by `down -v`; the + # datasource and dashboards are re-provisioned from files on every start. + - grafana-data:/var/lib/grafana + ports: + - "127.0.0.1:3000:3000" # Grafana -> http://localhost:3000 (admin/admin) + depends_on: + - prometheus + networks: + - storm + +volumes: + # Metrics live here on disk; `docker compose down -v` deletes them. + prometheus-data: + grafana-data: + +networks: + storm: + driver: bridge diff --git a/dev-tools/cluster/extlib-daemon/.gitignore b/dev-tools/cluster/extlib-daemon/.gitignore new file mode 100644 index 0000000000..29c452c780 --- /dev/null +++ b/dev-tools/cluster/extlib-daemon/.gitignore @@ -0,0 +1,2 @@ +# Populated by prepare-extlib.sh (build artifacts, not version-controlled) +*.jar diff --git a/dev-tools/cluster/grafana/dashboards/storm-cluster.json b/dev-tools/cluster/grafana/dashboards/storm-cluster.json new file mode 100644 index 0000000000..735e12bb73 --- /dev/null +++ b/dev-tools/cluster/grafana/dashboards/storm-cluster.json @@ -0,0 +1,103 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "pluginVersion": "11.5.1", + "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_supervisors{job=\"nimbus\"}", "legendFormat": "supervisors", "refId": "A" } ], + "title": "Supervisors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "pluginVersion": "11.5.1", + "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_topologies{job=\"nimbus\"}", "legendFormat": "topologies", "refId": "A" } ], + "title": "Topologies", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "pluginVersion": "11.5.1", + "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_total_used_workers{job=\"nimbus\"}", "legendFormat": "used", "refId": "A" } ], + "title": "Worker slots used", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "pluginVersion": "11.5.1", + "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_total_workers{job=\"nimbus\"}", "legendFormat": "total", "refId": "A" } ], + "title": "Worker slots total", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 5, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_total_used_workers{job=\"nimbus\"}", "legendFormat": "used slots", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "summary_cluster_num_total_workers{job=\"nimbus\"}", "legendFormat": "total slots", "refId": "B" } + ], + "title": "Worker slots over time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, "unit": "percent" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 6, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "nimbus_total_cpu{job=\"nimbus\"}", "legendFormat": "total cpu (% of core)", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "nimbus_available_cpu_non_negative{job=\"nimbus\"}", "legendFormat": "available cpu (% of core)", "refId": "B" } + ], + "title": "Cluster CPU", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, "unit": "decmbytes" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "id": 7, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "nimbus_total_memory{job=\"nimbus\"}", "legendFormat": "total memory (MB)", "refId": "A" } + ], + "title": "Cluster memory", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["storm"], + "templating": { "list": [] }, + "time": { "from": "now-30m", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Storm Cluster", + "uid": "storm-cluster", + "version": 1, + "weekStart": "" +} diff --git a/dev-tools/cluster/grafana/dashboards/storm-metrics-v2.json b/dev-tools/cluster/grafana/dashboards/storm-metrics-v2.json new file mode 100644 index 0000000000..0f697ceb9b --- /dev/null +++ b/dev-tools/cluster/grafana/dashboards/storm-metrics-v2.json @@ -0,0 +1,1360 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_emit_rate{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{stream}}", + "refId": "A" + } + ], + "title": "Emit rate (tuples/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_transfer_rate{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{stream}}", + "refId": "A" + } + ], + "title": "Transfer rate (tuples/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_execute_rate{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Execute rate (tuples/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_ack_rate{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Ack rate (tuples/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_fail_rate{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Fail rate (tuples/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_capacity{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}}", + "refId": "A" + } + ], + "title": "Capacity (per task)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_execute_latency_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Execute latency (ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_process_latency_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Process latency (ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_complete_latency_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Complete latency (spout, ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_emit_total{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{stream}}", + "refId": "A" + } + ], + "title": "Cumulative emitted (sampling-scaled)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_execute_jitter_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Execute jitter (EWMA, ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_process_jitter_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Process jitter (EWMA, ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_complete_jitter_ms{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{key}}", + "refId": "A" + } + ], + "title": "Complete jitter (spout, EWMA, ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_worker{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\", metric=~\"__skipped-.*m1_rate\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{metric}}", + "refId": "A" + } + ], + "title": "Skipped time rate (ms/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_worker{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\", task=~\"$task\", metric=\"__backpressure-last-overflow-count\"}", + "legendFormat": "{{host}} {{component}} t{{task}}", + "refId": "A" + } + ], + "title": "Backpressure overflow count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_worker{topology_id=~\"$topology\", host=~\"$host\", metric=~\"__send-iconnection-(sent|pending)-.*\"}", + "legendFormat": "{{host}} {{component}} t{{task}} {{metric}}", + "refId": "A" + } + ], + "title": "Messaging: sent / pending", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_worker{topology_id=~\"$topology\", host=~\"$host\", metric=\"doHeartbeat-calls.m1_rate\"}", + "legendFormat": "{{host}} {{metric}}", + "refId": "A" + } + ], + "title": "Heartbeat calls (1m rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_topology{topology_id=~\"$topology\", host=~\"$host\", metric=~\"memory.heap.(used|committed|max)\"}", + "legendFormat": "{{host}} {{metric}}", + "refId": "A" + } + ], + "title": "JVM heap memory (bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_topology{topology_id=~\"$topology\", host=~\"$host\", metric=~\"memory.non-heap.(used|committed)\"}", + "legendFormat": "{{host}} {{metric}}", + "refId": "A" + } + ], + "title": "JVM non-heap memory (bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(storm_topology{topology_id=~\"$topology\", host=~\"$host\", metric=~\"GC.*count\"}[1m])", + "legendFormat": "{{host}} {{metric}}", + "refId": "A" + } + ], + "title": "GC collections (1m rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 80 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(storm_topology{topology_id=~\"$topology\", host=~\"$host\", metric=~\"GC.*time\"}[1m])", + "legendFormat": "{{host}} {{metric}}", + "refId": "A" + } + ], + "title": "GC time (1m rate, ms/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 80 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "storm_topology{topology_id=~\"$topology\", host=~\"$host\", metric=\"threads.count\"}", + "legendFormat": "{{host}} count", + "refId": "A" + } + ], + "title": "JVM threads", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "storm", + "metrics-v2" + ], + "templating": { + "list": [ + { + "name": "topology", + "label": "topology", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(storm_capacity, topology_id)", + "query": { + "qryType": 1, + "query": "label_values(storm_capacity, topology_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".*", + "multi": false, + "current": { + "text": "All", + "value": "$__all" + }, + "options": [] + }, + { + "name": "host", + "label": "host", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(storm_capacity{topology_id=~\"$topology\"}, host)", + "query": { + "qryType": 1, + "query": "label_values(storm_capacity{topology_id=~\"$topology\"}, host)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".*", + "multi": true, + "current": { + "text": "All", + "value": "$__all" + }, + "options": [] + }, + { + "name": "component", + "label": "component", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(storm_capacity{topology_id=~\"$topology\", host=~\"$host\"}, component)", + "query": { + "qryType": 1, + "query": "label_values(storm_capacity{topology_id=~\"$topology\", host=~\"$host\"}, component)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".*", + "multi": true, + "current": { + "text": "All", + "value": "$__all" + }, + "options": [] + }, + { + "name": "task", + "label": "task", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(storm_capacity{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\"}, task)", + "query": { + "qryType": 1, + "query": "label_values(storm_capacity{topology_id=~\"$topology\", host=~\"$host\", component=~\"$component\"}, task)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".*", + "multi": true, + "current": { + "text": "All", + "value": "$__all" + }, + "options": [] + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "title": "Storm Metrics v2", + "uid": "storm-metrics-v2", + "version": 6, + "weekStart": "" +} diff --git a/dev-tools/cluster/grafana/provisioning/dashboards/provider.yml b/dev-tools/cluster/grafana/provisioning/dashboards/provider.yml new file mode 100644 index 0000000000..caa8fa99ce --- /dev/null +++ b/dev-tools/cluster/grafana/provisioning/dashboards/provider.yml @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: 1 + +providers: + - name: Storm + orgId: 1 + folder: Storm + type: file + disableDeletion: false + editable: true + allowUiUpdates: true + options: + path: /etc/grafana/dashboards + foldersFromFilesStructure: false diff --git a/dev-tools/cluster/grafana/provisioning/datasources/datasource.yml b/dev-tools/cluster/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 0000000000..64d9820527 --- /dev/null +++ b/dev-tools/cluster/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: 1 + +datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/dev-tools/cluster/graphite/graphite-mapping.yml b/dev-tools/cluster/graphite/graphite-mapping.yml new file mode 100644 index 0000000000..16aef00d20 --- /dev/null +++ b/dev-tools/cluster/graphite/graphite-mapping.yml @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Maps Storm metrics-v2 Graphite names into clean, labelled Prometheus series. +# +# Storm worker/topology metric names look like (dots are field separators, the +# metric itself is appended after a dash): +# storm.worker......- (per-stream) +# storm.worker.....- (no stream) +# storm.topology.....- (__system JVM, task -1) +# +# Rules are tried in order, first match wins. The first block models TaskMetrics +# (org.apache.storm.metrics2.TaskMetrics) explicitly: the operation names are a +# fixed set, so we hard-code them and avoid the dash-ambiguity of generic parsing. +# Each TaskMetrics metric is per (component, task, stream); the `key` label holds +# the metric key -- the own stream for OUTPUT metrics (emit/transfer) or the +# : for INPUT metrics (execute/ack/fail/latency). +# RateCounter exports both `.count` (cumulative, sampling-scaled) and `.m1_rate` +# (tuples/s over 1 min); latencies/capacity are RollingAverageGauge single values. +# +# Anything not matched by these falls through to the generic storm_worker / +# storm_topology rules, so EVERY v2 metric remains queryable. + +# common worker prefix capture used below: +# $1 topology_id $2 host $3 component $4 stream $5 task $6 port $7 key +mappings: + # ---- TaskMetrics: throughput rates (tuples/s) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__emit-count-(.+)\.m1_rate$' + match_type: regex + name: "storm_emit_rate" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__transfer-count-(.+)\.m1_rate$' + match_type: regex + name: "storm_transfer_rate" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__execute-count-(.+)\.m1_rate$' + match_type: regex + name: "storm_execute_rate" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__ack-count-(.+)\.m1_rate$' + match_type: regex + name: "storm_ack_rate" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__fail-count-(.+)\.m1_rate$' + match_type: regex + name: "storm_fail_rate" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + + # ---- TaskMetrics: cumulative totals (sampling-scaled) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__emit-count-(.+)\.count$' + match_type: regex + name: "storm_emit_total" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__transfer-count-(.+)\.count$' + match_type: regex + name: "storm_transfer_total" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__execute-count-(.+)\.count$' + match_type: regex + name: "storm_execute_total" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__ack-count-(.+)\.count$' + match_type: regex + name: "storm_ack_total" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__fail-count-(.+)\.count$' + match_type: regex + name: "storm_fail_total" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + + # ---- TaskMetrics: latencies (ms, RollingAverageGauge) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__execute-latency-(.+)$' + match_type: regex + name: "storm_execute_latency_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__process-latency-(.+)$' + match_type: regex + name: "storm_process_latency_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__complete-latency-(.+)$' + match_type: regex + name: "storm_complete_latency_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + + # ---- TaskMetrics: latency jitter (EWMA, RFC 3550; needs topology.stats.ewma.enable=true) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__execute-jitter-(.+)$' + match_type: regex + name: "storm_execute_jitter_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__process-jitter-(.+)$' + match_type: regex + name: "storm_process_jitter_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__complete-jitter-(.+)$' + match_type: regex + name: "storm_complete_jitter_ms" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", key: "$7"} + + # ---- TaskMetrics: capacity (RollingAverageGauge over all streams) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-__capacity$' + match_type: regex + name: "storm_capacity" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6"} + + # ---- Generic fallback: any other worker/topology metric (messaging, skipped, + # backpressure, heartbeat, JVM/GC/memory/threads ...) ---- + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-(.+)$' + match_type: regex + name: "storm_worker" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "$4", task: "$5", port: "$6", metric: "$7"} + - match: '^storm\.worker\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-(.+)$' + match_type: regex + name: "storm_worker" + labels: {topology_id: "$1", host: "$2", component: "$3", stream: "", task: "$4", port: "$5", metric: "$6"} + - match: '^storm\.topology\.([^.]+)\.([^.]+)\.([^.]+)\.(-?\d+)\.(\d+)-(.+)$' + match_type: regex + name: "storm_topology" + labels: {topology_id: "$1", host: "$2", component: "$3", task: "$4", port: "$5", metric: "$6"} diff --git a/dev-tools/cluster/netsim.sh b/dev-tools/cluster/netsim.sh new file mode 100755 index 0000000000..6308e91848 --- /dev/null +++ b/dev-tools/cluster/netsim.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simulate network delay/jitter/loss on the worker hosts (supervisor containers) +# using Linux tc/netem. The Storm image has no `tc`, so we inject the qdisc from +# a throwaway helper container that shares each supervisor's network namespace +# (--net container:...) -- no image rebuild, no compose change required. +# +# netem applies to *egress* on each supervisor's eth0, so it shapes ALL traffic +# leaving that container (inter-worker tuples, but also heartbeats to Nimbus/ZK). +# Keep the delay moderate (<= ~150ms) so heartbeats don't time out. With both +# supervisors delayed by D, worker<->worker round-trip latency is ~2*D. +# +# netem's default queue is only 1000 packets; under a high-throughput perf +# topology that buffer overflows at the added delay and drops tuples, which +# collapses TCP and back-pressures the spout to ~0. So we set a large `limit` +# (default 1,000,000 packets) -- big enough to hold rate * delay without drops. +# +# Usage: +# ./netsim.sh add [delay_ms] [jitter_ms] [loss_pct] [limit_pkts] # default 50 10 0 1000000 +# ./netsim.sh show +# ./netsim.sh clear +# ./netsim.sh ping # measure RTT between supervisors +# +# Override the shaped containers with TARGETS="..." ./netsim.sh ... +set -euo pipefail + +COMPOSE_SERVICES="${COMPOSE_SERVICES:-supervisor1 supervisor2}" +HELPER_IMAGE="${HELPER_IMAGE:-nicolaka/netshoot}" +IFACE="${IFACE:-eth0}" + +targets() { # echo the containers to shape (resolved once, memoized in TARGETS) + if [[ -z "${TARGETS:-}" ]]; then + TARGETS="$(docker compose ps -q ${COMPOSE_SERVICES} 2>/dev/null | tr '\n' ' ')" + if [[ -z "${TARGETS// /}" ]]; then + echo "error: no running containers for services: ${COMPOSE_SERVICES}" >&2 + echo " run from dev-tools/cluster with the cluster up (docker compose up -d)," >&2 + echo " or set TARGETS=\" ...\" to shape containers explicitly." >&2 + exit 1 + fi + fi + echo "${TARGETS}" +} + +cname() { # human-readable container name for id/name $1 (falls back to $1) + local n; n="$(docker inspect -f '{{.Name}}' "$1" 2>/dev/null)" || true + n="${n#/}" + echo "${n:-$1}" +} + +inns() { # run a command inside container $1's network namespace with NET_ADMIN + local c="$1"; shift + docker run --rm --net "container:${c}" --cap-add NET_ADMIN "${HELPER_IMAGE}" "$@" +} + +cmd="${1:-show}" +case "${cmd}" in + add) + delay="${2:-50}"; jitter="${3:-10}"; loss="${4:-0}"; limit="${5:-1000000}" + for c in $(targets); do + echo "==> $(cname "${c}"): netem delay ${delay}ms ${jitter}ms loss ${loss}% limit ${limit} on ${IFACE}" + inns "${c}" tc qdisc replace dev "${IFACE}" root netem \ + delay "${delay}ms" "${jitter}ms" distribution normal loss "${loss}%" limit "${limit}" + done + ;; + clear) + for c in $(targets); do + echo "==> $(cname "${c}"): removing netem" + inns "${c}" tc qdisc del dev "${IFACE}" root 2>/dev/null || true + done + ;; + show) + for c in $(targets); do + echo "==> $(cname "${c}"):" + inns "${c}" tc qdisc show dev "${IFACE}" + done + ;; + ping) + set -- $(targets) + src="$1"; dst_host="$(cname "${PING_DST:-${2:-supervisor2}}")" + echo "==> RTT from $(cname "${src}") to ${dst_host} (5 pings)" + inns "${src}" ping -c 5 "${dst_host}" + ;; + *) + echo "usage: $0 {add [delay_ms] [jitter_ms] [loss_pct] | show | clear | ping}" >&2 + exit 2 + ;; +esac diff --git a/dev-tools/cluster/prepare-extlib.sh b/dev-tools/cluster/prepare-extlib.sh new file mode 100755 index 0000000000..3a4a498fd0 --- /dev/null +++ b/dev-tools/cluster/prepare-extlib.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds the storm-metrics-prometheus reporter and collects it plus its runtime +# dependencies into ./extlib-daemon, which docker-compose mounts onto the Nimbus +# container's daemon classpath. Re-run after changing the module or Storm version. +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${HERE}/../.." && pwd)" +MODULE="external/storm-metrics-prometheus" +OUT="${HERE}/extlib-daemon" + +# Resolve the Storm version from the same source as docker-compose: the .env that +# build-image.sh writes from the repo root pom.xml. Hardcoding a default would cp +# a wrong-named jar after a version bump. Override with STORM_VERSION. +if [[ -z "${STORM_VERSION:-}" ]]; then + if [[ -f "${HERE}/.env" ]]; then + # shellcheck disable=SC1091 + source "${HERE}/.env" + fi +fi +VERSION="${STORM_VERSION:-}" +if [[ -z "${VERSION}" ]]; then + echo "error: STORM_VERSION not set and ${HERE}/.env missing or empty." >&2 + echo " run ./build-image.sh first (it writes .env from pom.xml), or set STORM_VERSION." >&2 + exit 1 +fi + +cd "${REPO_ROOT}" + +echo "==> Building ${MODULE} (${VERSION})" +mvn -q -pl "${MODULE}" -am install -DskipTests + +echo "==> Collecting runtime dependencies into ${OUT}" +rm -f "${OUT}"/*.jar +mvn -q -pl "${MODULE}" dependency:copy-dependencies \ + -DincludeScope=runtime -DexcludeScope=provided \ + -DoutputDirectory="${OUT}" + +cp "${REPO_ROOT}/${MODULE}/target/storm-metrics-prometheus-${VERSION}.jar" "${OUT}/" + +echo "==> ${OUT} now contains:" +ls -1 "${OUT}"/*.jar diff --git a/dev-tools/cluster/prometheus/prometheus.yml b/dev-tools/cluster/prometheus/prometheus.yml new file mode 100644 index 0000000000..090b53a8e1 --- /dev/null +++ b/dev-tools/cluster/prometheus/prometheus.yml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Scrapes the Pushgateway that Nimbus pushes Storm cluster-summary metrics to. +# honor_labels keeps the `job` label from the pushed series (e.g. job="nimbus") +# instead of overwriting it with the scrape job name. +global: + scrape_interval: 10s + evaluation_interval: 10s + +scrape_configs: + # Cluster-summary metrics pushed by Nimbus. + - job_name: pushgateway + honor_labels: true + static_configs: + - targets: ["pushgateway:9091"] + + # Storm metrics-v2 (per-worker/topology) via the graphite_exporter. + - job_name: graphite + static_configs: + - targets: ["graphite-exporter:9108"] diff --git a/dev-tools/cluster/storm-client.yaml b/dev-tools/cluster/storm-client.yaml new file mode 100644 index 0000000000..f398b0f6a1 --- /dev/null +++ b/dev-tools/cluster/storm-client.yaml @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# CLIENT-side config for submitting topologies to the local docker cluster from +# outside the containers (e.g. running a storm-perf main() from IntelliJ). +# Point it at this file with -Dstorm.conf.file=. +# +# docker-compose.yml publishes Nimbus Thrift on localhost:6627, so the submitter +# only needs to know where Nimbus lives. Port (6627) and transport +# (SimpleTransportPlugin, no auth) match the cluster defaults. +nimbus.seeds: ["localhost"] diff --git a/dev-tools/cluster/storm.yaml b/dev-tools/cluster/storm.yaml new file mode 100644 index 0000000000..84de218d30 --- /dev/null +++ b/dev-tools/cluster/storm.yaml @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Cluster config shared by every daemon. Bind-mounted over +# ${STORM_HOME}/conf/storm.yaml by docker-compose. Hostnames below are the +# docker-compose service names, resolvable on the cluster's bridge network. +storm.zookeeper.servers: + - "zookeeper" +nimbus.seeds: ["nimbus"] + +storm.local.dir: "/data/storm" + +# Two slots per supervisor. With two supervisors that is four worker slots, +# so a topology.workers >= 2 topology spreads workers across containers and +# its inter-worker traffic actually crosses the network. +supervisor.slots.ports: + - 6700 + - 6701 + +ui.port: 8080 + +# Modest heaps so the whole cluster fits comfortably on a laptop. +nimbus.childopts: "-Xmx512m" +ui.childopts: "-Xmx256m" +logviewer.childopts: "-Xmx128m" +supervisor.childopts: "-Xmx256m" +worker.childopts: "-Xmx512m" + +# Metrics v2: every worker reports its Dropwizard metrics (per-task tuple +# rates/latencies/capacity, JCQueue, JVM memory/GC/threads, cgroup, messaging) +# in Graphite plaintext to the graphite_exporter, which Prometheus scrapes. +# This push model handles ephemeral workers without per-worker scrape targets. +topology.metrics.reporters: + - class: "org.apache.storm.metrics2.reporters.GraphiteStormReporter" + report.period: 10 + report.period.units: "SECONDS" + graphite.host: "graphite-exporter" + graphite.port: 9109 + graphite.transport: "tcp" + +# enable EWMA metrics +topology.stats.ewma.enable: true diff --git a/pom.xml b/pom.xml index d98a27c856..ab3808b024 100644 --- a/pom.xml +++ b/pom.xml @@ -274,6 +274,7 @@ **/src/test/resources/test-worker.log.test **/src/test/resources/logviewer-search-context-tests.log.test + **/dev-tools/cluster/grafana/dashboards/*.json **/src/codegen/config.fmpp