From 30e5cfaf89bf9aff5a39d11df1a99eeb839e118d Mon Sep 17 00:00:00 2001 From: Maxwell Maslov Date: Mon, 27 Apr 2026 20:05:28 -0700 Subject: [PATCH 01/22] added worker max RSS helper with tests + instructions --- api/metrics/worker_metrics.py | 20 +++ .../unit/metrics/test_unit_worker_metrics.py | 115 ++++++++++++++++++ rss-collector-readme.md | 49 ++++++++ 3 files changed, 184 insertions(+) create mode 100644 api/metrics/worker_metrics.py create mode 100644 api/tests/unit/metrics/test_unit_worker_metrics.py create mode 100644 rss-collector-readme.md diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py new file mode 100644 index 000000000000..38c50d28d79a --- /dev/null +++ b/api/metrics/worker_metrics.py @@ -0,0 +1,20 @@ +try: + import resource +except ImportError: # pragma: no cover - resource is Unix-only + resource = None # type: ignore[assignment] + + +MAX_RSS_KIB_TO_BYTES = 1024 + + +def get_current_process_max_rss_bytes() -> int | None: + if resource is None or not hasattr(resource, "RUSAGE_SELF"): + return None + + try: + max_rss_kib = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) + if max_rss_kib < 0: + return None + return max_rss_kib * MAX_RSS_KIB_TO_BYTES + except (AttributeError, OSError, TypeError, ValueError): + return None diff --git a/api/tests/unit/metrics/test_unit_worker_metrics.py b/api/tests/unit/metrics/test_unit_worker_metrics.py new file mode 100644 index 000000000000..fac0daca2333 --- /dev/null +++ b/api/tests/unit/metrics/test_unit_worker_metrics.py @@ -0,0 +1,115 @@ +from types import SimpleNamespace + +import pytest + +from metrics import worker_metrics + + +def test_get_current_process_max_rss_bytes__resource_usage_available__returns_bytes( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + max_rss_kib = 123 + fake_resource = SimpleNamespace( + RUSAGE_SELF=1, + getrusage=lambda who: SimpleNamespace(ru_maxrss=max_rss_kib), + ) + monkeypatch.setattr(worker_metrics, "resource", fake_resource) + + # 
When + result = worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert result == max_rss_kib * worker_metrics.MAX_RSS_KIB_TO_BYTES + + +def test_get_current_process_max_rss_bytes__resource_usage_available__uses_current_process( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + requested_resource = None + rusage_self = 1 + + def fake_getrusage(who: int) -> SimpleNamespace: + nonlocal requested_resource + requested_resource = who + return SimpleNamespace(ru_maxrss=123) + + fake_resource = SimpleNamespace( + RUSAGE_SELF=rusage_self, + getrusage=fake_getrusage, + ) + monkeypatch.setattr(worker_metrics, "resource", fake_resource) + + # When + worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert requested_resource == rusage_self + + +def test_get_current_process_max_rss_bytes__resource_module_unavailable__returns_none( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + monkeypatch.setattr(worker_metrics, "resource", None) + + # When + result = worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert result is None + + +def test_get_current_process_max_rss_bytes__max_rss_missing__returns_none( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + fake_resource = SimpleNamespace( + RUSAGE_SELF=1, + getrusage=lambda who: SimpleNamespace(), + ) + monkeypatch.setattr(worker_metrics, "resource", fake_resource) + + # When + result = worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert result is None + + +def test_get_current_process_max_rss_bytes__max_rss_invalid__returns_none( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + fake_resource = SimpleNamespace( + RUSAGE_SELF=1, + getrusage=lambda who: SimpleNamespace(ru_maxrss=-1), + ) + monkeypatch.setattr(worker_metrics, "resource", fake_resource) + + # When + result = worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert result is None + + +def test_get_current_process_max_rss_bytes__resource_error__returns_none( + 
monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + def fake_getrusage(who: int) -> SimpleNamespace: + raise OSError("resource usage unavailable") + + fake_resource = SimpleNamespace( + RUSAGE_SELF=1, + getrusage=fake_getrusage, + ) + monkeypatch.setattr(worker_metrics, "resource", fake_resource) + + # When + result = worker_metrics.get_current_process_max_rss_bytes() + + # Then + assert result is None diff --git a/rss-collector-readme.md b/rss-collector-readme.md new file mode 100644 index 000000000000..63c2c7db767c --- /dev/null +++ b/rss-collector-readme.md @@ -0,0 +1,49 @@ +# Worker Metrics Helper Testing + +This note covers how to test the Story #1 RSS helper. + +The helper under test is `get_current_process_max_rss_bytes()`. It reads the +current worker process max RSS high-water mark with +`resource.getrusage(resource.RUSAGE_SELF).ru_maxrss`, converts the Linux KiB +value to bytes, and returns `None` when RSS data cannot be read safely. + +## Run The Focused Unit Tests + +From the API directory: + +```bash +make test opts='tests/unit/metrics/test_unit_worker_metrics.py -n0' +``` + +If the local shell has an invalid `DEBUG` value, set it explicitly: + +```bash +DEBUG=false make test opts='tests/unit/metrics/test_unit_worker_metrics.py -n0' +``` + +## Run Without Docker + +If Docker is unavailable but the API virtualenv is installed: + +```bash +DEBUG=false .venv/bin/pytest tests/unit/metrics/test_unit_worker_metrics.py -n0 +``` + +## Run Lint And Format Checks + +```bash +.venv/bin/ruff check metrics/worker_metrics.py tests/unit/metrics/test_unit_worker_metrics.py +.venv/bin/ruff format --check metrics/worker_metrics.py tests/unit/metrics/test_unit_worker_metrics.py +``` + +## Expected Coverage + +The unit tests cover: + +- successful max-RSS collection +- KiB-to-byte conversion +- use of `resource.RUSAGE_SELF` +- unavailable `resource` module +- missing `ru_maxrss` +- invalid negative RSS values +- unexpected resource errors From 
cc76ffdc36f3f0c6a8cf254abef115fa41d9052e Mon Sep 17 00:00:00 2001 From: Maxwell Maslov Date: Tue, 28 Apr 2026 10:16:22 -0700 Subject: [PATCH 02/22] Relocated helper testing instructions to relevant /docs folder. --- .../development/worker-metrics-helper-testing.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename rss-collector-readme.md => docs/development/worker-metrics-helper-testing.md (100%) diff --git a/rss-collector-readme.md b/docs/development/worker-metrics-helper-testing.md similarity index 100% rename from rss-collector-readme.md rename to docs/development/worker-metrics-helper-testing.md From 2c8cec4af448650af209685591bd64b2b598f20b Mon Sep 17 00:00:00 2001 From: Maxwell Maslov Date: Wed, 6 May 2026 23:43:30 -0700 Subject: [PATCH 03/22] read worker max RSS from proc status --- api/metrics/worker_metrics.py | 57 ++++++++--- .../unit/metrics/test_unit_worker_metrics.py | 98 ++++++++++--------- .../worker-metrics-helper-testing.md | 25 +++-- 3 files changed, 114 insertions(+), 66 deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 38c50d28d79a..3f1ad49ed62f 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,20 +1,53 @@ -try: - import resource -except ImportError: # pragma: no cover - resource is Unix-only - resource = None # type: ignore[assignment] +from pathlib import Path +from typing import Iterable - -MAX_RSS_KIB_TO_BYTES = 1024 +PROC_SELF_STATUS_PATH = Path("/proc/self/status") +MAX_RSS_KB_TO_BYTES = 1024 +MAX_RSS_STATUS_FIELD = "VmHWM" def get_current_process_max_rss_bytes() -> int | None: - if resource is None or not hasattr(resource, "RUSAGE_SELF"): + try: + proc_status_lines = PROC_SELF_STATUS_PATH.read_text( + encoding="utf-8" + ).splitlines() + except (FileNotFoundError, OSError, UnicodeDecodeError): + return None + + max_rss_kb = _get_proc_status_memory_kb(proc_status_lines, MAX_RSS_STATUS_FIELD) + if max_rss_kb is None: + return None + + return max_rss_kb * 
MAX_RSS_KB_TO_BYTES + + +def _get_proc_status_memory_kb( + proc_status_lines: Iterable[str], + field_name: str, +) -> int | None: + for line in proc_status_lines: + name, separator, value = line.strip().partition(":") + if separator and name == field_name: + return _parse_proc_status_memory_kb(value) + + return None + + +def _parse_proc_status_memory_kb(value: str) -> int | None: + parts = value.split() + if len(parts) != 2: + return None + + memory_kb_text, unit = parts + if unit != "kB": return None try: - max_rss_kib = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) - if max_rss_kib < 0: - return None - return max_rss_kib * MAX_RSS_KIB_TO_BYTES - except (AttributeError, OSError, TypeError, ValueError): + memory_kb = int(memory_kb_text) + except ValueError: + return None + + if memory_kb < 0: return None + + return memory_kb diff --git a/api/tests/unit/metrics/test_unit_worker_metrics.py b/api/tests/unit/metrics/test_unit_worker_metrics.py index fac0daca2333..164dc0651201 100644 --- a/api/tests/unit/metrics/test_unit_worker_metrics.py +++ b/api/tests/unit/metrics/test_unit_worker_metrics.py @@ -1,58 +1,61 @@ -from types import SimpleNamespace +from pathlib import Path import pytest from metrics import worker_metrics -def test_get_current_process_max_rss_bytes__resource_usage_available__returns_bytes( +class UnreadableStatusPath: + def read_text(self, encoding: str) -> str: + raise OSError("status file unavailable") + + +def test_get_current_process_max_rss_bytes__vmhwm_available__returns_bytes( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, ) -> None: # Given - max_rss_kib = 123 - fake_resource = SimpleNamespace( - RUSAGE_SELF=1, - getrusage=lambda who: SimpleNamespace(ru_maxrss=max_rss_kib), + max_rss_kb = 123 + status_path = tmp_path / "status" + status_path.write_text( + f"Name:\tgunicorn\nVmHWM:\t{max_rss_kb} kB\n", + encoding="utf-8", ) - monkeypatch.setattr(worker_metrics, "resource", fake_resource) + monkeypatch.setattr(worker_metrics, 
"PROC_SELF_STATUS_PATH", status_path) # When result = worker_metrics.get_current_process_max_rss_bytes() # Then - assert result == max_rss_kib * worker_metrics.MAX_RSS_KIB_TO_BYTES + assert result == max_rss_kb * worker_metrics.MAX_RSS_KB_TO_BYTES -def test_get_current_process_max_rss_bytes__resource_usage_available__uses_current_process( +def test_get_current_process_max_rss_bytes__vmhwm_has_extra_whitespace__returns_bytes( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, ) -> None: # Given - requested_resource = None - rusage_self = 1 - - def fake_getrusage(who: int) -> SimpleNamespace: - nonlocal requested_resource - requested_resource = who - return SimpleNamespace(ru_maxrss=123) - - fake_resource = SimpleNamespace( - RUSAGE_SELF=rusage_self, - getrusage=fake_getrusage, + max_rss_kb = 456 + status_path = tmp_path / "status" + status_path.write_text( + f"Name:\tgunicorn\n VmHWM: {max_rss_kb} kB \nVmRSS:\t10 kB\n", + encoding="utf-8", ) - monkeypatch.setattr(worker_metrics, "resource", fake_resource) + monkeypatch.setattr(worker_metrics, "PROC_SELF_STATUS_PATH", status_path) # When - worker_metrics.get_current_process_max_rss_bytes() + result = worker_metrics.get_current_process_max_rss_bytes() # Then - assert requested_resource == rusage_self + assert result == max_rss_kb * worker_metrics.MAX_RSS_KB_TO_BYTES -def test_get_current_process_max_rss_bytes__resource_module_unavailable__returns_none( +def test_get_current_process_max_rss_bytes__status_file_missing__returns_none( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, ) -> None: # Given - monkeypatch.setattr(worker_metrics, "resource", None) + monkeypatch.setattr(worker_metrics, "PROC_SELF_STATUS_PATH", tmp_path / "missing") # When result = worker_metrics.get_current_process_max_rss_bytes() @@ -61,15 +64,14 @@ def test_get_current_process_max_rss_bytes__resource_module_unavailable__returns assert result is None -def test_get_current_process_max_rss_bytes__max_rss_missing__returns_none( +def 
test_get_current_process_max_rss_bytes__vmhwm_missing__returns_none( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, ) -> None: # Given - fake_resource = SimpleNamespace( - RUSAGE_SELF=1, - getrusage=lambda who: SimpleNamespace(), - ) - monkeypatch.setattr(worker_metrics, "resource", fake_resource) + status_path = tmp_path / "status" + status_path.write_text("Name:\tgunicorn\nVmRSS:\t10 kB\n", encoding="utf-8") + monkeypatch.setattr(worker_metrics, "PROC_SELF_STATUS_PATH", status_path) # When result = worker_metrics.get_current_process_max_rss_bytes() @@ -78,15 +80,28 @@ def test_get_current_process_max_rss_bytes__max_rss_missing__returns_none( assert result is None -def test_get_current_process_max_rss_bytes__max_rss_invalid__returns_none( +@pytest.mark.parametrize( + "vmhwm_value", + [ + "-1 kB", + "not-a-number kB", + "123 MB", + "123", + "123 kB extra", + ], +) +def test_get_current_process_max_rss_bytes__vmhwm_invalid__returns_none( monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + vmhwm_value: str, ) -> None: # Given - fake_resource = SimpleNamespace( - RUSAGE_SELF=1, - getrusage=lambda who: SimpleNamespace(ru_maxrss=-1), + status_path = tmp_path / "status" + status_path.write_text( + f"Name:\tgunicorn\nVmHWM:\t{vmhwm_value}\n", + encoding="utf-8", ) - monkeypatch.setattr(worker_metrics, "resource", fake_resource) + monkeypatch.setattr(worker_metrics, "PROC_SELF_STATUS_PATH", status_path) # When result = worker_metrics.get_current_process_max_rss_bytes() @@ -95,18 +110,11 @@ def test_get_current_process_max_rss_bytes__max_rss_invalid__returns_none( assert result is None -def test_get_current_process_max_rss_bytes__resource_error__returns_none( +def test_get_current_process_max_rss_bytes__status_file_read_error__returns_none( monkeypatch: pytest.MonkeyPatch, ) -> None: # Given - def fake_getrusage(who: int) -> SimpleNamespace: - raise OSError("resource usage unavailable") - - fake_resource = SimpleNamespace( - RUSAGE_SELF=1, - getrusage=fake_getrusage, - 
) - monkeypatch.setattr(worker_metrics, "resource", fake_resource) + monkeypatch.setattr(worker_metrics, "PROC_SELF_STATUS_PATH", UnreadableStatusPath()) # When result = worker_metrics.get_current_process_max_rss_bytes() diff --git a/docs/development/worker-metrics-helper-testing.md b/docs/development/worker-metrics-helper-testing.md index 63c2c7db767c..f46a60d2aa6a 100644 --- a/docs/development/worker-metrics-helper-testing.md +++ b/docs/development/worker-metrics-helper-testing.md @@ -3,9 +3,14 @@ This note covers how to test the Story #1 RSS helper. The helper under test is `get_current_process_max_rss_bytes()`. It reads the -current worker process max RSS high-water mark with -`resource.getrusage(resource.RUSAGE_SELF).ru_maxrss`, converts the Linux KiB -value to bytes, and returns `None` when RSS data cannot be read safely. +current worker process RSS high-water mark from `VmHWM` in `/proc/self/status`, +converts the Linux `kB` value to bytes, and returns `None` when RSS data cannot +be read safely. + +The helper avoids `resource.getrusage(resource.RUSAGE_SELF).ru_maxrss` because +Linux can preserve that value across `execve`. Reading `VmHWM` from +`/proc/self/status` keeps the high-water mark scoped to the current worker +process, which is the value the 10-second worker metrics updater should export. 
## Run The Focused Unit Tests @@ -40,10 +45,12 @@ DEBUG=false .venv/bin/pytest tests/unit/metrics/test_unit_worker_metrics.py -n0 The unit tests cover: -- successful max-RSS collection -- KiB-to-byte conversion -- use of `resource.RUSAGE_SELF` -- unavailable `resource` module -- missing `ru_maxrss` +- successful `VmHWM` collection +- `kB`-to-byte conversion +- leading and trailing whitespace in `/proc/self/status` +- unavailable `/proc/self/status` +- missing `VmHWM` - invalid negative RSS values -- unexpected resource errors +- non-integer RSS values +- unexpected RSS units +- unexpected status-file read errors From b62ffad96aa02b464759214b643e5ac06ce3acf8 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:44:18 -0700 Subject: [PATCH 04/22] created worker metric Prometheus gauge --- api/metrics/worker_metrics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 3f1ad49ed62f..7ede4e7401f0 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,10 +1,18 @@ from pathlib import Path from typing import Iterable +import prometheus_client + PROC_SELF_STATUS_PATH = Path("/proc/self/status") MAX_RSS_KB_TO_BYTES = 1024 MAX_RSS_STATUS_FIELD = "VmHWM" +flagsmith_worker_rss_bytes = prometheus_client.Gauge( + "flagsmith_worker_rss_bytes", + "Resident Set Size (RSS) of the worker process in bytes.", + ["pid"], + +) def get_current_process_max_rss_bytes() -> int | None: try: @@ -51,3 +59,4 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: return None return memory_kb + From d3c369194d6f89bd9e56af89143c75ec10452dcf Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Wed, 29 Apr 2026 15:50:23 -0700 Subject: [PATCH 05/22] updated the metrics-catalogue with the flagsmith_worker_rss_byes gauge --- .../observability/_metrics-catalogue.md | 11 +++++++++++ 1 file changed, 
11 insertions(+) diff --git a/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md b/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md index 6cae297b29e4..9f517f443d49 100644 --- a/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md +++ b/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md @@ -101,3 +101,14 @@ Labels: - `task_type` - `result` +### `flagsmith_worker_rss_bytes` + +Gauge. + +Resident Set Size (RSS) of the worker process in bytes. Used to track memory usage of each worker process. The `pid` label identifies which process the measurement belongs to. + +Labels: + - `pid` + + + From 56741a3485c30970c04c5e1ce6a8c836ab2468ab Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:44:18 -0700 Subject: [PATCH 06/22] created worker metric Prometheus gauge --- api/metrics/worker_metrics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 7ede4e7401f0..51d59b969342 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD from pathlib import Path from typing import Iterable import prometheus_client @@ -60,3 +61,12 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: return memory_kb +======= +import prometheus_client + +flagsmith_worker_rss_bytes = prometheus_client.Gauge( + "flagsmith_worker_rss_bytes", + "Resident Set Size (RSS) of the worker process in bytes.", + ["pid"] +) +>>>>>>> d7ea9dc2 (created worker metric Prometheus gauge) From 0b49af066cae06781e5d11f3ac5067a96fe5b428 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Tue, 5 May 2026 09:04:10 -0700 Subject: [PATCH 07/22] added a background thread to update resource usage and imported it to the wsgi file --- api/metrics/worker_metrics.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 
deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 51d59b969342..ccc66d983284 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,9 +1,7 @@ -<<<<<<< HEAD from pathlib import Path from typing import Iterable import prometheus_client - PROC_SELF_STATUS_PATH = Path("/proc/self/status") MAX_RSS_KB_TO_BYTES = 1024 MAX_RSS_STATUS_FIELD = "VmHWM" @@ -12,7 +10,7 @@ "flagsmith_worker_rss_bytes", "Resident Set Size (RSS) of the worker process in bytes.", ["pid"], - + multiprocess_mode="liveall" ) def get_current_process_max_rss_bytes() -> int | None: @@ -59,14 +57,4 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: if memory_kb < 0: return None - return memory_kb - -======= -import prometheus_client - -flagsmith_worker_rss_bytes = prometheus_client.Gauge( - "flagsmith_worker_rss_bytes", - "Resident Set Size (RSS) of the worker process in bytes.", - ["pid"] -) ->>>>>>> d7ea9dc2 (created worker metric Prometheus gauge) + return memory_kb \ No newline at end of file From 51386b3577fe6e310f6a9199421f32309f90a4ce Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Thu, 7 May 2026 22:23:59 -0700 Subject: [PATCH 08/22] updated new worker_metrics with gauge --- api/metrics/worker_metrics.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index ccc66d983284..d6a914a56e8d 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,6 +1,9 @@ from pathlib import Path from typing import Iterable +from pathlib import Path +from typing import Iterable import prometheus_client +import os PROC_SELF_STATUS_PATH = Path("/proc/self/status") MAX_RSS_KB_TO_BYTES = 1024 @@ -13,6 +16,16 @@ multiprocess_mode="liveall" ) +def update_worker_metrics(): + """ + Background loop that runs inside each worker process to update the RSS memory usage 
metric. + """ + current_pid = os.getpid() + + rss_value = get_current_process_max_rss_bytes() + if rss_value is not None: + flagsmith_worker_rss_bytes.labels(pid=str(current_pid)).set(rss_value) + def get_current_process_max_rss_bytes() -> int | None: try: proc_status_lines = PROC_SELF_STATUS_PATH.read_text( @@ -57,4 +70,15 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: if memory_kb < 0: return None - return memory_kb \ No newline at end of file + return memory_kb + +def clear_worker_metrics(): + """ + Clear the RSS memory usage metric for the current worker process. + This should be called when a worker process is shutting down to prevent stale metrics. + """ + current_pid = os.getpid() + try: + flagsmith_worker_rss_bytes.remove(pid=str(current_pid)) + except (KeyError, ValueError): + pass \ No newline at end of file From 59255ae16a84dd1810533c2438c9cf2ce6593137 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Thu, 7 May 2026 22:41:33 -0700 Subject: [PATCH 09/22] fixed duplicate imports --- api/metrics/worker_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index d6a914a56e8d..5ce38b308aba 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,7 +1,5 @@ from pathlib import Path from typing import Iterable -from pathlib import Path -from typing import Iterable import prometheus_client import os From 09b996e3bcb18af6d2de2212da490ed22c95f1c7 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Thu, 7 May 2026 23:21:53 -0700 Subject: [PATCH 10/22] re-arranged update_ and clear_ functions to be adjacent --- api/metrics/worker_metrics.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 5ce38b308aba..2c9e1d50669c 100644 --- a/api/metrics/worker_metrics.py +++ 
b/api/metrics/worker_metrics.py @@ -24,6 +24,17 @@ def update_worker_metrics(): if rss_value is not None: flagsmith_worker_rss_bytes.labels(pid=str(current_pid)).set(rss_value) +def clear_worker_metrics(): + """ + Clear the RSS memory usage metric for the current worker process. + This should be called when a worker process is shutting down to prevent stale metrics. + """ + current_pid = os.getpid() + try: + flagsmith_worker_rss_bytes.remove(pid=str(current_pid)) + except (KeyError, ValueError): + pass + def get_current_process_max_rss_bytes() -> int | None: try: proc_status_lines = PROC_SELF_STATUS_PATH.read_text( @@ -70,13 +81,3 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: return memory_kb -def clear_worker_metrics(): - """ - Clear the RSS memory usage metric for the current worker process. - This should be called when a worker process is shutting down to prevent stale metrics. - """ - current_pid = os.getpid() - try: - flagsmith_worker_rss_bytes.remove(pid=str(current_pid)) - except (KeyError, ValueError): - pass \ No newline at end of file From 6c7b3f60fe2e475702739ca83c3a963768f0ad90 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Thu, 7 May 2026 23:22:46 -0700 Subject: [PATCH 11/22] added unit tests for update_ and clear_worker_metrics --- .../unit/metrics/test_unit_worker_metrics.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/api/tests/unit/metrics/test_unit_worker_metrics.py b/api/tests/unit/metrics/test_unit_worker_metrics.py index 164dc0651201..f638e264d26b 100644 --- a/api/tests/unit/metrics/test_unit_worker_metrics.py +++ b/api/tests/unit/metrics/test_unit_worker_metrics.py @@ -4,6 +4,30 @@ from metrics import worker_metrics +class MockGaugeLabels: + def __init__(self): + self.set_called_with = None + + def set(self, value): + self.set_called_with = value + + +class MockGauge: + def __init__(self): + self.labels_called_with = None + self.remove_called_with = None + 
self.mock_labels = MockGaugeLabels() + self.should_raise_on_remove = None + + def labels(self, pid): + self.labels_called_with = pid + return self.mock_labels + + def remove(self, pid): + self.remove_called_with = pid + if self.should_raise_on_remove: + raise self.should_raise_on_remove + class UnreadableStatusPath: def read_text(self, encoding: str) -> str: @@ -121,3 +145,91 @@ def test_get_current_process_max_rss_bytes__status_file_read_error__returns_none # Then assert result is None + +def test_update_worker_metrics__rss_available__updates_gauge( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + mock_rss = 1048576 # 1 MB + mock_pid = 12345 + mock_gauge = MockGauge() + + monkeypatch.setattr(worker_metrics, "get_current_process_max_rss_bytes", lambda: mock_rss) + monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) + monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) + + # When + worker_metrics.update_worker_metrics() + + # Then + assert mock_gauge.labels_called_with == str(mock_pid) + assert mock_gauge.mock_labels.set_called_with == mock_rss + +def test_update_worker_metrics__rss_none__does_not_update_gauge( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + mock_pid = 12345 + mock_gauge = MockGauge() + + monkeypatch.setattr(worker_metrics, "get_current_process_max_rss_bytes", lambda: None) + monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) + monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) + + # When + worker_metrics.update_worker_metrics() + + # Then + assert mock_gauge.labels_called_with is None + +def test_clear_worker_metrics__removes_gauge_label( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + mock_pid = 67890 + mock_gauge = MockGauge() + + monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) + monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) + + # When + worker_metrics.clear_worker_metrics() + + # 
Then + assert mock_gauge.remove_called_with == str(mock_pid) + + +def test_clear_worker_metrics__keyerror__silently_handles( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + mock_pid = 67890 + mock_gauge = MockGauge() + mock_gauge.should_raise_on_remove = KeyError("Label not found") + + monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) + monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) + + # When/Then (should not raise) + worker_metrics.clear_worker_metrics() + + # Then + assert mock_gauge.remove_called_with == str(mock_pid) + + +def test_clear_worker_metrics__valueerror__silently_handles( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Given + mock_pid = 67890 + mock_gauge = MockGauge() + mock_gauge.should_raise_on_remove = ValueError("Invalid label") + + monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) + monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) + + # When/Then (should not raise) + worker_metrics.clear_worker_metrics() + + # Then + assert mock_gauge.remove_called_with == str(mock_pid) \ No newline at end of file From 1738ef71b47b683f65815fa5c0afe25552990ee2 Mon Sep 17 00:00:00 2001 From: AAshGray <193518051+AAshGray@users.noreply.github.com> Date: Thu, 7 May 2026 23:46:46 -0700 Subject: [PATCH 12/22] minor correction to test so mock mirrors gauge syntax more accurately --- api/tests/unit/metrics/test_unit_worker_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/tests/unit/metrics/test_unit_worker_metrics.py b/api/tests/unit/metrics/test_unit_worker_metrics.py index f638e264d26b..4b3698fd3900 100644 --- a/api/tests/unit/metrics/test_unit_worker_metrics.py +++ b/api/tests/unit/metrics/test_unit_worker_metrics.py @@ -19,11 +19,11 @@ def __init__(self): self.mock_labels = MockGaugeLabels() self.should_raise_on_remove = None - def labels(self, pid): + def labels(self, *, pid): self.labels_called_with = pid return 
self.mock_labels - def remove(self, pid): + def remove(self, *, pid): self.remove_called_with = pid if self.should_raise_on_remove: raise self.should_raise_on_remove From c7113e5db099cb1b30c7b70e4f3ab6aac17f7e54 Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Fri, 8 May 2026 14:03:03 -0700 Subject: [PATCH 13/22] adjusted gauge description --- api/metrics/worker_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 2c9e1d50669c..83aa41d413a3 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -9,7 +9,7 @@ flagsmith_worker_rss_bytes = prometheus_client.Gauge( "flagsmith_worker_rss_bytes", - "Resident Set Size (RSS) of the worker process in bytes.", + "Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status.", ["pid"], multiprocess_mode="liveall" ) From 0a89c75d249a40e08962ac82ab70e89d2938dea0 Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Fri, 8 May 2026 14:09:17 -0700 Subject: [PATCH 14/22] added return types for mypy to pass --- api/metrics/worker_metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 83aa41d413a3..92e015f29fa3 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -14,9 +14,9 @@ multiprocess_mode="liveall" ) -def update_worker_metrics(): +def update_worker_metrics() -> None: """ - Background loop that runs inside each worker process to update the RSS memory usage metric. + Update the RSS gauge with the current worker process high-water mark. """ current_pid = os.getpid() @@ -24,7 +24,7 @@ def update_worker_metrics(): if rss_value is not None: flagsmith_worker_rss_bytes.labels(pid=str(current_pid)).set(rss_value) -def clear_worker_metrics(): +def clear_worker_metrics() -> None: """ Clear the RSS memory usage metric for the current worker process. 
This should be called when a worker process is shutting down to prevent stale metrics. From ef716d6260d52f17f1e3602768bfa2a2b8a8a71f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 21:14:27 +0000 Subject: [PATCH 15/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- api/metrics/worker_metrics.py | 9 ++++++--- api/tests/unit/metrics/test_unit_worker_metrics.py | 14 +++++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/api/metrics/worker_metrics.py b/api/metrics/worker_metrics.py index 92e015f29fa3..ce6d1e9a705e 100644 --- a/api/metrics/worker_metrics.py +++ b/api/metrics/worker_metrics.py @@ -1,7 +1,8 @@ +import os from pathlib import Path from typing import Iterable + import prometheus_client -import os PROC_SELF_STATUS_PATH = Path("/proc/self/status") MAX_RSS_KB_TO_BYTES = 1024 @@ -11,9 +12,10 @@ "flagsmith_worker_rss_bytes", "Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status.", ["pid"], - multiprocess_mode="liveall" + multiprocess_mode="liveall", ) + def update_worker_metrics() -> None: """ Update the RSS gauge with the current worker process high-water mark. @@ -24,6 +26,7 @@ def update_worker_metrics() -> None: if rss_value is not None: flagsmith_worker_rss_bytes.labels(pid=str(current_pid)).set(rss_value) + def clear_worker_metrics() -> None: """ Clear the RSS memory usage metric for the current worker process. 
@@ -35,6 +38,7 @@ def clear_worker_metrics() -> None: except (KeyError, ValueError): pass + def get_current_process_max_rss_bytes() -> int | None: try: proc_status_lines = PROC_SELF_STATUS_PATH.read_text( @@ -80,4 +84,3 @@ def _parse_proc_status_memory_kb(value: str) -> int | None: return None return memory_kb - diff --git a/api/tests/unit/metrics/test_unit_worker_metrics.py b/api/tests/unit/metrics/test_unit_worker_metrics.py index 4b3698fd3900..ac1e8003086f 100644 --- a/api/tests/unit/metrics/test_unit_worker_metrics.py +++ b/api/tests/unit/metrics/test_unit_worker_metrics.py @@ -4,6 +4,7 @@ from metrics import worker_metrics + class MockGaugeLabels: def __init__(self): self.set_called_with = None @@ -146,6 +147,7 @@ def test_get_current_process_max_rss_bytes__status_file_read_error__returns_none # Then assert result is None + def test_update_worker_metrics__rss_available__updates_gauge( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -154,7 +156,9 @@ def test_update_worker_metrics__rss_available__updates_gauge( mock_pid = 12345 mock_gauge = MockGauge() - monkeypatch.setattr(worker_metrics, "get_current_process_max_rss_bytes", lambda: mock_rss) + monkeypatch.setattr( + worker_metrics, "get_current_process_max_rss_bytes", lambda: mock_rss + ) monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) @@ -165,6 +169,7 @@ def test_update_worker_metrics__rss_available__updates_gauge( assert mock_gauge.labels_called_with == str(mock_pid) assert mock_gauge.mock_labels.set_called_with == mock_rss + def test_update_worker_metrics__rss_none__does_not_update_gauge( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -172,7 +177,9 @@ def test_update_worker_metrics__rss_none__does_not_update_gauge( mock_pid = 12345 mock_gauge = MockGauge() - monkeypatch.setattr(worker_metrics, "get_current_process_max_rss_bytes", lambda: None) + monkeypatch.setattr( + worker_metrics, 
"get_current_process_max_rss_bytes", lambda: None + ) monkeypatch.setattr(worker_metrics.os, "getpid", lambda: mock_pid) monkeypatch.setattr(worker_metrics, "flagsmith_worker_rss_bytes", mock_gauge) @@ -182,6 +189,7 @@ def test_update_worker_metrics__rss_none__does_not_update_gauge( # Then assert mock_gauge.labels_called_with is None + def test_clear_worker_metrics__removes_gauge_label( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -232,4 +240,4 @@ def test_clear_worker_metrics__valueerror__silently_handles( worker_metrics.clear_worker_metrics() # Then - assert mock_gauge.remove_called_with == str(mock_pid) \ No newline at end of file + assert mock_gauge.remove_called_with == str(mock_pid) From 4a3a7a69e188850912ad631d6670383539bf8099 Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Fri, 8 May 2026 15:19:11 -0700 Subject: [PATCH 16/22] implemented django worker rss middleware to update memory gauge --- api/core/middleware/worker_rss.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 api/core/middleware/worker_rss.py diff --git a/api/core/middleware/worker_rss.py b/api/core/middleware/worker_rss.py new file mode 100644 index 000000000000..cd883f5df9ec --- /dev/null +++ b/api/core/middleware/worker_rss.py @@ -0,0 +1,16 @@ +from django.http import HttpRequest, HttpResponse + +from metrics.worker_metrics import update_worker_metrics + + +class WorkerRSSMiddleware: + def __init__(self, get_response): # type: ignore[no-untyped-def] + self.get_response = get_response + + def __call__(self, request: HttpRequest) -> HttpResponse: + response = self.get_response(request) + try: + update_worker_metrics() + except Exception: + pass + return response From 55752b068f12d8cfa1480df4b0341c1d08c1df76 Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Fri, 8 May 2026 15:21:48 -0700 Subject: [PATCH 17/22] add WorkerRSSMiddleware in middleware stack --- api/app/settings/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/app/settings/common.py 
b/api/app/settings/common.py index 634c27cf9bd5..6cbb468e6a9c 100644 --- a/api/app/settings/common.py +++ b/api/app/settings/common.py @@ -364,6 +364,7 @@ "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", "simple_history.middleware.HistoryRequestMiddleware", + "core.middleware.worker_rss.WorkerRSSMiddleware", ] ADD_NEVER_CACHE_HEADERS = env.bool("ADD_NEVER_CACHE_HEADERS", True) From 8533a22188647bb4f999c8ec0e3bf9cb3aa5207f Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Sat, 9 May 2026 18:47:16 -0700 Subject: [PATCH 18/22] implemented tests for middleware --- .../test_unit_core_middleware_worker_rss.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 api/tests/unit/core/middleware/test_unit_core_middleware_worker_rss.py diff --git a/api/tests/unit/core/middleware/test_unit_core_middleware_worker_rss.py b/api/tests/unit/core/middleware/test_unit_core_middleware_worker_rss.py new file mode 100644 index 000000000000..f9b4bb71fd64 --- /dev/null +++ b/api/tests/unit/core/middleware/test_unit_core_middleware_worker_rss.py @@ -0,0 +1,53 @@ +from django.http import HttpResponse + +from core.middleware.worker_rss import WorkerRSSMiddleware + + +def test_worker_rss_middleware__any_request__calls_update_after_response(mocker): # type: ignore[no-untyped-def] + # Given + call_order = [] + + def fake_get_response(request): # type: ignore[no-untyped-def] + call_order.append("handled") + return HttpResponse() + + mocker.patch( + "core.middleware.worker_rss.update_worker_metrics", + side_effect=lambda: call_order.append("updated"), + ) + middleware = WorkerRSSMiddleware(fake_get_response) + + # When + middleware(mocker.MagicMock()) + + # Then — metric must be updated after the request is handled, not before + assert call_order == ["handled", "updated"] + + +def test_worker_rss_middleware__any_request__returns_response_unchanged(mocker): # type: ignore[no-untyped-def] + # Given + 
expected_response = HttpResponse(status=200) + mocker.patch("core.middleware.worker_rss.update_worker_metrics") + middleware = WorkerRSSMiddleware(lambda _request: expected_response) + + # When + result = middleware(mocker.MagicMock()) + + # Then + assert result is expected_response + + +def test_worker_rss_middleware__update_raises__request_still_completes(mocker): # type: ignore[no-untyped-def] + # Given + expected_response = HttpResponse(status=200) + mocker.patch( + "core.middleware.worker_rss.update_worker_metrics", + side_effect=Exception("metric failure"), + ) + middleware = WorkerRSSMiddleware(lambda _request: expected_response) + + # When + result = middleware(mocker.MagicMock()) + + # Then — exception is swallowed, response still returned + assert result is expected_response From 058749ea3505661b1fab3e8af7d1d75b986362db Mon Sep 17 00:00:00 2001 From: HumaGitGud Date: Wed, 13 May 2026 14:58:13 -0700 Subject: [PATCH 19/22] fixed hardcoded line into if statement --- api/app/settings/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/api/app/settings/common.py b/api/app/settings/common.py index 6cbb468e6a9c..ffad594e2e63 100644 --- a/api/app/settings/common.py +++ b/api/app/settings/common.py @@ -364,7 +364,6 @@ "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", "simple_history.middleware.HistoryRequestMiddleware", - "core.middleware.worker_rss.WorkerRSSMiddleware", ] ADD_NEVER_CACHE_HEADERS = env.bool("ADD_NEVER_CACHE_HEADERS", True) @@ -1421,6 +1420,9 @@ PROMETHEUS_ENABLED = env.bool("PROMETHEUS_ENABLED", False) +if PROMETHEUS_ENABLED: + MIDDLEWARE.append("core.middleware.worker_rss.WorkerRSSMiddleware") + DOCGEN_MODE = env.bool("DOCGEN_MODE", default=False) REQUIRE_AUTHENTICATION_FOR_API_DOCS = env.bool( From c4867d6f853d70e10110f6c84d4da954c44b7191 Mon Sep 17 00:00:00 2001 From: SketchRudy Date: Wed, 13 May 2026 18:10:10 -0700 Subject: [PATCH 20/22] docs: document 
worker RSS metric and add integration test Story #4 closes out the documentation and verification work for the flagsmith_worker_rss_bytes gauge (Flagsmith#6814). - New operator guide at docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md covering enabling, PromQL examples, Grafana panel suggestions, and high-water-mark interpretation notes. - Cross-link added to metrics.mdx so the guide is discoverable from the metrics index page. - Corrected the stale catalogue description for flagsmith_worker_rss_bytes to match the post-PR-#3 Python docstring (high-water mark / VmHWM). - Integration test in api/tests/integration/core/ exercises the full path: request through WorkerRSSMiddleware, gauge update, registry scrape. Satisfies story #3 AC #5 escape hatch. - Temporary scaffold note at docs/development/ documents Windows limitations encountered and follow-ups for the team. --- ...test_integration_core_worker_rss_metric.py | 46 +++++++ .../worker-rss-story-4-verification.md | 126 +++++++++++++++++ .../observability/_metrics-catalogue.md | 2 +- .../observability/metrics.mdx | 9 +- .../observability/worker-rss-monitoring.md | 129 ++++++++++++++++++ 5 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 api/tests/integration/core/test_integration_core_worker_rss_metric.py create mode 100644 docs/development/worker-rss-story-4-verification.md create mode 100644 docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md diff --git a/api/tests/integration/core/test_integration_core_worker_rss_metric.py b/api/tests/integration/core/test_integration_core_worker_rss_metric.py new file mode 100644 index 000000000000..4ee0de31dfa0 --- /dev/null +++ b/api/tests/integration/core/test_integration_core_worker_rss_metric.py @@ -0,0 +1,46 @@ +import os + +from django.conf import settings as django_settings +from django.test import Client, override_settings +from prometheus_client import REGISTRY, generate_latest +from pytest_mock import 
MockerFixture + +from metrics.worker_metrics import clear_worker_metrics, flagsmith_worker_rss_bytes + + +@override_settings( + MIDDLEWARE=[ + *django_settings.MIDDLEWARE, + "core.middleware.worker_rss.WorkerRSSMiddleware", + ] +) +def test_worker_rss_metric__request_through_middleware__appears_in_prometheus_output( + client: Client, + mocker: MockerFixture, +) -> None: + # Given - deterministic RSS reading so the test is independent of /proc availability + # on macOS/Windows CI runners. + expected_rss = 12_345_678 + mocker.patch( + "metrics.worker_metrics.get_current_process_max_rss_bytes", + return_value=expected_rss, + ) + + # When - any cheap, known-reachable endpoint trips the middleware after response. + response = client.get("/api/v1/swagger.json", HTTP_ACCEPT="application/json") + + # Then - the response is unaffected by the middleware, and the gauge is exposed + # with a sample for the current worker's PID via the Prometheus exposition format. + assert response.status_code == 200 + output = generate_latest(REGISTRY).decode() + assert "flagsmith_worker_rss_bytes" in output + assert f'pid="{os.getpid()}"' in output + + +def teardown_function(function: object) -> None: + # Prevent labelled-child leakage to other tests in the same xdist worker by removing + # this PID's sample after each test. Uses the existing module API. + try: + clear_worker_metrics() + except Exception: + pass diff --git a/docs/development/worker-rss-story-4-verification.md b/docs/development/worker-rss-story-4-verification.md new file mode 100644 index 000000000000..9f367cec52b5 --- /dev/null +++ b/docs/development/worker-rss-story-4-verification.md @@ -0,0 +1,126 @@ +# Story #4 — Worker RSS Documentation and Verification + +This is a temporary scaffold note recording the work and findings of Story #4. 
Once the team agrees on the final shape, +the content should be folded into `docs/docs/...` (already partially done — see Deliverables below) and this file +removed before the final upstream PR. + +## Goal + +Story #4 closes out the issue [Flagsmith#6814](https://github.com/Flagsmith/flagsmith/issues/6814) work by: + +1. Documenting the `flagsmith_worker_rss_bytes` Prometheus gauge for operators. +2. Adding PromQL and Grafana guidance so the metric is actionable in dashboards. +3. Adding an integration test that proves the metric is exposed through the request → middleware → `/metrics` path + (story #3 only shipped unit tests, so story #4's AC #5 escape hatch applied). +4. Recording verification results for the team. + +## What changed + +| Deliverable | File | Status | +| ------------------------ | -------------------------------------------------------------------------- | ---------------------------------------------------------------- | +| A. Operator docs page | `docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md` | Created | +| A. Cross-link from index | `docs/docs/deployment-self-hosting/observability/metrics.mdx` | Section "Monitoring worker memory" added | +| B. Catalogue description | `docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md` | Updated to match the corrected Python docstring (VmHWM phrasing) | +| C. Integration test | `api/tests/integration/core/test_integration_core_worker_rss_metric.py` | Created | +| D. Local verification | n/a | Blocked locally — see caveats below | +| E. This note | `docs/development/worker-rss-story-4-verification.md` | Created | + +## Verification + +### Catalogue regeneration (Deliverable B) + +The auto-generation pipeline (`cd api && make generate-docs`) is the canonical way to update `_metrics-catalogue.md`, +but it requires the API venv to fully load Django, which transitively imports `gunicorn` — and `gunicorn` requires the +POSIX `fcntl` module. 
On Windows this fails at import time. To unblock the sprint, the single stale entry was +hand-edited to match the corrected Python docstring at `api/metrics/worker_metrics.py:13`. The change is a one-entry +diff: + +```diff + ### `flagsmith_worker_rss_bytes` + + Gauge. + +-Resident Set Size (RSS) of the worker process in bytes. Used to track memory usage of each worker process. The `pid` label identifies which process the measurement belongs to. ++Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status. + + Labels: + - `pid` +``` + +**Recommended follow-up before the final upstream PR:** run `cd api && make generate-docs` from Linux/CI/WSL to confirm +the hand-edit matches a clean regen. Expected to be identical. + +### Integration test (Deliverable C) + +Test file: `api/tests/integration/core/test_integration_core_worker_rss_metric.py` + +Approach: override `MIDDLEWARE` directly (not `PROMETHEUS_ENABLED`, which is read at settings-module load time and +cannot be toggled mid-test), then hit a known-reachable endpoint and scrape the in-process Prometheus registry. + +To run locally on Linux or in CI: + +```bash +cd api +uv sync --extra dev +uv run pytest tests/integration/core/test_integration_core_worker_rss_metric.py -n0 +``` + +Expected: 1 passed. + +Why `-n0`: prevents pytest-xdist from running the test in a parallel worker, which would change the PID and complicate +the `os.getpid()` assertion. + +### Local verification (Deliverable D) + +**Blocked on Windows.** The two paths require importing the full Django stack: + +- `make serve` (uses `gunicorn`) fails immediately on Windows: `ModuleNotFoundError: No module named 'fcntl'` in + `gunicorn/util.py`. +- `uv run pytest` fails the same way when loading `tests/integration/core/...` — the URL config imports + `common.gunicorn.utils` which imports `gunicorn`. + +The integration test (C) is the canonical proof; it will run on Linux CI. 
The "Sample output" section of the operator +docs page uses a representative Prometheus exposition string that matches the format generated by +`prometheus_client.generate_latest()` — anyone with a Linux box or WSL2 should regenerate it from a live `/metrics` +scrape before merge. + +A teammate with WSL2, macOS, Linux, or Docker Desktop running can complete this verification by: + +```bash +# Spin the whole stack +docker compose up -d + +# Send some traffic +curl -sf http://localhost:8000/api/v1/swagger.json -o /dev/null + +# Scrape the metrics endpoint (port 9100 by default) +curl -s http://localhost:9100/metrics | grep flagsmith_worker_rss_bytes +``` + +Paste the output into the **Sample output** section of `worker-rss-monitoring.md` if it differs materially from the +representative sample currently there. + +## Caveats and follow-ups for the team + +1. **`clear_worker_metrics()` is defined but never called.** Story #2 added a cleanup function for stale per-PID samples + on worker shutdown, but nothing wires it into a `SIGTERM` handler or gunicorn's `worker_exit` hook. Worth a follow-up + issue. +2. **`multiprocess_mode="liveall"` is a no-op locally.** It only matters when `PROMETHEUS_MULTIPROC_DIR` is set, which + the production Docker image does (`/tmp/prometheus`) but local dev does not. +3. **VmHWM inheritance across `exec()`.** Flagged in the operator docs as a known quirk: a freshly spawned worker may + inherit a non-zero baseline from its parent. Worth a doc note has been added. +4. **Windows is not a first-class dev platform for this repo.** Anyone on Windows should plan to run tests, the dev + server, and `make generate-docs` inside WSL2 or a Linux container. Adding a note to `api/README.md` would save future + contributors time — possibly a follow-up. 
+ +## Links + +- Story tracker issue (team board): Story #4 +- Upstream issue: [Flagsmith#6814](https://github.com/Flagsmith/flagsmith/issues/6814) +- Operator docs: + [`docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md`](../docs/deployment-self-hosting/observability/worker-rss-monitoring.md) +- Integration test: + [`api/tests/integration/core/test_integration_core_worker_rss_metric.py`](../../api/tests/integration/core/test_integration_core_worker_rss_metric.py) +- Updated catalogue entry: + [`docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md`](../docs/deployment-self-hosting/observability/_metrics-catalogue.md) + (search for `flagsmith_worker_rss_bytes`) diff --git a/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md b/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md index 9f517f443d49..de40c74ec345 100644 --- a/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md +++ b/docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md @@ -105,7 +105,7 @@ Labels: Gauge. -Resident Set Size (RSS) of the worker process in bytes. Used to track memory usage of each worker process. The `pid` label identifies which process the measurement belongs to. +Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status. Labels: - `pid` diff --git a/docs/docs/deployment-self-hosting/observability/metrics.mdx b/docs/docs/deployment-self-hosting/observability/metrics.mdx index 3becac991b2c..1d02f3b5ad40 100644 --- a/docs/docs/deployment-self-hosting/observability/metrics.mdx +++ b/docs/docs/deployment-self-hosting/observability/metrics.mdx @@ -16,7 +16,12 @@ The metrics provided by Flagsmith are described below. +## Monitoring worker memory + +The `flagsmith_worker_rss_bytes` gauge reports the peak resident-set size for each worker process. 
See the +[Worker RSS monitoring guide](./worker-rss-monitoring) for PromQL examples, Grafana panels, and interpretation notes. + ## StatsD -The Flagsmith WSGI worker emits per-request access log metrics (request counts, durations, HTTP statuses) to StatsD -when configured. See [StatsD](/deployment-self-hosting/observability/monitoring#statsd) for setup. +The Flagsmith WSGI worker emits per-request access log metrics (request counts, durations, HTTP statuses) to StatsD when +configured. See [StatsD](/deployment-self-hosting/observability/monitoring#statsd) for setup. diff --git a/docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md b/docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md new file mode 100644 index 000000000000..2bb4a86e8883 --- /dev/null +++ b/docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md @@ -0,0 +1,129 @@ +--- +title: Worker RSS monitoring +sidebar_position: 15 +description: Track the peak memory of each Flagsmith API worker process with Prometheus and Grafana. +--- + +The `flagsmith_worker_rss_bytes` gauge exposes the peak resident-set size of every API worker process, labelled by +process ID. This is the most reliable signal for detecting workers that grow unboundedly (a leak) versus workers that +grow under load and stabilise. Use this page once you have Prometheus scraping configured — see +[Monitoring](./monitoring) for setup. + +## Overview + +A worker's RSS is the amount of physical memory the operating system currently has mapped for that process. Python-level +profilers tend to miss leaks that live in C extensions, page caches, or the allocator's free lists, so process-level RSS +is often the only reliable signal in production. + +`flagsmith_worker_rss_bytes` reports the **high-water mark** — the peak RSS observed for the worker since it started. +The value is read from the `VmHWM` line of `/proc/self/status`, which the Linux kernel maintains atomically. 
The metric +is updated once per HTTP request handled by the worker. + +The gauge has a single label, `pid`, identifying the worker process. When Flagsmith is deployed with multiple gunicorn +workers, you will see one time series per worker. + +## Enabling + +Set the environment variable: + +```bash +PROMETHEUS_ENABLED=true +``` + +This activates the `WorkerRSSMiddleware` that updates the gauge after each request. No further configuration is required +for single-process deployments. + +### Multi-worker deployments + +To aggregate metrics across gunicorn workers, set `PROMETHEUS_MULTIPROC_DIR` to a writable directory: + +```bash +PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus +``` + +The official Flagsmith Docker image sets this automatically. For bare-metal or custom-container deployments, configure +it yourself; otherwise the `/metrics` endpoint will only report data from whichever worker happened to handle the scrape +request. + +## Sample output + +Scraping `/metrics` on a Flagsmith API with two workers running yields output similar to: + +```text +# HELP flagsmith_worker_rss_bytes Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status. +# TYPE flagsmith_worker_rss_bytes gauge +flagsmith_worker_rss_bytes{pid="1234"} 4.8259072e+07 +flagsmith_worker_rss_bytes{pid="1235"} 5.2215808e+07 +``` + +Each `pid` corresponds to a live worker. Values are in bytes; the example above shows roughly 46 MiB and 50 MiB +respectively. + +## PromQL examples + +Useful queries to drop into dashboards or alerts. 
+ +**Per-worker peak RSS (raw):** + +```promql +flagsmith_worker_rss_bytes +``` + +**Maximum peak across all workers:** + +```promql +max(flagsmith_worker_rss_bytes) +``` + +**Peak per worker over the last hour:** + +```promql +max_over_time(flagsmith_worker_rss_bytes[1h]) +``` + +**Growth indicator — peak RSS now minus peak RSS one hour ago:** + +```promql +flagsmith_worker_rss_bytes - flagsmith_worker_rss_bytes offset 1h +``` + +A consistently positive value across many workers and time windows points to a leak. A value that spikes once after a +deployment and then stays flat is normal — the workers grew under load and levelled off. + +## Grafana panel + +A reasonable starting point for a "Worker memory" panel: + +| Setting | Value | +| ------------- | -------------------------------------------- | +| Visualisation | Time series | +| Query | `flagsmith_worker_rss_bytes` | +| Legend | `{{pid}}` | +| Unit | bytes (IEC) — Grafana renders as KiB/MiB/GiB | +| Stacking | Disabled — each worker is independent | + +Add a second panel showing `max(flagsmith_worker_rss_bytes)` for a single-number overview. + +## Interpretation notes + +The metric is a high-water mark, not a current reading. Understanding the implications avoids false alerts. + +- **The value never decreases for a given PID.** Once a worker has peaked at a particular RSS, the gauge for that PID + will stay at that value until the worker process exits. Recovery is observed through PID rotation: when a worker is + recycled (for example, by gunicorn's `--max-requests` setting or by a deployment), the old PID's time series goes + stale and a new PID appears with a fresh, lower value. +- **Steady high RSS is normal after warm-up.** A worker that loads caches at startup will reach its steady-state peak + quickly and stay there. This appears as a flat line in Grafana, not a leak. 
+- **Periodic large workloads inflate the peak.** If a worker occasionally processes a large payload (for example, a bulk + export), the gauge will pin at that peak for the rest of the worker's lifetime even after the memory has been freed. + Investigate via PID rotation rather than waiting for the value to fall. +- **Leak signature.** A genuine leak shows up as the peak climbing across many worker restarts — every newly forked + worker reaches a higher peak than its predecessor. +- **Quirk: parent-process inheritance.** Workers are forked from a parent process, and on Linux a forked process starts + with a high-water mark equal to the memory already resident at fork time, so a freshly spawned worker reports a + non-zero baseline. Treat the first scrape after a deployment as informational rather than a true zero. + +## Related documentation + +- [Metrics reference](./metrics) — full catalogue of exported Prometheus metrics. +- [Monitoring](./monitoring) — enabling `/metrics` and other vendor integrations. From b5cfac867c82a5516d3655e3a6b2a0bb347c1fef Mon Sep 17 00:00:00 2001 From: SketchRudy Date: Tue, 19 May 2026 12:59:26 -0700 Subject: [PATCH 21/22] chore: remove sprint scaffold notes ahead of upstream submission --- .../worker-metrics-helper-testing.md | 56 -------- .../worker-rss-story-4-verification.md | 126 ------------------ 2 files changed, 182 deletions(-) delete mode 100644 docs/development/worker-metrics-helper-testing.md delete mode 100644 docs/development/worker-rss-story-4-verification.md diff --git a/docs/development/worker-metrics-helper-testing.md b/docs/development/worker-metrics-helper-testing.md deleted file mode 100644 index f46a60d2aa6a..000000000000 --- a/docs/development/worker-metrics-helper-testing.md +++ /dev/null @@ -1,56 +0,0 @@ -# Worker Metrics Helper Testing - -This note covers how to test the Story #1 RSS helper. - -The helper under test is `get_current_process_max_rss_bytes()`.
It reads the -current worker process RSS high-water mark from `VmHWM` in `/proc/self/status`, -converts the Linux `kB` value to bytes, and returns `None` when RSS data cannot -be read safely. - -The helper avoids `resource.getrusage(resource.RUSAGE_SELF).ru_maxrss` because -Linux can preserve that value across `execve`. Reading `VmHWM` from -`/proc/self/status` keeps the high-water mark scoped to the current worker -process, which is the value the 10-second worker metrics updater should export. - -## Run The Focused Unit Tests - -From the API directory: - -```bash -make test opts='tests/unit/metrics/test_unit_worker_metrics.py -n0' -``` - -If the local shell has an invalid `DEBUG` value, set it explicitly: - -```bash -DEBUG=false make test opts='tests/unit/metrics/test_unit_worker_metrics.py -n0' -``` - -## Run Without Docker - -If Docker is unavailable but the API virtualenv is installed: - -```bash -DEBUG=false .venv/bin/pytest tests/unit/metrics/test_unit_worker_metrics.py -n0 -``` - -## Run Lint And Format Checks - -```bash -.venv/bin/ruff check metrics/worker_metrics.py tests/unit/metrics/test_unit_worker_metrics.py -.venv/bin/ruff format --check metrics/worker_metrics.py tests/unit/metrics/test_unit_worker_metrics.py -``` - -## Expected Coverage - -The unit tests cover: - -- successful `VmHWM` collection -- `kB`-to-byte conversion -- leading and trailing whitespace in `/proc/self/status` -- unavailable `/proc/self/status` -- missing `VmHWM` -- invalid negative RSS values -- non-integer RSS values -- unexpected RSS units -- unexpected status-file read errors diff --git a/docs/development/worker-rss-story-4-verification.md b/docs/development/worker-rss-story-4-verification.md deleted file mode 100644 index 9f367cec52b5..000000000000 --- a/docs/development/worker-rss-story-4-verification.md +++ /dev/null @@ -1,126 +0,0 @@ -# Story #4 — Worker RSS Documentation and Verification - -This is a temporary scaffold note recording the work and findings of Story #4. 
Once the team agrees on the final shape, -the content should be folded into `docs/docs/...` (already partially done — see Deliverables below) and this file -removed before the final upstream PR. - -## Goal - -Story #4 closes out the issue [Flagsmith#6814](https://github.com/Flagsmith/flagsmith/issues/6814) work by: - -1. Documenting the `flagsmith_worker_rss_bytes` Prometheus gauge for operators. -2. Adding PromQL and Grafana guidance so the metric is actionable in dashboards. -3. Adding an integration test that proves the metric is exposed through the request → middleware → `/metrics` path - (story #3 only shipped unit tests, so story #4's AC #5 escape hatch applied). -4. Recording verification results for the team. - -## What changed - -| Deliverable | File | Status | -| ------------------------ | -------------------------------------------------------------------------- | ---------------------------------------------------------------- | -| A. Operator docs page | `docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md` | Created | -| A. Cross-link from index | `docs/docs/deployment-self-hosting/observability/metrics.mdx` | Section "Monitoring worker memory" added | -| B. Catalogue description | `docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md` | Updated to match the corrected Python docstring (VmHWM phrasing) | -| C. Integration test | `api/tests/integration/core/test_integration_core_worker_rss_metric.py` | Created | -| D. Local verification | n/a | Blocked locally — see caveats below | -| E. This note | `docs/development/worker-rss-story-4-verification.md` | Created | - -## Verification - -### Catalogue regeneration (Deliverable B) - -The auto-generation pipeline (`cd api && make generate-docs`) is the canonical way to update `_metrics-catalogue.md`, -but it requires the API venv to fully load Django, which transitively imports `gunicorn` — and `gunicorn` requires the -POSIX `fcntl` module. 
On Windows this fails at import time. To unblock the sprint, the single stale entry was -hand-edited to match the corrected Python docstring at `api/metrics/worker_metrics.py:13`. The change is a one-entry -diff: - -```diff - ### `flagsmith_worker_rss_bytes` - - Gauge. - --Resident Set Size (RSS) of the worker process in bytes. Used to track memory usage of each worker process. The `pid` label identifies which process the measurement belongs to. -+Maximum RSS (high-water mark) of the worker process in bytes, read from VmHWM in /proc/self/status. - - Labels: - - `pid` -``` - -**Recommended follow-up before the final upstream PR:** run `cd api && make generate-docs` from Linux/CI/WSL to confirm -the hand-edit matches a clean regen. Expected to be identical. - -### Integration test (Deliverable C) - -Test file: `api/tests/integration/core/test_integration_core_worker_rss_metric.py` - -Approach: override `MIDDLEWARE` directly (not `PROMETHEUS_ENABLED`, which is read at settings-module load time and -cannot be toggled mid-test), then hit a known-reachable endpoint and scrape the in-process Prometheus registry. - -To run locally on Linux or in CI: - -```bash -cd api -uv sync --extra dev -uv run pytest tests/integration/core/test_integration_core_worker_rss_metric.py -n0 -``` - -Expected: 1 passed. - -Why `-n0`: prevents pytest-xdist from running the test in a parallel worker, which would change the PID and complicate -the `os.getpid()` assertion. - -### Local verification (Deliverable D) - -**Blocked on Windows.** The two paths require importing the full Django stack: - -- `make serve` (uses `gunicorn`) fails immediately on Windows: `ModuleNotFoundError: No module named 'fcntl'` in - `gunicorn/util.py`. -- `uv run pytest` fails the same way when loading `tests/integration/core/...` — the URL config imports - `common.gunicorn.utils` which imports `gunicorn`. - -The integration test (C) is the canonical proof; it will run on Linux CI. 
The "Sample output" section of the operator -docs page uses a representative Prometheus exposition string that matches the format generated by -`prometheus_client.generate_latest()` — anyone with a Linux box or WSL2 should regenerate it from a live `/metrics` -scrape before merge. - -A teammate with WSL2, macOS, Linux, or Docker Desktop running can complete this verification by: - -```bash -# Spin the whole stack -docker compose up -d - -# Send some traffic -curl -sf http://localhost:8000/api/v1/swagger.json -o /dev/null - -# Scrape the metrics endpoint (port 9100 by default) -curl -s http://localhost:9100/metrics | grep flagsmith_worker_rss_bytes -``` - -Paste the output into the **Sample output** section of `worker-rss-monitoring.md` if it differs materially from the -representative sample currently there. - -## Caveats and follow-ups for the team - -1. **`clear_worker_metrics()` is defined but never called.** Story #2 added a cleanup function for stale per-PID samples - on worker shutdown, but nothing wires it into a `SIGTERM` handler or gunicorn's `worker_exit` hook. Worth a follow-up - issue. -2. **`multiprocess_mode="liveall"` is a no-op locally.** It only matters when `PROMETHEUS_MULTIPROC_DIR` is set, which - the production Docker image does (`/tmp/prometheus`) but local dev does not. -3. **VmHWM inheritance across `exec()`.** Flagged in the operator docs as a known quirk: a freshly spawned worker may - inherit a non-zero baseline from its parent. A doc note has been added. -4. **Windows is not a first-class dev platform for this repo.** Anyone on Windows should plan to run tests, the dev - server, and `make generate-docs` inside WSL2 or a Linux container. Adding a note to `api/README.md` would save future - contributors time — possibly a follow-up.
- -## Links - -- Story tracker issue (team board): Story #4 -- Upstream issue: [Flagsmith#6814](https://github.com/Flagsmith/flagsmith/issues/6814) -- Operator docs: - [`docs/docs/deployment-self-hosting/observability/worker-rss-monitoring.md`](../docs/deployment-self-hosting/observability/worker-rss-monitoring.md) -- Integration test: - [`api/tests/integration/core/test_integration_core_worker_rss_metric.py`](../../api/tests/integration/core/test_integration_core_worker_rss_metric.py) -- Updated catalogue entry: - [`docs/docs/deployment-self-hosting/observability/_metrics-catalogue.md`](../docs/deployment-self-hosting/observability/_metrics-catalogue.md) - (search for `flagsmith_worker_rss_bytes`) From 3d83f7ab522cbbe9d7417e8aea431ace2d1c6dc9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 May 2026 20:19:16 +0000 Subject: [PATCH 22/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../integration/core/test_integration_core_worker_rss_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/tests/integration/core/test_integration_core_worker_rss_metric.py b/api/tests/integration/core/test_integration_core_worker_rss_metric.py index 4ee0de31dfa0..92f2c06d0817 100644 --- a/api/tests/integration/core/test_integration_core_worker_rss_metric.py +++ b/api/tests/integration/core/test_integration_core_worker_rss_metric.py @@ -5,7 +5,7 @@ from prometheus_client import REGISTRY, generate_latest from pytest_mock import MockerFixture -from metrics.worker_metrics import clear_worker_metrics, flagsmith_worker_rss_bytes +from metrics.worker_metrics import clear_worker_metrics @override_settings(