From 46ec45fe50760c760c2ec7ea20ac5f9373ea8647 Mon Sep 17 00:00:00 2001 From: Pervakov Grigorii Date: Wed, 27 May 2026 21:41:21 +0200 Subject: [PATCH 1/4] chore(deps): bump clickhouse-operator-helm version to v0.0.6 --- .changeset/bump-clickhouse-operator-v0.0.6.md | 5 +++++ charts/clickstack-operators/Chart.lock | 6 +++--- charts/clickstack-operators/Chart.yaml | 2 +- charts/clickstack-operators/values.yaml | 6 +++--- integration-tests/full-stack/assert.sh | 4 ++-- 5 files changed, 14 insertions(+), 9 deletions(-) create mode 100644 .changeset/bump-clickhouse-operator-v0.0.6.md diff --git a/.changeset/bump-clickhouse-operator-v0.0.6.md b/.changeset/bump-clickhouse-operator-v0.0.6.md new file mode 100644 index 0000000..228bf81 --- /dev/null +++ b/.changeset/bump-clickhouse-operator-v0.0.6.md @@ -0,0 +1,5 @@ +--- +"helm-charts": patch +--- + +chore(deps): bump clickhouse-operator-helm to v0.0.6 diff --git a/charts/clickstack-operators/Chart.lock b/charts/clickstack-operators/Chart.lock index 469e93c..8d12391 100644 --- a/charts/clickstack-operators/Chart.lock +++ b/charts/clickstack-operators/Chart.lock @@ -4,6 +4,6 @@ dependencies: version: 1.7.0 - name: clickhouse-operator-helm repository: oci://ghcr.io/clickhouse - version: 0.0.2 -digest: sha256:1daf572004da83b1836c8867f11198530652fee6905d4786a2d5eef87bc611cd -generated: "2026-03-04T16:52:51.068188-06:00" + version: 0.0.6 +digest: sha256:5afcb0d78e0ceecf1a18f3f7dfb52ee2627b7acea2621ffa411ee7bfb530adf7 +generated: "2026-06-19T17:41:30.214406+02:00" diff --git a/charts/clickstack-operators/Chart.yaml b/charts/clickstack-operators/Chart.yaml index 86cc5f1..0b6ceee 100644 --- a/charts/clickstack-operators/Chart.yaml +++ b/charts/clickstack-operators/Chart.yaml @@ -13,6 +13,6 @@ dependencies: repository: https://mongodb.github.io/helm-charts alias: mongodb-operator - name: clickhouse-operator-helm - version: "~0.0.2" + version: "~0.0.6" repository: oci://ghcr.io/clickhouse alias: clickhouse-operator diff --git a/charts/clickstack-operators/values.yaml b/charts/clickstack-operators/values.yaml index 85fd44f..701bfd5 100644 --- a/charts/clickstack-operators/values.yaml +++ b/charts/clickstack-operators/values.yaml @@ -9,8 +9,8 @@ mongodb-operator: # See https://clickhouse.com/docs/clickhouse-operator/overview for all options clickhouse-operator: webhook: - enable: false + enabled: false certManager: - enable: false + enabled: false crd: - enable: true + enabled: true diff --git a/integration-tests/full-stack/assert.sh b/integration-tests/full-stack/assert.sh index 2aaf481..579ea0f 100755 --- a/integration-tests/full-stack/assert.sh +++ b/integration-tests/full-stack/assert.sh @@ -12,7 +12,7 @@ echo "Waiting for services to initialize..." sleep 30 echo "Waiting for all pods to be ready..." -kubectl wait --for=condition=Ready pods --all --timeout=600s || true +kubectl wait --for=condition=Ready pods --all --field-selector=status.phase!=Succeeded --timeout=600s || true echo "Pod status:" kubectl get pods -o wide @@ -24,7 +24,7 @@ echo "Checking ClickHouseCluster CR..." kubectl get clickhousecluster -o wide || true echo "Waiting for all pods to be ready (final check)..." -kubectl wait --for=condition=Ready pods --all --timeout=600s +kubectl wait --for=condition=Ready pods --all --field-selector=status.phase!=Succeeded --timeout=600s echo "Final pod status:" kubectl get pods -o wide From a0e16e81d31062a9725fada639dc355d15892d13 Mon Sep 17 00:00:00 2001 From: Warren <5959690+wrn14897@users.noreply.github.com> Date: Mon, 29 Jun 2026 22:52:12 -0700 Subject: [PATCH 2/4] fix(clickhouse): set explicit container resources to prevent OOMKill clickhouse-operator v0.0.6 changed its default resource block to use memory request == limit at 512Mi (operator PR #206). That ceiling is too low for the full ClickStack schema and OOMKills the ClickHouse server under ingestion + background merges, which is what broke the full-stack integration test on the operator bump. Set explicit containerTemplate.resources (2Gi memory, 500m CPU request) so the chart no longer inherits the operator default. Add a unit test asserting the default and a changeset. --- .changeset/clickhouse-explicit-resources.md | 12 ++++++++++++ .../tests/clickhouse-deployment_test.yaml | 14 ++++++++++++++ charts/clickstack/values.yaml | 11 +++++++++++ 3 files changed, 37 insertions(+) create mode 100644 .changeset/clickhouse-explicit-resources.md diff --git a/.changeset/clickhouse-explicit-resources.md b/.changeset/clickhouse-explicit-resources.md new file mode 100644 index 0000000..9640fc7 --- /dev/null +++ b/.changeset/clickhouse-explicit-resources.md @@ -0,0 +1,12 @@ +--- +"helm-charts": patch +--- + +fix(clickhouse): set explicit container resources for the ClickHouse server + +The clickhouse-operator applies a small default resource block (512Mi memory, +request == limit as of operator v0.0.6) when none is provided. That is too low +for the full ClickStack schema (many materialized views) and caused the +ClickHouse server to OOMKill (exit 137) and crash-loop under ingestion plus +background merges. The chart now sets explicit `containerTemplate.resources` +(2Gi memory, 500m CPU request) which can be overridden per environment. diff --git a/charts/clickstack/tests/clickhouse-deployment_test.yaml b/charts/clickstack/tests/clickhouse-deployment_test.yaml index ceef006..44cafaf 100644 --- a/charts/clickstack/tests/clickhouse-deployment_test.yaml +++ b/charts/clickstack/tests/clickhouse-deployment_test.yaml @@ -68,6 +68,20 @@ tests: path: spec.dataVolumeClaimSpec.resources.requests.storage value: 10Gi + - it: should set explicit container resources so the operator default does not OOMKill ClickHouse + templates: + - clickhouse/cluster.yaml + asserts: + - equal: + path: spec.containerTemplate.resources.requests.memory + value: 2Gi + - equal: + path: spec.containerTemplate.resources.limits.memory + value: 2Gi + - equal: + path: spec.containerTemplate.resources.requests.cpu + value: 500m + - it: should resolve keeperClusterRef template expression templates: - clickhouse/cluster.yaml diff --git a/charts/clickstack/values.yaml b/charts/clickstack/values.yaml index edd375c..ec462a7 100644 --- a/charts/clickstack/values.yaml +++ b/charts/clickstack/values.yaml @@ -328,6 +328,17 @@ clickhouse: image: repository: clickhouse/clickhouse-server tag: "25.7-alpine" + # Explicit resources for the ClickHouse server container. Without this + # the clickhouse-operator applies its own small default (512Mi memory, + # with request == limit as of operator v0.0.6), which is too low for the + # full ClickStack schema (many materialized views) and causes the server + # to OOMKill under ingestion + background merges. Override per environment. + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + memory: 2Gi replicas: 1 shards: 1 keeperClusterRef: From ed46ecca65eeb9d9323ba0bc311e4d6bec32a2af Mon Sep 17 00:00:00 2001 From: Warren <5959690+wrn14897@users.noreply.github.com> Date: Mon, 29 Jun 2026 23:38:07 -0700 Subject: [PATCH 3/4] fix(clickhouse): disable operator databaseSync to keep default DB Atomic clickhouse-operator v0.0.6 defaults settings.enableDatabaseSync: true, which creates the `default` database with the Replicated (DatabaseReplicated) engine so table metadata lives in Keeper. That feature targets multi-replica clusters. In this single-replica deployment, a transient Keeper exception during ClickHouse startup (observed: 'Cannot use any of provided ZooKeeper nodes', exit 231) desyncs the Replicated database and silently drops every seeded table -- they never come back, so the otel-collector loops on 'Table default.otel_traces does not exist' and the smoke test's wait_for_table_queryable times out. This is the remaining full-stack integration-test failure on the operator bump. Set enableDatabaseSync: false so `default` stays Atomic and the seeded tables persist on the data volume across restarts. Verified end-to-end on a fresh kind cluster: default DB engine Atomic, all otel_* tables present and queryable, ingestion succeeds. Add a unit test and fold the rationale into the changeset. --- .changeset/clickhouse-explicit-resources.md | 25 +++++++++++++------ .../tests/clickhouse-deployment_test.yaml | 8 ++++++ charts/clickstack/values.yaml | 10 ++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/.changeset/clickhouse-explicit-resources.md b/.changeset/clickhouse-explicit-resources.md index 9640fc7..04e89c4 100644 --- a/.changeset/clickhouse-explicit-resources.md +++ b/.changeset/clickhouse-explicit-resources.md @@ -2,11 +2,20 @@ "helm-charts": patch --- -fix(clickhouse): set explicit container resources for the ClickHouse server - -The clickhouse-operator applies a small default resource block (512Mi memory, -request == limit as of operator v0.0.6) when none is provided. That is too low -for the full ClickStack schema (many materialized views) and caused the -ClickHouse server to OOMKill (exit 137) and crash-loop under ingestion plus -background merges. The chart now sets explicit `containerTemplate.resources` -(2Gi memory, 500m CPU request) which can be overridden per environment. +fix(clickhouse): harden ClickHouse defaults for clickhouse-operator v0.0.6 + +Two operator-default changes in v0.0.6 broke the single-replica ClickHouse +deployment; the chart now overrides both: + +- Explicit `containerTemplate.resources` (2Gi memory, 500m CPU request). The + operator otherwise applies a 512Mi default (request == limit as of v0.0.6), + which is too low for the full ClickStack schema and OOMKills the server + (exit 137) under ingestion plus background merges. + +- `settings.enableDatabaseSync: false`. The operator now defaults this to true, + which creates the `default` database with the Replicated (DatabaseReplicated) + engine so table metadata lives in Keeper. That feature targets multi-replica + clusters; in a single-replica deployment a transient Keeper hiccup during + startup desyncs the Replicated database and silently drops all seeded tables, + which never come back. Keeping `default` Atomic stores tables on the + persistent data volume so they survive restarts. diff --git a/charts/clickstack/tests/clickhouse-deployment_test.yaml b/charts/clickstack/tests/clickhouse-deployment_test.yaml index 44cafaf..b8e252c 100644 --- a/charts/clickstack/tests/clickhouse-deployment_test.yaml +++ b/charts/clickstack/tests/clickhouse-deployment_test.yaml @@ -166,3 +166,11 @@ tests: path: spec.settings.extraUsersConfig.users.app - isNotNull: path: spec.settings.extraUsersConfig.users.otelcollector + + - it: should disable operator databaseSync so the default DB stays Atomic + templates: + - clickhouse/cluster.yaml + asserts: + - equal: + path: spec.settings.enableDatabaseSync + value: false diff --git a/charts/clickstack/values.yaml b/charts/clickstack/values.yaml index ec462a7..e423740 100644 --- a/charts/clickstack/values.yaml +++ b/charts/clickstack/values.yaml @@ -350,6 +350,16 @@ clickhouse: requests: storage: 10Gi settings: + # Disable the operator's database-sync feature (clickhouse-operator + # v0.0.6 defaults enableDatabaseSync: true). When enabled, the operator + # creates the `default` database with the Replicated (DatabaseReplicated) + # engine so table metadata lives in Keeper. That feature targets + # multi-replica clusters; in this single-replica deployment it only adds + # fragility -- a transient Keeper hiccup during startup desyncs the + # Replicated database and silently drops all seeded tables, which never + # come back. Keeping the `default` database Atomic stores tables on the + # persistent data volume so they survive restarts. + enableDatabaseSync: false extraUsersConfig: users: app: From 1399c31300a4f064fc9e6b3a526639c413729841 Mon Sep 17 00:00:00 2001 From: Warren <5959690+wrn14897@users.noreply.github.com> Date: Mon, 29 Jun 2026 23:53:43 -0700 Subject: [PATCH 4/4] docs(clickhouse): trim verbose values comments --- charts/clickstack/values.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/charts/clickstack/values.yaml b/charts/clickstack/values.yaml index e423740..6c296ec 100644 --- a/charts/clickstack/values.yaml +++ b/charts/clickstack/values.yaml @@ -328,11 +328,6 @@ clickhouse: image: repository: clickhouse/clickhouse-server tag: "25.7-alpine" - # Explicit resources for the ClickHouse server container. Without this - # the clickhouse-operator applies its own small default (512Mi memory, - # with request == limit as of operator v0.0.6), which is too low for the - # full ClickStack schema (many materialized views) and causes the server - # to OOMKill under ingestion + background merges. Override per environment. resources: requests: cpu: 500m @@ -350,15 +345,8 @@ clickhouse: requests: storage: 10Gi settings: - # Disable the operator's database-sync feature (clickhouse-operator - # v0.0.6 defaults enableDatabaseSync: true). When enabled, the operator - # creates the `default` database with the Replicated (DatabaseReplicated) - # engine so table metadata lives in Keeper. That feature targets - # multi-replica clusters; in this single-replica deployment it only adds - # fragility -- a transient Keeper hiccup during startup desyncs the - # Replicated database and silently drops all seeded tables, which never - # come back. Keeping the `default` database Atomic stores tables on the - # persistent data volume so they survive restarts. + # Keep the `default` database Atomic; the Replicated engine the operator + # selects when this is true drops seeded tables on Keeper desync. enableDatabaseSync: false extraUsersConfig: users: