From 008b458750bdea19d37de24392f333343ae4959d Mon Sep 17 00:00:00 2001 From: yiguolei Date: Thu, 11 Jun 2026 15:55:30 +0800 Subject: [PATCH] [improvement](cgroup) inactive_file should be treated as available memory to avoid query be cancelled (#64347) ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Sometimes we may see this errors in cgroup or k8s environment. Allocator sys memory check failed: Cannot alloc:5343, ...,process memory used 85.41 GB exceed limit 108.00 GB or sys available memory 5.88 GB less than low water mark 6.00 GB. The mem_limit term is false (85.41 < 108). The 5343-byte allocation is rejected only by sys available memory 5.88 GB < low water mark 6.00 GB. 5.88 GiB available implies cgroup_mem_usage of about 114 GiB, roughly 29 GiB above process memory used (85.41 GiB); that gap is unmapped read page cache. The kernel reclaims clean page cache before OOM, so the memory is available, but Doris cannot reclaim it and the rejection repeats on later allocations. (low water mark 6.00 GB is the default: min(120 - 108, 120 * 5%) = 6.) Before this PR, cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable. So some active files page cache is not treated as recycleable memory. So cgroup_mem_usage is a bit larger than RSS. ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/common/cgroup_memory_ctl.cpp | 34 ++++++++++++++++--- be/src/util/cgroup_util.cpp | 7 ++++ be/src/util/mem_info.cpp | 16 ++++----- .../meta_service_rate_limit_helper.cpp | 2 +- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp index dddcbd50338d82..9a70a615dcc5a9 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } + // This means no limit, for example, all process in linux will belong to a cgroup, and + // the default value of the memory limit in memory.max file is "max", which means no limit. if (line == "max") { *value = std::numeric_limits::max(); return Status::OK(); @@ -107,15 +109,37 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { std::unordered_map metrics_map; CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), metrics_map); - if (*value < metrics_map["inactive_file"]) { - return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); + int64_t inactive_file = + metrics_map.contains("inactive_file") ? metrics_map["inactive_file"] : 0; + int64_t active_file = metrics_map.contains("active_file") ? metrics_map["active_file"] : 0; + int64_t slab_reclaimable = + metrics_map.contains("slab_reclaimable") ? metrics_map["slab_reclaimable"] : 0; + if (inactive_file < 0 || active_file < 0 || slab_reclaimable < 0) { + // In this scenario, not return error, ignore it and print log. + LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing expected metrics in " + "memory.stat, inactive_file: " + << inactive_file << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + return Status::OK(); + } + + const int64_t reclaimable_usage = inactive_file + active_file + slab_reclaimable; + if (*value < reclaimable_usage) { + LOG(WARNING) + << "CgroupsV2Reader read_memory_usage negative memory usage, not - reclaimable " + "usage any more, just return memory.current: " + << *value << ", inactive_file: " << inactive_file + << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + // In this case, do not return an error, just ignore the negative usage and continue. + // If return error, the upper system will use os available memory instead of cgroup available memory, which may cause OOM more easily. + return Status::OK(); } - // the reason why we subtract inactive_file described here: + // The reclaimable file cache described here should not be counted as used memory: // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 - *value -= metrics_map["inactive_file"]; // Part of "slab" that might be reclaimed, such as dentries and inodes. // https://arthurchiao.art/blog/cgroupv2-zh/ - *value -= metrics_map["slab_reclaimable"]; + *value -= reclaimable_usage; return Status::OK(); } diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index cfda0a4fb77d35..cc52a60ed75245 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -178,6 +178,9 @@ std::string CGroupUtil::cgroupv2_of_process() { } // With cgroups v2, there will be a *single* line with prefix "0::/" // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + // such as 0::/user.slice/user-1005.slice/session-213906.scope this is the cgroup name + // it should be combined with the default cgroup mount point to get the full path to the cgroup, e.g. + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope std::string cgroup; std::getline(cgroup_name_file, cgroup); static const std::string v2_prefix = "0::/"; @@ -198,6 +201,7 @@ std::optional CGroupUtil::get_cgroupsv2_path(const std::string& sub } std::string cgroup = CGroupUtil::cgroupv2_of_process(); + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); // Return the bottom-most nested current memory file. If there is no such file at the current @@ -259,6 +263,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file( metrics_map[key] = value; } else if (fields[2] == "kB") { metrics_map[key] = value * 1024L; + } else { + LOG(WARNING) << "Unknown unit in cgroup file " << file_path.string() + << ", line: " << line; } } } diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index 4b48d40d2d93d2..ebd6f4a5442d13 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() { if (meminfo.is_open()) { meminfo.close(); } - + _s_cgroup_mem_refresh_state = false; // refresh cgroup memory if (config::enable_use_cgroup_memory_info) { if (_s_cgroup_mem_refresh_wait_times >= 0) { @@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() { // cgroup mem limit is refreshed every 10 seconds, // cgroup mem usage is refreshed together with memInfo every time, which is very frequent. + // If _s_cgroup_mem_limit == max, it means get cgroup mem limit failed OR the cgroup has no memory limit for example + // there is just "max" in memory.max file. if (_s_cgroup_mem_limit != std::numeric_limits::max()) { int64_t cgroup_mem_usage; auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok()) { _s_cgroup_mem_usage = std::numeric_limits::min(); - _s_cgroup_mem_refresh_state = false; LOG_EVERY_N(WARNING, 500) << "Refresh cgroup memory usage failed, cgroup mem limit: " << _s_cgroup_mem_limit << ", " << status; @@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_usage = cgroup_mem_usage; _s_cgroup_mem_refresh_state = true; } - } else { - _s_cgroup_mem_refresh_state = false; } - } else { - _s_cgroup_mem_refresh_state = false; } // 1. calculate physical_mem int64_t physical_mem = -1; - - physical_mem = _mem_info_bytes["MemTotal"]; + if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) { + physical_mem = _mem_info_bytes["MemTotal"]; + } if (_s_cgroup_mem_refresh_state) { // In theory, always cgroup_mem_limit < physical_mem if (physical_mem < 0) { @@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() { // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`, // from `MemAvailable` in `/proc/meminfo`, calculated by OS. // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`, - // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact, + // `cgroup_mem_usage = memory.current - inactive_file - active_file - slab_reclaimable`, in fact, // there seems to be some memory that can be reused in `cgroup_mem_usage`. if (mem_available < 0) { mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage; diff --git a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp index 0f8c750be6cf3c..9af2aead05cca9 100644 --- a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp +++ b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp @@ -360,7 +360,7 @@ std::optional get_cgroup_memory_info() { } auto metrics = read_metrics_map(*dir / "memory.stat"); int64_t adjusted_usage = *usage; - adjusted_usage -= metrics["inactive_file"]; + adjusted_usage -= metrics["inactive_file"] + metrics["active_file"]; adjusted_usage -= metrics["slab_reclaimable"]; adjusted_usage = std::max(0, adjusted_usage); return CgroupMemoryInfo {limit_bytes, adjusted_usage};