From 2b2116e441b044376c5a19b0bb0d82a99f2b089f Mon Sep 17 00:00:00 2001 From: yiguolei Date: Thu, 11 Jun 2026 15:55:30 +0800 Subject: [PATCH] [improvement](cgroup) inactive_file should be treated as available memory to avoid query be cancelled (#64347) Issue Number: close #xxx Related PR: #xxx Problem Summary: Sometimes we may see this errors in cgroup or k8s environment. Allocator sys memory check failed: Cannot alloc:5343, ...,process memory used 85.41 GB exceed limit 108.00 GB or sys available memory 5.88 GB less than low water mark 6.00 GB. The mem_limit term is false (85.41 < 108). The 5343-byte allocation is rejected only by sys available memory 5.88 GB < low water mark 6.00 GB. 5.88 GiB available implies cgroup_mem_usage of about 114 GiB, roughly 29 GiB above process memory used (85.41 GiB); that gap is unmapped read page cache. The kernel reclaims clean page cache before OOM, so the memory is available, but Doris cannot reclaim it and the rejection repeats on later allocations. (low water mark 6.00 GB is the default: min(120 - 108, 120 * 5%) = 6.) Before this PR, cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable. So some active files page cache is not treated as recycleable memory. So cgroup_mem_usage is a bit larger than RSS. None - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/common/cgroup_memory_ctl.cpp | 34 ++++++++++++++++++++++++----- be/src/util/cgroup_util.cpp | 7 ++++++ be/src/util/mem_info.cpp | 16 ++++++-------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp index dddcbd50338d82..9a70a615dcc5a9 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } + // This means no limit, for example, all process in linux will belong to a cgroup, and + // the default value of the memory limit in memory.max file is "max", which means no limit. if (line == "max") { *value = std::numeric_limits::max(); return Status::OK(); @@ -107,15 +109,37 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { std::unordered_map metrics_map; CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), metrics_map); - if (*value < metrics_map["inactive_file"]) { - return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); + int64_t inactive_file = + metrics_map.contains("inactive_file") ? metrics_map["inactive_file"] : 0; + int64_t active_file = metrics_map.contains("active_file") ? metrics_map["active_file"] : 0; + int64_t slab_reclaimable = + metrics_map.contains("slab_reclaimable") ? metrics_map["slab_reclaimable"] : 0; + if (inactive_file < 0 || active_file < 0 || slab_reclaimable < 0) { + // In this scenario, not return error, ignore it and print log. + LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing expected metrics in " + "memory.stat, inactive_file: " + << inactive_file << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + return Status::OK(); + } + + const int64_t reclaimable_usage = inactive_file + active_file + slab_reclaimable; + if (*value < reclaimable_usage) { + LOG(WARNING) + << "CgroupsV2Reader read_memory_usage negative memory usage, not - reclaimable " + "usage any more, just return memory.current: " + << *value << ", inactive_file: " << inactive_file + << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + // In this case, do not return an error, just ignore the negative usage and continue. + // If return error, the upper system will use os available memory instead of cgroup available memory, which may cause OOM more easily. + return Status::OK(); } - // the reason why we subtract inactive_file described here: + // The reclaimable file cache described here should not be counted as used memory: // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 - *value -= metrics_map["inactive_file"]; // Part of "slab" that might be reclaimed, such as dentries and inodes. // https://arthurchiao.art/blog/cgroupv2-zh/ - *value -= metrics_map["slab_reclaimable"]; + *value -= reclaimable_usage; return Status::OK(); } diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index 5b0b55b183cea1..a5ee540293c08e 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -177,6 +177,9 @@ std::string CGroupUtil::cgroupv2_of_process() { } // With cgroups v2, there will be a *single* line with prefix "0::/" // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + // such as 0::/user.slice/user-1005.slice/session-213906.scope this is the cgroup name + // it should be combined with the default cgroup mount point to get the full path to the cgroup, e.g. + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope std::string cgroup; std::getline(cgroup_name_file, cgroup); static const std::string v2_prefix = "0::/"; @@ -197,6 +200,7 @@ std::optional CGroupUtil::get_cgroupsv2_path(const std::string& sub } std::string cgroup = CGroupUtil::cgroupv2_of_process(); + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); // Return the bottom-most nested current memory file. If there is no such file at the current @@ -258,6 +262,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file( metrics_map[key] = value; } else if (fields[2] == "kB") { metrics_map[key] = value * 1024L; + } else { + LOG(WARNING) << "Unknown unit in cgroup file " << file_path.string() + << ", line: " << line; } } } diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index bd6f95f5c56262..9bf36d16c45ef7 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() { if (meminfo.is_open()) { meminfo.close(); } - + _s_cgroup_mem_refresh_state = false; // refresh cgroup memory if (config::enable_use_cgroup_memory_info) { if (_s_cgroup_mem_refresh_wait_times >= 0) { @@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() { // cgroup mem limit is refreshed every 10 seconds, // cgroup mem usage is refreshed together with memInfo every time, which is very frequent. + // If _s_cgroup_mem_limit == max, it means get cgroup mem limit failed OR the cgroup has no memory limit for example + // there is just "max" in memory.max file. if (_s_cgroup_mem_limit != std::numeric_limits::max()) { int64_t cgroup_mem_usage; auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok()) { _s_cgroup_mem_usage = std::numeric_limits::min(); - _s_cgroup_mem_refresh_state = false; LOG_EVERY_N(WARNING, 500) << "Refresh cgroup memory usage failed, cgroup mem limit: " << _s_cgroup_mem_limit << ", " << status; @@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_usage = cgroup_mem_usage; _s_cgroup_mem_refresh_state = true; } - } else { - _s_cgroup_mem_refresh_state = false; } - } else { - _s_cgroup_mem_refresh_state = false; } // 1. calculate physical_mem int64_t physical_mem = -1; - - physical_mem = _mem_info_bytes["MemTotal"]; + if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) { + physical_mem = _mem_info_bytes["MemTotal"]; + } if (_s_cgroup_mem_refresh_state) { // In theory, always cgroup_mem_limit < physical_mem if (physical_mem < 0) { @@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() { // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`, // from `MemAvailable` in `/proc/meminfo`, calculated by OS. // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`, - // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact, + // `cgroup_mem_usage = memory.current - inactive_file - active_file - slab_reclaimable`, in fact, // there seems to be some memory that can be reused in `cgroup_mem_usage`. if (mem_available < 0) { mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;