diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp index dddcbd50338d82..9a70a615dcc5a9 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } + // "max" means no limit. For example, every process in Linux belongs to a cgroup, and + // the default value of the memory limit in the memory.max file is "max", which means no limit. if (line == "max") { *value = std::numeric_limits::max(); return Status::OK(); @@ -107,15 +109,37 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { std::unordered_map metrics_map; CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), metrics_map); - if (*value < metrics_map["inactive_file"]) { - return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); + int64_t inactive_file = + metrics_map.contains("inactive_file") ? metrics_map["inactive_file"] : 0; + int64_t active_file = metrics_map.contains("active_file") ? metrics_map["active_file"] : 0; + int64_t slab_reclaimable = + metrics_map.contains("slab_reclaimable") ? metrics_map["slab_reclaimable"] : 0; + if (inactive_file < 0 || active_file < 0 || slab_reclaimable < 0) { + // In this scenario, do not return an error; just ignore it and print a log.
+ LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing expected metrics in " + "memory.stat, inactive_file: " + << inactive_file << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + return Status::OK(); + } + + const int64_t reclaimable_usage = inactive_file + active_file + slab_reclaimable; + if (*value < reclaimable_usage) { + LOG(WARNING) + << "CgroupsV2Reader read_memory_usage negative memory usage, not - reclaimable " + "usage any more, just return memory.current: " + << *value << ", inactive_file: " << inactive_file + << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + // In this case, do not return an error, just ignore the negative usage and continue. + // If an error were returned, the upper layer would use OS available memory instead of cgroup available memory, which may cause OOM more easily. + return Status::OK(); } - // the reason why we subtract inactive_file described here: + // The reclaimable file cache described here should not be counted as used memory: // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 - *value -= metrics_map["inactive_file"]; // Part of "slab" that might be reclaimed, such as dentries and inodes. // https://arthurchiao.art/blog/cgroupv2-zh/ - *value -= metrics_map["slab_reclaimable"]; + *value -= reclaimable_usage; return Status::OK(); } diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index 5b0b55b183cea1..a5ee540293c08e 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -177,6 +177,9 @@ std::string CGroupUtil::cgroupv2_of_process() { } // With cgroups v2, there will be a *single* line with prefix "0::/" // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + // such as 0::/user.slice/user-1005.slice/session-213906.scope, which gives the cgroup name. + // It should be combined with the default cgroup mount point to get the full path to the cgroup, e.g.
+ // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope std::string cgroup; std::getline(cgroup_name_file, cgroup); static const std::string v2_prefix = "0::/"; @@ -197,6 +200,7 @@ std::optional CGroupUtil::get_cgroupsv2_path(const std::string& sub } std::string cgroup = CGroupUtil::cgroupv2_of_process(); + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); // Return the bottom-most nested current memory file. If there is no such file at the current @@ -258,6 +262,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file( metrics_map[key] = value; } else if (fields[2] == "kB") { metrics_map[key] = value * 1024L; + } else { + LOG(WARNING) << "Unknown unit in cgroup file " << file_path.string() + << ", line: " << line; } } } diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index bd6f95f5c56262..9bf36d16c45ef7 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() { if (meminfo.is_open()) { meminfo.close(); } - + _s_cgroup_mem_refresh_state = false; // refresh cgroup memory if (config::enable_use_cgroup_memory_info) { if (_s_cgroup_mem_refresh_wait_times >= 0) { @@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() { // cgroup mem limit is refreshed every 10 seconds, // cgroup mem usage is refreshed together with memInfo every time, which is very frequent. + // If _s_cgroup_mem_limit == max, either getting the cgroup mem limit failed or the cgroup has no memory limit, for example + // when there is just "max" in the memory.max file.
if (_s_cgroup_mem_limit != std::numeric_limits::max()) { int64_t cgroup_mem_usage; auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok()) { _s_cgroup_mem_usage = std::numeric_limits::min(); - _s_cgroup_mem_refresh_state = false; LOG_EVERY_N(WARNING, 500) << "Refresh cgroup memory usage failed, cgroup mem limit: " << _s_cgroup_mem_limit << ", " << status; @@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_usage = cgroup_mem_usage; _s_cgroup_mem_refresh_state = true; } - } else { - _s_cgroup_mem_refresh_state = false; } - } else { - _s_cgroup_mem_refresh_state = false; } // 1. calculate physical_mem int64_t physical_mem = -1; - - physical_mem = _mem_info_bytes["MemTotal"]; + if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) { + physical_mem = _mem_info_bytes["MemTotal"]; + } if (_s_cgroup_mem_refresh_state) { // In theory, always cgroup_mem_limit < physical_mem if (physical_mem < 0) { @@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() { // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`, // from `MemAvailable` in `/proc/meminfo`, calculated by OS. // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`, - // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact, + // `cgroup_mem_usage = memory.current - inactive_file - active_file - slab_reclaimable`, in fact, // there seems to be some memory that can be reused in `cgroup_mem_usage`. if (mem_available < 0) { mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;