From 4c51702c3ac1ce40474c227d0c890b5f9153d32e Mon Sep 17 00:00:00 2001 From: somaz Date: Thu, 11 Jun 2026 15:53:18 +0900 Subject: [PATCH] feat(host_metrics source): add temperature metrics collector --- .../21389_host_metrics_temperature.feature.md | 11 ++++ src/sources/host_metrics/mod.rs | 10 ++- src/sources/host_metrics/temperature.rs | 66 +++++++++++++++++++ .../sources/generated/host_metrics.cue | 21 +++--- .../components/sources/host_metrics.cue | 17 +++++ 5 files changed, 114 insertions(+), 11 deletions(-) create mode 100644 changelog.d/21389_host_metrics_temperature.feature.md create mode 100644 src/sources/host_metrics/temperature.rs diff --git a/changelog.d/21389_host_metrics_temperature.feature.md b/changelog.d/21389_host_metrics_temperature.feature.md new file mode 100644 index 0000000000000..c3593214d9665 --- /dev/null +++ b/changelog.d/21389_host_metrics_temperature.feature.md @@ -0,0 +1,11 @@ +The `host_metrics` source can now collect hardware temperature readings via a +new `temperature` collector. When enabled, it emits `temperature_celsius`, +`temperature_max_celsius`, and `temperature_critical_celsius` gauges, each +tagged with the `component` label of the sensor it was read from. + +The collector is opt-in: add `temperature` to the `collectors` list to enable +it. When a component does not report a given value (for example, a missing +critical threshold), only that metric is omitted, and environments without +temperature sensors simply produce no metrics. + +authors: somaz94 diff --git a/src/sources/host_metrics/mod.rs b/src/sources/host_metrics/mod.rs index 0052d0d0b8b98..d0350749ce185 100644 --- a/src/sources/host_metrics/mod.rs +++ b/src/sources/host_metrics/mod.rs @@ -40,6 +40,7 @@ mod network; mod process; #[cfg(target_os = "linux")] mod tcp; +mod temperature; /// Collector types. #[serde_as] @@ -78,6 +79,9 @@ pub enum Collector { /// Metrics related to TCP connections. TCP, + + /// Metrics related to component temperatures. 
+ Temperature, } /// Filtering configuration. @@ -186,7 +190,7 @@ pub fn default_namespace() -> Option { Some(String::from("host")) } -const fn example_collectors() -> [&'static str; 9] { +const fn example_collectors() -> [&'static str; 10] { [ "cgroups", "cpu", @@ -197,6 +201,7 @@ const fn example_collectors() -> [&'static str; 9] { "memory", "network", "tcp", + "temperature", ] } @@ -420,6 +425,9 @@ impl HostMetrics { if self.config.has_collector(Collector::TCP) { self.tcp_metrics(&mut buffer).await; } + if self.config.has_collector(Collector::Temperature) { + self.temperature_metrics(&mut buffer).await; + } let metrics = buffer.metrics; self.events_received.emit(CountByteSize( diff --git a/src/sources/host_metrics/temperature.rs b/src/sources/host_metrics/temperature.rs new file mode 100644 index 0000000000000..560addddc3085 --- /dev/null +++ b/src/sources/host_metrics/temperature.rs @@ -0,0 +1,66 @@ +use sysinfo::Components; +use vector_lib::metric_tags; + +use super::HostMetrics; + +const COMPONENT: &str = "component"; +const TEMPERATURE_CELSIUS: &str = "temperature_celsius"; +const TEMPERATURE_MAX_CELSIUS: &str = "temperature_max_celsius"; +const TEMPERATURE_CRITICAL_CELSIUS: &str = "temperature_critical_celsius"; + +impl HostMetrics { + pub async fn temperature_metrics(&self, output: &mut super::MetricsBuffer) { + output.name = "temperature"; + let components = Components::new_with_refreshed_list(); + for component in &components { + let label = component.label(); + let tags = || metric_tags!(COMPONENT => label); + if let Some(temperature) = component.temperature() { + output.gauge(TEMPERATURE_CELSIUS, temperature as f64, tags()); + } + if let Some(max) = component.max() { + output.gauge(TEMPERATURE_MAX_CELSIUS, max as f64, tags()); + } + if let Some(critical) = component.critical() { + output.gauge(TEMPERATURE_CRITICAL_CELSIUS, critical as f64, tags()); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::{ + super::{HostMetrics, HostMetricsConfig, 
MetricsBuffer, tests::all_gauges}, + COMPONENT, + }; + + #[tokio::test] + async fn generates_temperature_metrics() { + let mut buffer = MetricsBuffer::new(None); + HostMetrics::new(HostMetricsConfig::default()) + .temperature_metrics(&mut buffer) + .await; + let metrics = buffer.metrics; + + // Temperature sensors are not exposed in many environments (containers, + // virtual machines, CI runners), so the component list can legitimately + // be empty. When metrics are produced, they must all be gauges named + // `temperature*` and carry the `component` tag. + assert!(all_gauges(&metrics)); + for metric in &metrics { + assert!( + metric.name().starts_with("temperature"), + "unexpected metric name: {}", + metric.name() + ); + assert!( + metric + .tags() + .expect("temperature metric is missing tags") + .contains_key(COMPONENT), + "temperature metric is missing the `component` tag" + ); + } + } +} diff --git a/website/cue/reference/components/sources/generated/host_metrics.cue b/website/cue/reference/components/sources/generated/host_metrics.cue index 64ab2d676ed4d..2a461fdec453e 100644 --- a/website/cue/reference/components/sources/generated/host_metrics.cue +++ b/website/cue/reference/components/sources/generated/host_metrics.cue @@ -80,17 +80,18 @@ generated: components: sources: host_metrics: configuration: { Only available on Linux. """ - cpu: "Metrics related to CPU utilization." - disk: "Metrics related to disk I/O utilization." - filesystem: "Metrics related to filesystem space utilization." - host: "Metrics related to the host." - load: "Metrics related to the system load average." - memory: "Metrics related to memory utilization." - network: "Metrics related to network utilization." - process: "Metrics related to Process utilization." - tcp: "Metrics related to TCP connections." + cpu: "Metrics related to CPU utilization." + disk: "Metrics related to disk I/O utilization." + filesystem: "Metrics related to filesystem space utilization." 
+ host: "Metrics related to the host." + load: "Metrics related to the system load average." + memory: "Metrics related to memory utilization." + network: "Metrics related to network utilization." + process: "Metrics related to Process utilization." + tcp: "Metrics related to TCP connections." + temperature: "Metrics related to component temperatures." } - examples: ["cgroups", "cpu", "disk", "filesystem", "load", "host", "memory", "network", "tcp"] + examples: ["cgroups", "cpu", "disk", "filesystem", "load", "host", "memory", "network", "tcp", "temperature"] } } } diff --git a/website/cue/reference/components/sources/host_metrics.cue b/website/cue/reference/components/sources/host_metrics.cue index 967088c9e24d2..cdf14a5a7f207 100644 --- a/website/cue/reference/components/sources/host_metrics.cue +++ b/website/cue/reference/components/sources/host_metrics.cue @@ -193,6 +193,11 @@ components: sources: host_metrics: { } } + // Host temperature + temperature_celsius: _host & _temperature_gauge & {description: "The current temperature reported by a hardware component, in degrees Celsius."} + temperature_max_celsius: _host & _temperature_gauge & {description: "The highest temperature recorded for a hardware component, in degrees Celsius."} + temperature_critical_celsius: _host & _temperature_gauge & {description: "The temperature at which a hardware component is considered critical, in degrees Celsius."} + // Helpers _host: { default_namespace: "host" @@ -307,5 +312,17 @@ components: sources: host_metrics: { } } } + + _temperature_gauge: { + type: "gauge" + tags: _host_metrics_tags & { + collector: examples: ["temperature"] + component: { + description: "The label of the hardware component the temperature was read from." + required: true + examples: ["Core 0", "coretemp Package id 0", "nvme Composite"] + } + } + } } }