From 44c9e625b1d42178e0b8b449f9b8a954b585bdeb Mon Sep 17 00:00:00 2001 From: midu Date: Sun, 22 Feb 2026 15:37:06 +0100 Subject: [PATCH] Add kernel stack cost-per-packet metrics, nodeconfig collector, and AI-Helpers docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ebpf-pmd-jitter collector (Linux): in-tree eBPF program (collector/bpf/latency.c) measures kernel stack packet latency (XDP→TC); exposes latency min/max/avg, jitter, histogram, and collector_up/load_error/object_path_configured. Disabled by default; requires built eBPF object and --collector.ebpf-pmd-jitter.object-path. - Add nodeconfig collector (Linux): runbook-oriented metrics from sysfs and DMI (PCIe NIC link width, slot ok, cores dedicated, memory banks full). Disabled by default. - Add cmd/kernel_stack_stress_server: TCP server for functional test (variable backlog, rcvbuf, read delay, hold connections). - Add kernel_stack_af_packet_functional_test.go: Linux-only root functional test (netns) for conntrack drops, listen overflow, TCPRcvQDrop, traffic+NUMA, traffic+pcap; preserves pcaps under /tmp/node_exporter_kernel_stack_pcaps_*. - Add docs/KERNEL_STACK_AF_PACKET_METRICS.md: full guide correlating metrics with cost per packet and AF_PACKET, optional collectors, examples, and functional test. --- cmd/kernel_stack_stress_server/main.go | 114 ++ collector/bpf/latency.c | 257 +++++ collector/ebpf_pmd_jitter_linux.go | 370 +++++++ collector/nodeconfig_linux.go | 215 ++++ docs/KERNEL_STACK_AF_PACKET_METRICS.md | 544 +++++++++ docs/node-exporter-new-features.excalidraw | 1160 ++++++++++++++++++++ go.mod | 1 + go.sum | 6 +- kernel_stack_af_packet_functional_test.go | 528 +++++++++ 9 files changed, 3193 insertions(+), 2 deletions(-) create mode 100644 cmd/kernel_stack_stress_server/main.go create mode 100644 collector/bpf/latency.c create mode 100644 collector/ebpf_pmd_jitter_linux.go create mode 100644 collector/nodeconfig_linux.go create mode 100644 docs/KERNEL_STACK_AF_PACKET_METRICS.md create mode 100644 docs/node-exporter-new-features.excalidraw create mode 100644 kernel_stack_af_packet_functional_test.go diff --git a/cmd/kernel_stack_stress_server/main.go b/cmd/kernel_stack_stress_server/main.go new file mode 100644 index 0000000000..12a8881d9a --- /dev/null +++ b/cmd/kernel_stack_stress_server/main.go @@ -0,0 +1,114 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"). +// This small server is used by the kernel stack AF_PACKET functional test +// to generate traffic scenarios (conntrack fill, listen overflow, TCP rcvbuf). +// +//go:build linux + +package main + +import ( + "flag" + "fmt" + "log" + "net" + "os" + "sync" + "time" + + "golang.org/x/sys/unix" +) + +var ( + port = flag.Int("port", 9999, "Listen port") + backlog = flag.Int("backlog", 128, "Listen backlog (use 1 for listen-overflow scenario)") + rcvbuf = flag.Int("rcvbuf", 0, "SO_RCVBUF size (use small value for TCPRcvQDrop scenario)") + hold = flag.Int("hold", 0, "Max connections to accept and hold (0 = accept and close; use with conntrack fill)") + sleep = flag.Duration("read-delay", 0, "Delay between reads (use for slow drain / TCPRcvQDrop)") +) + +func main() { + flag.Parse() + var ln net.Listener + var err error + if *backlog == 1 { + ln, err = listenBacklog1(*port) + } else { + ln, err = net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", *port)) + } + if err != nil { + log.Fatal(err) + } + defer ln.Close() + + var held sync.WaitGroup + acceptLimit := *hold + if acceptLimit <= 0 { + acceptLimit = 1 << 30 + } + accepted := 0 + var mu sync.Mutex + for { + conn, err := ln.Accept() + if err != nil { + log.Print(err) + continue + } + mu.Lock() + accepted++ + if accepted > acceptLimit { + mu.Unlock() + conn.Close() + continue + } + mu.Unlock() + if *rcvbuf > 0 { + if tcp, ok := conn.(*net.TCPConn); ok { + _ = tcp.SetReadBuffer(*rcvbuf) + } + } + if *hold > 0 { + held.Add(1) + go func(c net.Conn) { + defer held.Done() + defer c.Close() + buf := make([]byte, 1) + for { + if *sleep > 0 { + time.Sleep(*sleep) + } + _, err := c.Read(buf) + if err != nil { + return + } + } + }(conn) + } else { + conn.Close() + } + } +} + +// listenBacklog1 creates a TCP listener with backlog 1 (for listen-overflow scenario). +func listenBacklog1(port int) (net.Listener, error) { + fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_REUSEADDR, 1); err != nil { + unix.Close(fd) + return nil, err + } + addr := unix.SockaddrInet4{Port: port} + if err := unix.Bind(fd, &addr); err != nil { + unix.Close(fd) + return nil, err + } + if err := unix.Listen(fd, 1); err != nil { + unix.Close(fd) + return nil, err + } + f := os.NewFile(uintptr(fd), "listener") + // FileListener takes ownership; do not close f here. + return net.FileListener(f) +} diff --git a/collector/bpf/latency.c b/collector/bpf/latency.c new file mode 100644 index 0000000000..109ad71862 --- /dev/null +++ b/collector/bpf/latency.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * eBPF packet latency measurement for node_exporter. + * + * Measures time packets spend in the kernel network stack (XDP to TC). + * Used by the node_exporter ebpf-pmd-jitter collector to expose + * PMD-style latency jitter and kernel stack latency metrics. + * + * Build: make build-bpf (from node_exporter root), or: + * clang -O2 -g -target bpf -c collector/bpf/latency.c -o collector/bpf/latency.o \ + * -I/usr/include + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_TRACKED_PACKETS 65536 +#define LATENCY_BUCKET_COUNT 16 + +#ifndef TC_ACT_OK +#define TC_ACT_OK 0 +#endif + +struct packet_timestamp { + __u64 timestamp_ns; + __u32 ifindex; + __u32 len; +}; + +struct interface_stats { + __u64 packets_total; + __u64 bytes_total; + __u64 latency_ns_total; + __u64 latency_min_ns; + __u64 latency_max_ns; + __u64 xdp_packets; + __u64 tc_ingress_packets; + __u64 tc_egress_packets; + __u64 softirq_time_ns; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, MAX_TRACKED_PACKETS); + __type(key, __u32); + __type(value, struct packet_timestamp); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} packet_timestamps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 256); + __type(key, __u32); + __type(value, struct interface_stats); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} interface_latency_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, LATENCY_BUCKET_COUNT); + __type(key, __u32); + __type(value, __u64); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} latency_histogram SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} global_packets SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} global_latency_ns SEC(".maps"); + +static __always_inline __u32 calculate_packet_hash(void *data, void *data_end) +{ + struct ethhdr *eth = data; + __u32 hash = 0; + + if ((void *)(eth + 1) > data_end) + return 0; + + hash = eth->h_source[0] ^ eth->h_source[5]; + hash ^= eth->h_dest[0] ^ eth->h_dest[5]; + + if (eth->h_proto == bpf_htons(ETH_P_IP)) { + struct iphdr *ip = (void *)(eth + 1); + if ((void *)(ip + 1) > data_end) + return hash; + hash ^= ip->saddr ^ ip->daddr ^ ip->protocol ^ ip->id; + if (ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP) { + __u16 *ports = (void *)ip + (ip->ihl * 4); + if ((void *)(ports + 2) <= data_end) { + hash ^= ports[0] ^ ports[1]; + } + } + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6 = (void *)(eth + 1); + if ((void *)(ip6 + 1) > data_end) + return hash; + hash ^= ip6->saddr.s6_addr32[0] ^ ip6->saddr.s6_addr32[3]; + hash ^= ip6->daddr.s6_addr32[0] ^ ip6->daddr.s6_addr32[3]; + hash ^= ip6->nexthdr; + } + return hash; +} + +static __always_inline __u32 get_latency_bucket(__u64 latency_ns) +{ + __u64 us = latency_ns / 1000; + if (us < 1) return 0; + if (us < 2) return 1; + if (us < 4) return 2; + if (us < 8) return 3; + if (us < 16) return 4; + if (us < 32) return 5; + if (us < 64) return 6; + if (us < 128) return 7; + if (us < 256) return 8; + if (us < 512) return 9; + if (us < 1024) return 10; + if (us < 2048) return 11; + if (us < 4096) return 12; + if (us < 8192) return 13; + if (us < 16384) return 14; + return 15; +} + +static __always_inline void update_histogram(__u64 latency_ns) +{ + __u32 bucket = get_latency_bucket(latency_ns); + __u64 *count = bpf_map_lookup_elem(&latency_histogram, &bucket); + if (count) + __sync_fetch_and_add(count, 1); +} + +static __always_inline void update_interface_stats(__u32 ifindex, __u32 pkt_len, + __u64 latency_ns, int hook_type) +{ + struct interface_stats *stats = bpf_map_lookup_elem(&interface_latency_stats, &ifindex); + if (!stats) { + struct interface_stats new_stats = {}; + new_stats.latency_min_ns = latency_ns > 0 ? latency_ns : ~0ULL; + new_stats.latency_max_ns = latency_ns; + bpf_map_update_elem(&interface_latency_stats, &ifindex, &new_stats, BPF_ANY); + stats = bpf_map_lookup_elem(&interface_latency_stats, &ifindex); + if (!stats) + return; + } + __sync_fetch_and_add(&stats->packets_total, 1); + __sync_fetch_and_add(&stats->bytes_total, pkt_len); + if (latency_ns > 0) { + __sync_fetch_and_add(&stats->latency_ns_total, latency_ns); + if (latency_ns < stats->latency_min_ns || stats->latency_min_ns == 0) + stats->latency_min_ns = latency_ns; + if (latency_ns > stats->latency_max_ns) + stats->latency_max_ns = latency_ns; + } + if (hook_type == 0) + __sync_fetch_and_add(&stats->xdp_packets, 1); + else if (hook_type == 1) + __sync_fetch_and_add(&stats->tc_ingress_packets, 1); + else if (hook_type == 2) + __sync_fetch_and_add(&stats->tc_egress_packets, 1); +} + +static __always_inline void update_global_stats(__u64 latency_ns) +{ + __u32 key = 0; + __u64 *packets = bpf_map_lookup_elem(&global_packets, &key); + if (packets) + __sync_fetch_and_add(packets, 1); + if (latency_ns > 0) { + __u64 *latency = bpf_map_lookup_elem(&global_latency_ns, &key); + if (latency) + __sync_fetch_and_add(latency, latency_ns); + } +} + +SEC("xdp") +int xdp_latency_ingress(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + __u32 pkt_len = (__u32)(data_end - data); + __u64 now = bpf_ktime_get_ns(); + __u32 hash = calculate_packet_hash(data, data_end); + if (hash == 0) + goto out; + struct packet_timestamp ts = { + .timestamp_ns = now, + .ifindex = ctx->ingress_ifindex, + .len = pkt_len, + }; + bpf_map_update_elem(&packet_timestamps, &hash, &ts, BPF_ANY); + update_interface_stats(ctx->ingress_ifindex, pkt_len, 0, 0); +out: + return XDP_PASS; +} + +SEC("tc") +int tc_latency_ingress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + __u64 now = bpf_ktime_get_ns(); + __u64 latency_ns = 0; + __u32 hash = calculate_packet_hash(data, data_end); + if (hash != 0) { + struct packet_timestamp *ts = bpf_map_lookup_elem(&packet_timestamps, &hash); + if (ts && ts->timestamp_ns > 0) { + latency_ns = now - ts->timestamp_ns; + update_histogram(latency_ns); + update_global_stats(latency_ns); + } + } + update_interface_stats(skb->ifindex, skb->len, latency_ns, 1); + return TC_ACT_OK; +} + +SEC("tc") +int tc_latency_egress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + __u64 now = bpf_ktime_get_ns(); + __u64 latency_ns = 0; + __u32 hash = calculate_packet_hash(data, data_end); + if (hash != 0) { + struct packet_timestamp *ts = bpf_map_lookup_elem(&packet_timestamps, &hash); + if (ts && ts->timestamp_ns > 0) { + latency_ns = now - ts->timestamp_ns; + update_histogram(latency_ns); + update_global_stats(latency_ns); + bpf_map_delete_elem(&packet_timestamps, &hash); + } + } + update_interface_stats(skb->ifindex, skb->len, latency_ns, 2); + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/collector/ebpf_pmd_jitter_linux.go b/collector/ebpf_pmd_jitter_linux.go new file mode 100644 index 0000000000..305baa3894 --- /dev/null +++ b/collector/ebpf_pmd_jitter_linux.go @@ -0,0 +1,370 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux && !noebpfpmdjitter + +// Package collector provides the ebpf-pmd-jitter collector, which loads an +// in-tree eBPF program (built from collector/bpf/latency.c) to measure kernel +// stack packet latency and exposes PMD-style jitter and latency metrics. +// Requires: --collector.ebpf-pmd-jitter.object-path pointing at the compiled +// latency.o and optionally --collector.ebpf-pmd-jitter.interfaces to attach. +package collector + +import ( + "bytes" + "fmt" + "log/slog" + "net" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/alecthomas/kingpin/v2" + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/link" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + ebpfPmdJitterSubsystem = "ebpf_pmd_jitter" + latencyHistogramBuckets = 16 +) + +var ( + ebpfPmdJitterObjectPath = kingpin.Flag("collector.ebpf-pmd-jitter.object-path", + "Path to compiled eBPF object (latency.o) built from collector/bpf/latency.c. If empty, collector reports no data.").Default("").String() + ebpfPmdJitterInterfaces = kingpin.Flag("collector.ebpf-pmd-jitter.interfaces", + "Comma-separated list of interfaces to attach XDP latency measurement to (e.g. lo,eth0).").Default("lo").String() +) + +type interfaceStats struct { + PacketsTotal uint64 + BytesTotal uint64 + LatencyNsTotal uint64 + LatencyMinNs uint64 + LatencyMaxNs uint64 + XDPPackets uint64 + TCIngressPackets uint64 + TCEgressPackets uint64 + SoftirqTimeNs uint64 +} + +var latencyHistogramBucketLabels = []string{ + "0-1us", "1-2us", "2-4us", "4-8us", "8-16us", "16-32us", "32-64us", + "64-128us", "128-256us", "256-512us", "512-1024us", "1-2ms", + "2-4ms", "4-8ms", "8-16ms", "16ms+", +} + +const maxLoadErrorLabelLen = 200 + +func sanitizeForLabel(s string) string { + s = strings.ReplaceAll(s, "\n", " ") + if len(s) > maxLoadErrorLabelLen { + s = s[:maxLoadErrorLabelLen-3] + "..." + } + return s +} + +type ebpfPmdJitterCollector struct { + objectPath string + interfaces []string + logger *slog.Logger + mu sync.Mutex + lastLoadError string + // Set on first successful load + coll *ebpf.Collection + xdpLinks map[string]link.Link + ifindexToName_ map[int]string + // Descriptors + packetsTotalDesc *prometheus.Desc + bytesTotalDesc *prometheus.Desc + latencyMinNsDesc *prometheus.Desc + latencyMaxNsDesc *prometheus.Desc + latencyAvgNsDesc *prometheus.Desc + jitterNsDesc *prometheus.Desc + histogramDesc *prometheus.Desc + globalPacketsDesc *prometheus.Desc + globalLatencyDesc *prometheus.Desc + collectorUpDesc *prometheus.Desc + objectPathSetDesc *prometheus.Desc + loadErrorDesc *prometheus.Desc +} + +func init() { + registerCollector("ebpf-pmd-jitter", defaultDisabled, NewEbpfPmdJitterCollector) +} + +// NewEbpfPmdJitterCollector returns a collector that loads the eBPF latency +// program from the given object path and exposes PMD jitter and latency metrics. +func NewEbpfPmdJitterCollector(logger *slog.Logger) (Collector, error) { + ifaces := strings.Split(*ebpfPmdJitterInterfaces, ",") + for i := range ifaces { + ifaces[i] = strings.TrimSpace(ifaces[i]) + } + return &ebpfPmdJitterCollector{ + objectPath: *ebpfPmdJitterObjectPath, + interfaces: ifaces, + logger: logger, + xdpLinks: make(map[string]link.Link), + ifindexToName_: make(map[int]string), + packetsTotalDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_packets_total"), + "Total packets measured for latency (eBPF kernel stack latency).", + []string{"interface"}, nil, + ), + bytesTotalDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_bytes_total"), + "Total bytes processed (eBPF latency measurement).", + []string{"interface"}, nil, + ), + latencyMinNsDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_min_ns"), + "Minimum observed packet latency in nanoseconds (kernel stack).", + []string{"interface"}, nil, + ), + latencyMaxNsDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_max_ns"), + "Maximum observed packet latency in nanoseconds (kernel stack).", + []string{"interface"}, nil, + ), + latencyAvgNsDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_avg_ns"), + "Average packet latency in nanoseconds (kernel stack).", + []string{"interface"}, nil, + ), + jitterNsDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "pmd_jitter_ns"), + "PMD-style latency jitter in nanoseconds (latency_max_ns - latency_min_ns).", + []string{"interface"}, nil, + ), + histogramDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "latency_histogram"), + "Histogram of packet latencies (kernel stack) by bucket.", + []string{"bucket"}, nil, + ), + globalPacketsDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "global_packets_total"), + "Total packets measured globally by eBPF latency program.", + nil, nil, + ), + globalLatencyDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "global_latency_ns_total"), + "Total latency in nanoseconds (global).", + nil, nil, + ), + collectorUpDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "collector_up"), + "Whether the eBPF PMD jitter collector loaded and attached successfully (1 = yes, 0 = no).", + nil, nil, + ), + objectPathSetDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "object_path_configured"), + "Whether --collector.ebpf-pmd-jitter.object-path was set to a non-empty value (1 = yes, 0 = no). Use with collector_up: 0+0 = path not set; 0+1 = path set but load/attach failed.", + nil, nil, + ), + loadErrorDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, ebpfPmdJitterSubsystem, "load_error"), + "Set to 1 when the eBPF program failed to load or attach; the 'error' label contains the reason. Absent when collector_up is 1.", + []string{"error"}, nil, + ), + }, nil +} + +func (c *ebpfPmdJitterCollector) Update(ch chan<- prometheus.Metric) error { + objectPathSet := 0.0 + if c.objectPath != "" { + objectPathSet = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.objectPathSetDesc, prometheus.GaugeValue, objectPathSet) + + if c.objectPath == "" { + ch <- prometheus.MustNewConstMetric(c.collectorUpDesc, prometheus.GaugeValue, 0) + return ErrNoData + } + + c.mu.Lock() + if c.coll == nil { + if err := c.loadAndAttach(); err != nil { + c.mu.Unlock() + c.lastLoadError = err.Error() + c.logger.Info("ebpf-pmd-jitter: load/attach failed", "err", err) + ch <- prometheus.MustNewConstMetric(c.collectorUpDesc, prometheus.GaugeValue, 0) + ch <- prometheus.MustNewConstMetric(c.loadErrorDesc, prometheus.GaugeValue, 1, sanitizeForLabel(err.Error())) + return ErrNoData + } + } + coll := c.coll + c.mu.Unlock() + + ch <- prometheus.MustNewConstMetric(c.collectorUpDesc, prometheus.GaugeValue, 1) + + // Global stats + globalPackets := c.sumPerCPUArray(coll.Maps["global_packets"], 0) + globalLatency := c.sumPerCPUArray(coll.Maps["global_latency_ns"], 0) + ch <- prometheus.MustNewConstMetric(c.globalPacketsDesc, prometheus.GaugeValue, float64(globalPackets)) + ch <- prometheus.MustNewConstMetric(c.globalLatencyDesc, prometheus.GaugeValue, float64(globalLatency)) + + // Histogram + histMap := coll.Maps["latency_histogram"] + if histMap != nil { + for i := 0; i < latencyHistogramBuckets; i++ { + count := c.sumPerCPUArray(histMap, uint32(i)) + ch <- prometheus.MustNewConstMetric(c.histogramDesc, prometheus.GaugeValue, float64(count), latencyHistogramBucketLabels[i]) + } + } + + // Per-interface stats and jitter + ifStatsMap := coll.Maps["interface_latency_stats"] + if ifStatsMap != nil { + c.collectInterfaceStats(ifStatsMap, ch) + } + + return nil +} + +func (c *ebpfPmdJitterCollector) loadAndAttach() error { + objPath := filepath.Clean(c.objectPath) + data, err := os.ReadFile(objPath) + if err != nil { + return fmt.Errorf("read object file: %w", err) + } + + spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("load collection spec: %w", err) + } + + // Map pinning must be on a BPF filesystem (e.g. /sys/fs/bpf), not /tmp. + const bpfPinBase = "/sys/fs/bpf" + pinDir := filepath.Join(bpfPinBase, fmt.Sprintf("node_exporter_%d", os.Getpid())) + if err := os.MkdirAll(pinDir, 0700); err != nil { + return fmt.Errorf("create bpf pin dir %s (is %s mounted?): %w", pinDir, bpfPinBase, err) + } + // Do not remove pinDir while the collection is in use; kernel holds map refs via pins. + + opts := ebpf.CollectionOptions{ + Maps: ebpf.MapOptions{PinPath: pinDir}, + } + + coll, err := ebpf.NewCollectionWithOptions(spec, opts) + if err != nil { + return fmt.Errorf("new collection: %w", err) + } + + xdpProg := coll.Programs["xdp_latency_ingress"] + if xdpProg == nil { + coll.Close() + return fmt.Errorf("program xdp_latency_ingress not found in object") + } + + for _, ifname := range c.interfaces { + if ifname == "" { + continue + } + iface, err := net.InterfaceByName(ifname) + if err != nil { + c.logger.Debug("ebpf-pmd-jitter: interface not found", "interface", ifname, "err", err) + continue + } + c.ifindexToName_[iface.Index] = ifname + l, err := link.AttachXDP(link.XDPOptions{ + Program: xdpProg, + Interface: iface.Index, + Flags: link.XDPGenericMode, + }) + if err != nil { + c.logger.Debug("ebpf-pmd-jitter: attach XDP failed", "interface", ifname, "err", err) + continue + } + c.xdpLinks[ifname] = l + } + + c.coll = coll + return nil +} + +func (c *ebpfPmdJitterCollector) collectInterfaceStats(ifStatsMap *ebpf.Map, ch chan<- prometheus.Metric) { + iter := ifStatsMap.Iterate() + var key uint32 + for iter.Next(&key, nil) { + var values []interfaceStats + if err := ifStatsMap.Lookup(&key, &values); err != nil { + continue + } + ifname := c.ifindexToName_[int(key)] + if ifname == "" { + ifname = c.resolveInterfaceName(int(key)) + } + var total interfaceStats + for _, v := range values { + total.PacketsTotal += v.PacketsTotal + total.BytesTotal += v.BytesTotal + total.LatencyNsTotal += v.LatencyNsTotal + total.XDPPackets += v.XDPPackets + total.TCIngressPackets += v.TCIngressPackets + total.TCEgressPackets += v.TCEgressPackets + if v.LatencyMinNs > 0 && (total.LatencyMinNs == 0 || v.LatencyMinNs < total.LatencyMinNs) { + total.LatencyMinNs = v.LatencyMinNs + } + if v.LatencyMaxNs > total.LatencyMaxNs { + total.LatencyMaxNs = v.LatencyMaxNs + } + } + + ch <- prometheus.MustNewConstMetric(c.packetsTotalDesc, prometheus.GaugeValue, float64(total.PacketsTotal), ifname) + ch <- prometheus.MustNewConstMetric(c.bytesTotalDesc, prometheus.GaugeValue, float64(total.BytesTotal), ifname) + ch <- prometheus.MustNewConstMetric(c.latencyMinNsDesc, prometheus.GaugeValue, float64(total.LatencyMinNs), ifname) + ch <- prometheus.MustNewConstMetric(c.latencyMaxNsDesc, prometheus.GaugeValue, float64(total.LatencyMaxNs), ifname) + + jitter := float64(0) + if total.LatencyMaxNs >= total.LatencyMinNs { + jitter = float64(total.LatencyMaxNs - total.LatencyMinNs) + } + ch <- prometheus.MustNewConstMetric(c.jitterNsDesc, prometheus.GaugeValue, jitter, ifname) + + if total.PacketsTotal > 0 { + avg := float64(total.LatencyNsTotal) / float64(total.PacketsTotal) + ch <- prometheus.MustNewConstMetric(c.latencyAvgNsDesc, prometheus.GaugeValue, avg, ifname) + } + } +} + +func (c *ebpfPmdJitterCollector) resolveInterfaceName(ifindex int) string { + if ifindex <= 0 { + return fmt.Sprintf("if%d", ifindex) + } + if iface, err := net.InterfaceByIndex(ifindex); err == nil { + return iface.Name + } + return fmt.Sprintf("if%d", ifindex) +} + +func (c *ebpfPmdJitterCollector) sumPerCPUArray(m *ebpf.Map, key uint32) uint64 { + if m == nil { + return 0 + } + var values []uint64 + if err := m.Lookup(&key, &values); err != nil { + var single uint64 + if err := m.Lookup(&key, &single); err != nil { + return 0 + } + return single + } + var total uint64 + for _, v := range values { + total += v + } + return total +} diff --git a/collector/nodeconfig_linux.go b/collector/nodeconfig_linux.go new file mode 100644 index 0000000000..46c6a343b4 --- /dev/null +++ b/collector/nodeconfig_linux.go @@ -0,0 +1,215 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux && !nonodeconfig + +package collector + +import ( + "errors" + "fmt" + "log/slog" + "os" + "path/filepath" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs/sysfs" +) + +const ( + nodeconfigSubsystem = "nodeconfig" + // PCI base class 0x02 = Network controller (see PCI SIG class codes). + pciClassNetwork = 0x02 + // DMI/SMBIOS structure types (see DMTF DSP0134). + dmiType16PhysicalMemoryArray = 16 + dmiType17MemoryDevice = 17 + // Type 16 byte offset: Number of Memory Devices in this array. + dmiType16NumDevicesOffset = 13 + // Type 17 byte offsets: Size (WORD, MB). 0 = no device, 0x7FFF = unknown. + dmiType17SizeOffsetLo = 12 + dmiType17SizeOffsetHi = 13 + dmiSizeNotPopulated = 0x7FFF +) + +type nodeconfigCollector struct { + fs sysfs.FS + logger *slog.Logger + pcieNICMinLinkWidthDesc *prometheus.Desc + pcieSlotOkDesc *prometheus.Desc + coresDedicatedDesc *prometheus.Desc + memoryBanksFullDesc *prometheus.Desc +} + +func init() { + registerCollector("nodeconfig", defaultDisabled, NewNodeconfigCollector) +} + +// NewNodeconfigCollector returns a new Collector exposing node-level configuration +// facts useful for runbooks (e.g. DPDK troubleshooting: PCIe slot, memory banks, CPU isolation). +func NewNodeconfigCollector(logger *slog.Logger) (Collector, error) { + fs, err := sysfs.NewFS(*sysPath) + if err != nil { + return nil, fmt.Errorf("failed to open sysfs: %w", err) + } + + return &nodeconfigCollector{ + fs: fs, + logger: logger, + pcieNICMinLinkWidthDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeconfigSubsystem, "pcie_nic_min_link_width"), + "Minimum current PCIe link width (lanes) among PCI network controllers. Use in runbooks to infer PCIe slot correctness (e.g. expect >= 16 for x16 slots). -1 if no network PCIe devices or width unknown.", + nil, nil, + ), + pcieSlotOkDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeconfigSubsystem, "pcie_slot_ok"), + "Whether PCIe slot/width is considered correct (1) or not (0). Derived from PCIe: 1 when minimum NIC link width >= 16, 0 otherwise. Absent if no network PCIe devices.", + nil, nil, + ), + coresDedicatedDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeconfigSubsystem, "cores_dedicated"), + "Whether CPU cores are dedicated/isolated for workload (e.g. DPDK). 1 if at least one CPU is in /sys/devices/system/cpu/isolated, 0 otherwise.", + nil, nil, + ), + memoryBanksFullDesc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeconfigSubsystem, "memory_banks_full"), + "Whether memory channels/banks are fully populated (1) or not (0). Derived from DMI/SMBIOS: 1 when all memory device slots have a populated DIMM, 0 otherwise. Absent if DMI not available.", + nil, nil, + ), + }, nil +} + +func (c *nodeconfigCollector) Update(ch chan<- prometheus.Metric) error { + // PCIe: min link width among network-class PCI devices; pcie_slot_ok derived from it + minWidth := c.pcieNICMinLinkWidth() + if minWidth >= 0 { + ch <- prometheus.MustNewConstMetric(c.pcieNICMinLinkWidthDesc, prometheus.GaugeValue, minWidth) + pcieOk := 0.0 + if minWidth >= 16 { + pcieOk = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.pcieSlotOkDesc, prometheus.GaugeValue, pcieOk) + } + + // Cores dedicated: from sysfs isolated CPUs + dedicated := 0.0 + isolated, err := c.fs.IsolatedCPUs() + if err != nil && !errors.Is(err, os.ErrNotExist) { + c.logger.Debug("nodeconfig: could not read isolated CPUs", "err", err) + } else if len(isolated) > 0 { + dedicated = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.coresDedicatedDesc, prometheus.GaugeValue, dedicated) + + // Memory banks full: from DMI/SMBIOS + if full, ok := c.memoryBanksFullFromDMI(); ok { + ch <- prometheus.MustNewConstMetric(c.memoryBanksFullDesc, prometheus.GaugeValue, full) + } + + return nil +} + +// pcieNICMinLinkWidth returns the minimum current link width among PCI network controllers, +// or -1 if none or unknown. +func (c *nodeconfigCollector) pcieNICMinLinkWidth() float64 { + devices, err := c.fs.PciDevices() + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return -1 + } + c.logger.Debug("nodeconfig: failed to list PCI devices", "err", err) + return -1 + } + + var minWidth float64 = -1 + for _, device := range devices { + baseClass := uint8((device.Class >> 16) & 0xff) + if baseClass != pciClassNetwork { + continue + } + if device.CurrentLinkWidth == nil { + continue + } + w := *device.CurrentLinkWidth + if w < 0 { + continue + } + if minWidth < 0 || w < minWidth { + minWidth = w + } + } + return minWidth +} + +// memoryBanksFullFromDMI reads DMI/SMBIOS from /sys/firmware/dmi/entries/ and returns +// (1.0, true) if all memory device slots are populated, (0.0, true) if not, (_, false) if unknown. +func (c *nodeconfigCollector) memoryBanksFullFromDMI() (float64, bool) { + entriesPath := filepath.Join(*sysPath, "firmware", "dmi", "entries") + entries, err := os.ReadDir(entriesPath) + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + c.logger.Debug("nodeconfig: could not read DMI entries", "path", entriesPath, "err", err) + } + return 0, false + } + + var totalSlots int // from Type 16 Number of Memory Devices + var totalType17 int // count of Type 17 entries (one per slot) + var populatedCount int + + for _, e := range entries { + if !e.IsDir() { + continue + } + base := e.Name() + typePath := filepath.Join(entriesPath, base, "type") + dataPath := filepath.Join(entriesPath, base, "data") + typeBuf, err := os.ReadFile(typePath) + if err != nil { + continue + } + var dmiType int + if _, err := fmt.Sscanf(string(typeBuf), "%d", &dmiType); err != nil { + continue + } + data, err := os.ReadFile(dataPath) + if err != nil { + continue + } + switch dmiType { + case dmiType16PhysicalMemoryArray: + if len(data) > dmiType16NumDevicesOffset { + totalSlots += int(data[dmiType16NumDevicesOffset]) + } + case dmiType17MemoryDevice: + totalType17++ + if len(data) > dmiType17SizeOffsetHi { + size := uint16(data[dmiType17SizeOffsetLo]) | uint16(data[dmiType17SizeOffsetHi])<<8 + if size > 0 && size != dmiSizeNotPopulated { + populatedCount++ + } + } + } + } + + // Use Type 16 total slots if present; else use Type 17 count as total (one entry per slot). + if totalSlots == 0 { + totalSlots = totalType17 + } + if totalSlots == 0 { + return 0, false + } + if populatedCount >= totalSlots { + return 1.0, true + } + return 0.0, true +} diff --git a/docs/KERNEL_STACK_AF_PACKET_METRICS.md b/docs/KERNEL_STACK_AF_PACKET_METRICS.md new file mode 100644 index 0000000000..0a9d0ad972 --- /dev/null +++ b/docs/KERNEL_STACK_AF_PACKET_METRICS.md @@ -0,0 +1,544 @@ +# Correlating Exported Metrics: Kernel Stack Cost per Packet and AF_PACKET + +**Diagram:** For a visual overview of the new features and how they interact with the Linux system, kernel, eBPF program, interfaces, DPDK, and hugepages, open [node-exporter-new-features.excalidraw.json](node-exporter-new-features.excalidraw.json) in [Excalidraw](https://excalidraw.com) (or the VS Code Excalidraw extension). + +This document ties together the node_exporter metrics that help you **detect and quantify** high cost per packet in the Linux kernel network stack, and to **correlate** that cost with AF_PACKET (and similar) usage. It explains how the exported metrics relate to each other and how to conclude that AF_PACKET (or the kernel packet path) is increasing cost per packet. + +--- + +## 1. Why “cost per packet” and AF_PACKET matter + +### 1.1 Cost per packet in the kernel stack + +Every packet that goes through the kernel network stack consumes CPU and time: + +- **Interrupt / NAPI** → **netif_receive_skb** → protocol handling → **delivery to sockets** (e.g. AF_PACKET, AF_INET) → **kfree_skb** / **consume_skb**. + +The **time from driver (or XDP) entry until the packet leaves the stack** is the “kernel stack latency” for that packet. The **CPU time spent per packet** (e.g. in softirq, or in kernel system time) is the “cost per packet.” When that cost or latency goes up, throughput drops and latency increases. + +### 1.2 How AF_PACKET increases cost per packet + +**AF_PACKET** (and similar mechanisms that receive a copy of every packet in the kernel) increase cost per packet because: + +1. **Extra work per packet**: For each packet, the kernel may fan out copies to one or more AF_PACKET sockets (`packet_rcv()` and related paths), then run `consume_skb()` (or equivalent) for each copy. +2. **More CPU in softirq**: That work usually runs in **softirq** (e.g. **NET_RX**). So when many AF_PACKET sockets are open (e.g. `tcpdump`, capture tools), you see: + - **Higher softirq time** on the CPUs handling that traffic. + - **Longer time per packet in the stack** (higher “kernel stack latency”). + - **Higher jitter** (variance in that latency). +3. **Impact on reserved/isolated CPUs**: If the same CPUs are used for both AF_PACKET and for DPDK or other latency-sensitive workloads, those CPUs can hit 90–100% softirq and you get throughput drops (e.g. 30–70%), connection issues, and higher latency. + +So: **AF_PACKET (and similar kernel copy-to-userspace paths) are one major way the kernel stack’s cost per packet is increased.** The metrics below let you observe that increase and correlate it with softirq, CPU isolation, and throughput. + +--- + +## 2. What the eBPF “kernel stack latency” actually measures + +The **ebpf-pmd-jitter** collector uses an in-tree eBPF program (`collector/bpf/latency.c`) that measures **time spent in the kernel network stack** for each packet: + +| Stage | Hook | Meaning | +|--------|------|--------| +| **Start** | **XDP** (`xdp_latency_ingress`) | Packet timestamp when it first enters the stack (driver / XDP). | +| **End** | **TC ingress** or **TC egress** | Packet timestamp when it reaches TC (after protocol and socket handling). | + +So: + +- **Latency** = `(time at TC) - (time at XDP)` = time the packet spent in the kernel path between XDP and TC (NAPI, netif_receive_skb, protocol, **AF_PACKET/socket delivery**, etc.). +- **Jitter** = variance of that latency (e.g. max − min over a window). + +When AF_PACKET (or any extra per-packet work) increases, this **kernel stack latency** and **jitter** tend to increase. So these metrics are a **direct indicator of “cost per packet” in the stack**, and help you conclude that the kernel path (including AF_PACKET) is one way that cost is increased. + +--- + +## 3. Metric catalog: what is exported and how it correlates + +All metric names below are as exported by node_exporter when the corresponding collector is enabled. **Coverage at `localhost:9192/metrics`:** metrics appear only if the collector that produces them is enabled and (where applicable) the kernel or filesystem provides the data (e.g. conntrack metrics only if the `nf_conntrack` module is loaded). Default-enabled collectors (cpu, meminfo, netdev, netstat, sockstat, conntrack when loaded, pressure, etc.) are always present on a typical run; **nodeconfig**, **ebpf-pmd-jitter**, **zoneinfo**, **meminfo_numa**, and **pcidevice** are disabled by default—enable them as in §6 to see those metrics at `:9192/metrics`. Everything in this document works as expected when the listed collectors are enabled and the eBPF object/BPF fs are set up for ebpf-pmd-jitter. + +### 3.1 Kernel stack cost and jitter (eBPF) + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_ebpf_pmd_jitter_latency_min_ns` | Min time (ns) packet in stack (XDP→TC) per interface. | Baseline; rises when stack does more work per packet (e.g. AF_PACKET). | +| `node_ebpf_pmd_jitter_latency_max_ns` | Max time (ns) packet in stack per interface. | Spikes when some packets pay extra cost (e.g. copy to many sockets). | +| `node_ebpf_pmd_jitter_latency_avg_ns` | Average time (ns) in stack per interface. | Direct measure of average cost per packet in the kernel path. | +| `node_ebpf_pmd_jitter_pmd_jitter_ns` | Jitter (max − min) per interface. | High jitter often goes with AF_PACKET/capture (variable extra work per packet). | +| `node_ebpf_pmd_jitter_latency_histogram` | Distribution of latency by bucket (0–1µs … 16ms+). | Shift to higher buckets = more packets with high stack cost. | +| `node_ebpf_pmd_jitter_global_packets_total` | Total packets measured globally. | Volume; combine with latency to reason about total cost. | +| `node_ebpf_pmd_jitter_global_latency_ns_total` | Total latency (ns) globally. | Total “cost” in time; divide by packets for average cost per packet. | +| `node_ebpf_pmd_jitter_collector_up` | 1 if eBPF loaded/attached. | Must be 1 for above metrics to be meaningful. | +| `node_ebpf_pmd_jitter_object_path_configured` | 1 if object-path set. | With `collector_up=0`, distinguishes “not configured” vs “load failed”. | +| `node_ebpf_pmd_jitter_load_error` | 1 on load/attach failure; `error` label has reason. | Debug why kernel stack metrics are missing. | + +**Collector:** `ebpf-pmd-jitter` (optional). Requires eBPF object and `/sys/fs/bpf` mounted. + +### 3.2 Node configuration and isolation (runbook context) + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_nodeconfig_cores_dedicated` | 1 if any CPU in `/sys/.../cpu/isolated`. | Indicates reserved/isolated CPUs; high softirq here + AF_PACKET is a common failure mode. | +| `node_nodeconfig_pcie_nic_min_link_width` | Min PCIe link width among NICs. | Runbook context (e.g. slot correctness). | +| `node_nodeconfig_pcie_slot_ok` | 1 if min NIC width ≥ 16. | Runbook context. | +| `node_nodeconfig_memory_banks_full` | 1 if DMI says all memory slots populated. | Runbook context. | + +**Collector:** `nodeconfig` (optional). + +### 3.3 CPU and softirq (where the cost shows up) + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_cpu_seconds_total{cpu="...", mode="softirq"}` | CPU time in softirq per CPU. | **Primary signal**: AF_PACKET work runs in softirq; high rate = high cost. | +| `node_cpu_seconds_total{mode="system"}` | Kernel (system) time. | Kernel stack and AF_PACKET also show up here. | +| `node_cpu_isolated{cpu="..."}` | 1 if that CPU is isolated. | Isolated CPUs often used for DPDK/capture; combine with softirq to find overload. | + +**Collector:** `cpu` (default). + +### 3.4 Softirq breakdown (NET_RX and others) + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_softirqs_functions_total{cpu="...", type="NET_RX"}` | NET_RX softirq count per CPU. | **NET_RX** dominates when copying to many AF_PACKET sockets; compare before/during capture. | + +**Collector:** `softirqs` (optional). + +### 3.5 Network throughput and drops + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_network_receive_bytes_total`, `node_network_transmit_bytes_total` | Bytes per interface. | Throughput; drop in rate can correlate with high stack cost / AF_PACKET. | +| `node_network_receive_drop_total`, `node_network_receive_errs_total` | Drops/errors per interface. | Can increase when CPUs are overloaded by softirq/AF_PACKET. | + +**Collector:** `netdev` (default). + +### 3.6 Hugepages (runbook / DPDK context) + +Hugepages are **not** exposed by the new nodeconfig or ebpf-pmd-jitter collectors. They come from existing **meminfo** (and optionally **meminfo_numa**, **zoneinfo**) collectors, which read `/proc/meminfo` (and NUMA/zone stats). + +| Metric | Description | Correlation to “cost per packet” / AF_PACKET | +|--------|-------------|----------------------------------------------| +| `node_memory_HugePages_Total` | Total number of huge pages configured. | Runbook: DPDK and high-throughput packet paths often use hugepages; non-zero indicates hugepage pool is configured. | +| `node_memory_HugePages_Free` | Free huge pages. | Runbook: low free count can indicate DPDK/other apps consuming the pool. | +| `node_memory_HugePages_Rsvd`, `node_memory_HugePages_Surp` | Reserved and surplus huge pages. | Runbook context for capacity. | +| `node_memory_Hugepagesize_bytes` | Size of one huge page (e.g. 2 MiB). | Runbook context. | +| `node_memory_AnonHugePages_bytes` | Anonymous memory backed by transparent huge pages. | General memory use; not specific to DPDK. | +| `node_memory_numa_HugePages_*` (with label `node`) | Per-NUMA-node huge page counts (meminfo_numa). | Runbook: when correlating with isolated CPUs, NUMA locality and hugepage usage matter for DPDK/PMD. | +| `node_zoneinfo_nr_anon_transparent_hugepages` | Transparent huge pages (zoneinfo). | General THP usage. | + +**How hugepages factor in:** They do **not** measure kernel stack cost or AF_PACKET directly. They provide **runbook and environment context** on nodes where you are correlating AF_PACKET vs DPDK: + +- On **DPDK nodes**, PMD and packet buffers often use **hugepages**. If you see high softirq on isolated CPUs (possible AF_PACKET/capture) and you expect DPDK to be using hugepages, checking `node_memory_HugePages_Total` / `_Free` confirms hugepage configuration and usage; together with `node_nodeconfig_cores_dedicated` and `node_cpu_isolated`, you have a fuller picture (isolated CPUs, PCIe slot, memory banks, hugepages) for the same runbook. +- So: use hugepage metrics as **context** alongside nodeconfig (cores_dedicated, pcie_slot_ok, memory_banks_full) when reasoning about DPDK vs kernel path (AF_PACKET) and cost per packet. + +**Collectors:** `meminfo` (default), `meminfo_numa` (optional), `zoneinfo` (default). + +### 3.7 Conntrack (nf_conntrack) — connection tracking table + +When the kernel connection tracking table is full, **new flows cannot be established** and packets are dropped. This is a common cause of “random” packet loss and connection failures under load. + +| Metric | Description | Correlation | +|--------|-------------|-------------| +| `node_nf_conntrack_entries` | Current number of allocated conntrack flow entries. | Nearing limit → table filling; correlate with drops. | +| `node_nf_conntrack_entries_limit` | Maximum size of the conntrack table (`nf_conntrack_max`). | When `entries` ≈ `entries_limit`, new entries fail. | +| `node_nf_conntrack_stat_found` | Number of successful conntrack lookups. | Normal operation. | +| `node_nf_conntrack_stat_invalid` | Packets that could not be tracked. | Can increase with bad packets or table pressure. | +| `node_nf_conntrack_stat_ignore` | Packets already connected (existing entry). | Normal. | +| `node_nf_conntrack_stat_insert` | New entries inserted. | Insert rate vs limit drives fullness. | +| `node_nf_conntrack_stat_insert_failed` | Insert attempts that failed (e.g. duplicate). | Correlate with table pressure. | +| `node_nf_conntrack_stat_drop` | **Packets dropped due to conntrack failure** (allocation or helper). | **Primary drop signal** when conntrack is the cause. | +| `node_nf_conntrack_stat_early_drop` | **Entries dropped to make room** (table was full). | **Direct signal** that the table hit max; packets were dropped. | +| `node_nf_conntrack_stat_search_restart` | Lookups restarted due to hashtable resize. | High rate can indicate churn. | + +**Collector:** `conntrack` (default when conntrack module loaded). Source: `/proc/sys/net/netfilter/nf_conntrack_count`, `nf_conntrack_max`, `/proc/net/nf_conntrack` stats. + +### 3.8 Netstat — TCP/UDP and buffer-related counters + +From `/proc/net/netstat` and `/proc/net/snmp`. The default netstat collector exposes a subset of fields (see `--collector.netstat.fields`). Key metrics for TCP buffers, listen queue, and drops: + +| Metric | Description | Correlation | +|--------|-------------|-------------| +| `node_netstat_Tcp_CurrEstab` | Currently established TCP connections. | Connection count; high with many flows. | +| `node_netstat_Tcp_ActiveOpens`, `node_netstat_Tcp_PassiveOpens` | Open attempts. | Connection churn. | +| `node_netstat_Tcp_InSegs`, `node_netstat_Tcp_OutSegs` | Segments in/out. | Traffic volume. | +| `node_netstat_Tcp_RetransSegs` | Retransmitted segments. | Loss or latency; can rise with buffer pressure. | +| `node_netstat_TcpExt_TCPRcvQDrop` | **Segments dropped because receive queue was full**. | **TCP receive buffer full** → application not reading fast enough or buffer too small. | +| `node_netstat_TcpExt_ListenOverflows` | **Times the listen queue overflowed** (SYN queue full). | **Listen backlog full** → new connections dropped. | +| `node_netstat_TcpExt_ListenDrops` | **Times a SYN was dropped** (e.g. backlog full). | **Direct listen-queue drop** signal. | +| `node_netstat_TcpExt_TCPTimeouts` | TCP timeouts. | Can increase with loss or congestion. | +| `node_netstat_TcpExt_TCPOFOQueue` | Out-of-order queue length. | Reordering / burst. | +| `node_netstat_Udp_RcvbufErrors` | **UDP receive buffer errors** (drops). | **UDP socket receive buffer full**. | +| `node_netstat_Udp_SndbufErrors` | **UDP send buffer errors** (drops). | **UDP socket send buffer full**. | +| `node_netstat_Udp6_RcvbufErrors`, `node_netstat_Udp6_SndbufErrors` | Same for IPv6. | Same interpretation. | + +**Collector:** `netstat` (default). Source: `/proc/net/netstat`, `/proc/net/snmp`, `/proc/net/snmp6`. + +### 3.9 Sockstat — socket memory and usage + +Socket layer memory and in-use socket counts. Relevant for buffer and connection scaling. + +| Metric | Description | Correlation | +|--------|-------------|-------------| +| `node_sockstat_sockets_used` | Number of IPv4 sockets in use. | Total socket usage. | +| `node_sockstat_TCP_inuse` | TCP sockets in use. | TCP connection count (similar to CurrEstab but from sock layer). | +| `node_sockstat_TCP_orphan` | Orphaned TCP (no user ref). | Can grow under load. | +| `node_sockstat_TCP_tw` | TIME_WAIT sockets. | High with many short-lived connections. | +| `node_sockstat_TCP_alloc` | TCP sockets allocated. | Allocation count. | +| `node_sockstat_TCP_mem` | **TCP socket memory (pages)**. | **TCP buffer memory**; high = many/big buffers. | +| `node_sockstat_TCP_mem_bytes` | **TCP socket memory in bytes**. | **Direct TCP buffer memory** (mem × page size). | +| `node_sockstat_UDP_inuse`, `node_sockstat_UDP_mem`, `node_sockstat_UDP_mem_bytes` | Same for UDP. | UDP buffer and usage. | + +**Collector:** `sockstat` (default). Source: `/proc/net/sockstat`, `/proc/net/sockstat6`. + +### 3.10 NUMA — locality and cross-node access + +When the kernel or userspace (e.g. OVS, DPDK) allocates memory on a “wrong” NUMA node (e.g. NIC on node 0, process on node 1), **latency and cost per packet increase** due to remote memory access. These metrics help detect NUMA-unfriendly placement. + +| Metric | Description | Correlation | +|--------|-------------|-------------| +| `node_zoneinfo_numa_hit_total{node="...", zone="..."}` | Allocations satisfied from the intended node. | Local allocations. | +| `node_zoneinfo_numa_miss_total{node="...", zone="..."}` | **Allocations satisfied from another node** (remote). | **High miss** → NUMA-unfriendly; higher latency. | +| `node_zoneinfo_numa_foreign_total` | “Intended here, hit elsewhere.” | Another view of remote allocation. | +| `node_zoneinfo_numa_local_total` | Allocations from local node. | Local. | +| `node_zoneinfo_numa_other_total` | Allocations from other node. | **High other** → cross-node access; bad for latency. | +| `node_memory_numa_*{node="..."}` | Per-NUMA memory stats (meminfo_numa). | Memory usage per node; pair with NIC/process placement. | +| `node_pcidevice_numa_node` | NUMA node of PCI device (e.g. NIC). | Which node the NIC is on; compare to process/OVS NUMA. | + +**Collectors:** `zoneinfo` (optional), `meminfo_numa` (optional), `pcidevice` (optional). Sources: `/proc/zoneinfo`, sysfs NUMA meminfo/numastat, PCI sysfs. + +### 3.11 Pressure — memory and I/O stall + +PSI (Pressure Stall Information) indicates when workloads are stalled due to memory or I/O. Can correlate with buffer pressure and cost per packet. + +| Metric | Description | Correlation | +|--------|-------------|-------------| +| `node_pressure_memory_stalled_seconds_total` | Time no process could make progress due to memory. | Memory pressure; can accompany buffer/socket pressure. | +| `node_pressure_memory_waiting_seconds_total` | Time processes waited for memory. | Same. | +| `node_pressure_io_stalled_seconds_total`, `node_pressure_io_waiting_seconds_total` | I/O pressure. | Disk/network stack contention. | +| `node_pressure_cpu_waiting_seconds_total` | CPU pressure. | CPU saturation. | + +**Collector:** `pressure` (default). Source: `/proc/pressure/*`. + +--- + +## 4. How the metrics correlate: decision flow + +Use this flow to **correlate** metrics and **support the conclusion** that AF_PACKET (or the kernel packet path) is increasing cost per packet: + +1. **Kernel stack cost (eBPF)** + - Check `node_ebpf_pmd_jitter_latency_avg_ns`, `node_ebpf_pmd_jitter_pmd_jitter_ns`, and `node_ebpf_pmd_jitter_latency_histogram`. + - **Rising** average latency or jitter, or histogram shifting to higher buckets → **higher cost per packet in the kernel stack**. + +2. **Where the cost appears (CPU)** + - Check `rate(node_cpu_seconds_total{mode="softirq"}[5m])` and, if needed, `node_softirqs_functions_total{type="NET_RX"}`. + - **High softirq share** (e.g. > 0.9) and/or **high NET_RX** on the same CPUs that handle the traffic → cost is in the **kernel receive path** (where AF_PACKET runs). + +3. **Context (isolation and throughput)** + - `node_nodeconfig_cores_dedicated == 1` and `node_cpu_isolated{cpu="X"} == 1`: reserved CPUs in use. + - If those CPUs are the ones with high softirq and high kernel stack latency → typical “reserved CPUs hammered by kernel path” pattern. + - `node_network_*` throughput down and/or drops up **together with** high softirq and high eBPF latency → **consistent with** high cost per packet (e.g. AF_PACKET) degrading performance. + +4. **Conclusion** + - **High kernel stack latency/jitter (eBPF) + high softirq (and optionally NET_RX) + throughput degradation (and optionally drops)** → the **kernel stack** (and mechanisms like **AF_PACKET** that add per-packet work in that path) is **one way** cost per packet is increased. + - You can then correlate with known AF_PACKET use (tcpdump, capture tools) or with tooling (e.g. `/proc/net/packet`, tracing) to confirm AF_PACKET specifically. + +5. **Optional: runbook context (nodeconfig, hugepages)** + - On nodes that may run **DPDK** or other high-throughput packet paths: use `node_nodeconfig_cores_dedicated`, `node_nodeconfig_pcie_slot_ok`, and **hugepage metrics** (`node_memory_HugePages_Total`, `node_memory_HugePages_Free`, etc.) as **context**—they do not measure cost per packet but help confirm isolation, NIC slot, and hugepage configuration when you are deciding between “kernel path (AF_PACKET) overload” vs “DPDK/PMD setup.” + +6. **Other drop and latency causes (see §7)** + - **Conntrack full (Example A):** `node_nf_conntrack_entries` ≈ `node_nf_conntrack_entries_limit` and `node_nf_conntrack_stat_early_drop` or `_stat_drop` increasing → packet drops from full connection tracking table. + - **TCP/UDP buffer full (Example C):** `node_netstat_TcpExt_TCPRcvQDrop`, `ListenOverflows`, or `node_netstat_Udp_*bufErrors` increasing → drops from socket buffers; can be exacerbated by high cost per packet (AF_PACKET). + - **NUMA-unfriendly path (Example B):** high `node_zoneinfo_numa_miss_total` (or numa_other) + high eBPF latency → kernel/OVS path paying remote-memory cost per packet. + +--- + +## 5. PromQL examples + +### 5.1 Average kernel stack latency (when eBPF is used) + +```promql +# Per-interface average (only when collector_up == 1) +node_ebpf_pmd_jitter_latency_avg_ns + and on() node_ebpf_pmd_jitter_collector_up == 1 +``` + +### 5.2 Jitter (max − min) per interface + +```promql +node_ebpf_pmd_jitter_pmd_jitter_ns + and on() node_ebpf_pmd_jitter_collector_up == 1 +``` + +### 5.3 Softirq share of CPU (all CPUs) + +```promql +rate(node_cpu_seconds_total{mode="softirq"}[5m]) + / ignoring(mode) group_left() +rate(node_cpu_seconds_total[5m]) +``` + +### 5.4 High softirq on isolated CPUs (alert pattern) + +```promql +( + rate(node_cpu_seconds_total{mode="softirq"}[5m]) + / on(cpu) group_left() + rate(node_cpu_seconds_total[5m]) +) > 0.9 +and on(cpu) node_cpu_isolated == 1 +``` + +### 5.5 Correlation: high kernel stack latency and high softirq + +```promql +# Example: nodes where both kernel stack avg latency and softirq share are high +( + node_ebpf_pmd_jitter_latency_avg_ns > 1000 + and on() node_ebpf_pmd_jitter_collector_up == 1 +) +and on(instance) ( + rate(node_cpu_seconds_total{mode="softirq"}[5m]) + / ignoring(mode) group_left() + rate(node_cpu_seconds_total[5m]) +) > 0.7 +``` + +(Adjust thresholds and grouping to match your labels and SLOs.) + +### 5.6 Conntrack table near full (Example A) + +```promql +# Usage ratio: entries / limit (alert when > 0.9) +node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.9 +``` + +```promql +# Any conntrack drop indicates table pressure or allocation failure +increase(node_nf_conntrack_stat_drop[5m]) > 0 +or +increase(node_nf_conntrack_stat_early_drop[5m]) > 0 +``` + +### 5.7 TCP receive buffer drops (Example C) + +```promql +# TCP receive queue drops (segments dropped due to full rcvbuf) +increase(node_netstat_TcpExt_TCPRcvQDrop[5m]) > 0 +``` + +```promql +# Listen queue overflows (SYN backlog full) +increase(node_netstat_TcpExt_ListenOverflows[5m]) > 0 +``` + +### 5.8 NUMA miss ratio (Example B) + +```promql +# Per-node NUMA miss rate (high = allocations served from wrong node) +rate(node_zoneinfo_numa_miss_total[5m]) + / (rate(node_zoneinfo_numa_hit_total[5m]) + rate(node_zoneinfo_numa_miss_total[5m])) +``` + +--- + +## 6. Collectors to enable + +| Collector | Default | Enable with | Purpose | +|-----------|---------|-------------|---------| +| conntrack | on | (default, if loaded) | **Example A**: `node_nf_conntrack_entries`, `_entries_limit`, `_stat_drop`, `_stat_early_drop`. | +| cpu | on | (default) | Softirq and system time, `node_cpu_isolated`. | +| meminfo | on | (default) | **Hugepages**: `node_memory_HugePages_*`, etc. Runbook/DPDK context. | +| meminfo_numa | off | `--collector.meminfo_numa` | Per-NUMA memory; **Example B** (NUMA/OVS). | +| netdev | on | (default) | Throughput and drops: `node_network_*`. | +| netstat | on | (default) | **Example C**: `node_netstat_TcpExt_TCPRcvQDrop`, `ListenOverflows`, `Udp_*bufErrors`. | +| nodeconfig | off | `--collector.nodeconfig` | `node_nodeconfig_cores_dedicated`, PCIe/memory context. | +| pcidevice | off | `--collector.pcidevice` | **Example B**: `node_pcidevice_numa_node` (NIC NUMA). | +| pressure | on | (default) | Memory/IO stall; can correlate with buffer pressure. | +| softirqs | off | `--collector.softirqs` | NET_RX and other softirq vectors. | +| sockstat | on | (default) | **Example C**: `node_sockstat_TCP_mem_bytes`, socket usage. | +| zoneinfo | off | `--collector.zoneinfo` | **Example B**: `node_zoneinfo_numa_*` (NUMA hit/miss). | +| ebpf-pmd-jitter | off | `--collector.ebpf-pmd-jitter --collector.ebpf-pmd-jitter.object-path=...` | Kernel stack latency and jitter; requires eBPF object and `/sys/fs/bpf`. | + +--- + +## 7. Detailed examples: conntrack drops, NUMA latency, TCP buffers + +The following three examples show how to use the exported metrics at `localhost:9192/metrics` (or your node_exporter endpoint) to diagnose specific failure modes. Each example ties back to **cost per packet** and, where relevant, **AF_PACKET** (or kernel path) impact. + +--- + +### Example A: Full nf_conntrack table leads to packet drops + +**What happens:** The kernel connection tracking table has a fixed maximum (`nf_conntrack_max`). Every new flow (e.g. new TCP connection, UDP flow, or NAT session) allocates an entry. When the table is full: + +1. New entries cannot be allocated. +2. The kernel **drops packets** that would create new flows. +3. You see `node_nf_conntrack_stat_drop` and/or `node_nf_conntrack_stat_early_drop` increase; established flows may keep working while new connections fail. + +**Exported metrics that matter:** + +| Metric | How to use it | +|--------|----------------| +| `node_nf_conntrack_entries` | Current number of conntrack entries. If it stays near the limit, the table is full. | +| `node_nf_conntrack_entries_limit` | Maximum table size. Compare to `entries`; ratio close to 1.0 = full. | +| `node_nf_conntrack_stat_early_drop` | **Increments when the kernel drops existing entries to make room** (table was full). **Primary counter** for “table full” drops. | +| `node_nf_conntrack_stat_drop` | **Increments when packets are dropped due to conntrack failure** (allocation failed or protocol helper). Includes both “no room” and other conntrack failures. | +| `node_nf_conntrack_stat_insert` | New entries inserted. High rate with high `entries` → table fills quickly. | +| `node_network_receive_drop_total`, `node_network_receive_errs_total` | Interface-level drops. Can rise when conntrack drops packets before they are “received” by a socket; correlate with conntrack drop spikes. | + +**Decision flow:** + +1. Check **usage ratio**: `node_nf_conntrack_entries / node_nf_conntrack_entries_limit`. If consistently > 0.85–0.95, the table is under heavy pressure. +2. Check **drop counters**: `increase(node_nf_conntrack_stat_early_drop[5m]) > 0` or `increase(node_nf_conntrack_stat_drop[5m]) > 0`. Any increase confirms conntrack-related drops. +3. Correlate in time with **symptoms**: new connection failures, timeouts, or interface drops (`node_network_receive_drop_total`). +4. **Conclusion:** Full conntrack table → packet drops for new flows. Remediation: increase `nf_conntrack_max` (and often `nf_conntrack_buckets`), or reduce connection churn / shorten timeouts. + +**PromQL (alerts):** + +```promql +# Alert when table usage > 90% +(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.9 + +# Alert when any early_drop (table was full) +increase(node_nf_conntrack_stat_early_drop[5m]) > 0 +``` + +**Relation to AF_PACKET / cost per packet:** Conntrack is in the same kernel path as AF_PACKET. When the table is full, **every packet that would create a new flow pays the cost of a failed conntrack lookup/insert and then drop**—so CPU is still spent (cost per packet) but the packet is dropped. High conntrack churn can also increase softirq and latency (more lookups, resizes); correlate with `node_cpu_seconds_total{mode="softirq"}` and eBPF latency if available. + +--- + +### Example B: High packet latency caused by OVS (or kernel path) not NUMA-aware optimized + +**What happens:** On NUMA systems, memory access is faster when the CPU and the memory (or device) are on the same NUMA node. If Open vSwitch (OVS) or the kernel datapath allocates buffers or runs work on a different node than the NIC (or the other end of the path), you get **remote memory access** → higher latency and higher cost per packet. OVS not bound to the NIC’s NUMA node is a common cause. + +**Exported metrics that matter:** + +| Metric | How to use it | +|--------|----------------| +| `node_zoneinfo_numa_hit_total{node="N", zone="Z"}` | Allocations satisfied from the intended node (local). | +| `node_zoneinfo_numa_miss_total{node="N", zone="Z"}` | **Allocations satisfied from another node (remote).** High rate here (or high miss ratio) = many allocations served from the “wrong” node. | +| `node_zoneinfo_numa_other_total` | Allocations from a non-local node. High = cross-node access. | +| `node_zoneinfo_numa_local_total` | Local allocations. Compare to `numa_other_total`; if “other” is a large fraction, workload is not NUMA-local. | +| `node_memory_numa_*{node="..."}` (meminfo_numa) | Per-node memory usage. Helps see which node the process/OVS is using. | +| `node_pcidevice_numa_node` | **NUMA node of each PCI device (e.g. NIC).** Compare to the NUMA node of the process handling the traffic; mismatch suggests non-local path. | +| `node_ebpf_pmd_jitter_latency_avg_ns`, `node_ebpf_pmd_jitter_pmd_jitter_ns` | **Kernel stack latency and jitter.** When NUMA is wrong, latency and jitter often go up (remote access adds delay and variance). | +| `node_cpu_seconds_total{mode="softirq"}`, `node_cpu_isolated` | Which CPUs are busy with softirq; if those CPUs are on a different node than the NIC, you have a NUMA mismatch. | + +**Decision flow:** + +1. Identify **NIC NUMA node**: use `node_pcidevice_numa_node` for the relevant NIC(s). +2. Check **NUMA allocation quality**: compute miss ratio per node, e.g. `rate(node_zoneinfo_numa_miss_total[5m]) / (rate(node_zoneinfo_numa_hit_total[5m]) + rate(node_zoneinfo_numa_miss_total[5m]))`. High ratio on the node(s) handling packet I/O → allocations often served from remote node. +3. Correlate with **latency**: if eBPF is available, `node_ebpf_pmd_jitter_latency_avg_ns` and `node_ebpf_pmd_jitter_pmd_jitter_ns` rising together with high NUMA miss → kernel path (including OVS if in kernel) is paying remote-memory cost per packet. +4. **Conclusion:** High NUMA miss (or high “other”) + high kernel stack latency/jitter → packet path is not NUMA-optimized; consider binding OVS/process to the NIC’s NUMA node and ensuring hugepage/memory allocation is local. + +**PromQL (NUMA miss ratio per node):** + +```promql +# Per-node NUMA miss ratio (by zone) +sum by (node) (rate(node_zoneinfo_numa_miss_total[5m])) + / (sum by (node) (rate(node_zoneinfo_numa_hit_total[5m])) + + sum by (node) (rate(node_zoneinfo_numa_miss_total[5m]))) +``` + +**Relation to AF_PACKET / cost per packet:** Both “OVS not NUMA-aware” and “AF_PACKET on wrong node” increase **cost per packet** (extra cycles for remote access or extra copy). The eBPF latency metrics reflect the combined effect: higher latency and jitter when the path is not NUMA-local. + +--- + +### Example C: TCP buffer monitoring and cost + +**What happens:** TCP and UDP socket buffers are finite. If the application does not read (or send) fast enough, the kernel **drops segments** or **refuses new connections** (listen queue). This shows up as TCP receive-queue drops, listen overflows, or UDP buffer errors. Buffer pressure can also correlate with high CPU (e.g. softirq) or AF_PACKET: if the kernel is busy copying to many AF_PACKET sockets, it may not drain TCP buffers in time → more drops. + +**Exported metrics that matter:** + +| Metric | How to use it | +|--------|----------------| +| `node_netstat_TcpExt_TCPRcvQDrop` | **Segments dropped because the TCP receive queue was full.** Application not reading fast enough or rcvbuf too small. | +| `node_netstat_TcpExt_ListenOverflows` | **Times the listen queue overflowed** (SYN backlog full). New connections dropped. | +| `node_netstat_TcpExt_ListenDrops` | **Times a SYN was dropped** (e.g. listen backlog full). | +| `node_netstat_Udp_RcvbufErrors`, `node_netstat_Udp_SndbufErrors` | **UDP receive/send buffer errors** (drops). | +| `node_netstat_Udp6_RcvbufErrors`, `node_netstat_Udp6_SndbufErrors` | Same for IPv6. | +| `node_sockstat_TCP_mem`, `node_sockstat_TCP_mem_bytes` | **Total TCP socket memory (pages/bytes).** High = many or large TCP buffers; correlate with memory pressure. | +| `node_sockstat_TCP_inuse`, `node_sockstat_sockets_used` | Socket counts. High with high TCPRcvQDrop → many connections and buffer pressure. | +| `node_netstat_Tcp_CurrEstab`, `node_netstat_Tcp_RetransSegs` | Established connections and retransmits; context for load and loss. | +| `node_pressure_memory_stalled_seconds_total`, `node_pressure_memory_waiting_seconds_total` | Memory pressure; can accompany buffer pressure. | + +**Decision flow:** + +1. **TCP receive queue drops:** `increase(node_netstat_TcpExt_TCPRcvQDrop[5m]) > 0`. Any increase = kernel dropped segments due to full rcvbuf. +2. **Listen queue:** `increase(node_netstat_TcpExt_ListenOverflows[5m]) > 0` or `ListenDrops` increasing → new connections dropped (backlog full). +3. **UDP buffers:** `increase(node_netstat_Udp_RcvbufErrors[5m]) > 0` or `SndbufErrors` → UDP socket buffer full. +4. **Context:** Compare with `node_sockstat_TCP_mem_bytes` (total TCP buffer memory), `node_sockstat_TCP_inuse` (connection count), and memory pressure. If TCPRcvQDrop or ListenOverflows rise while softirq or eBPF latency is high, **cost per packet** (e.g. AF_PACKET or other kernel work) may be delaying buffer processing and contributing to drops. + +**PromQL (alerts):** + +```promql +# TCP receive queue drops +increase(node_netstat_TcpExt_TCPRcvQDrop[5m]) > 0 + +# Listen queue overflows +increase(node_netstat_TcpExt_ListenOverflows[5m]) > 0 + +# UDP buffer errors +increase(node_netstat_Udp_RcvbufErrors[5m]) > 0 or increase(node_netstat_Udp_SndbufErrors[5m]) > 0 +``` + +**Relation to AF_PACKET / cost per packet:** When the kernel spends more time per packet (e.g. copying to AF_PACKET sockets), it can drain TCP/UDP socket buffers more slowly. So **high cost per packet** (observed via eBPF latency and softirq) can **contribute** to buffer full → TCPRcvQDrop, ListenOverflows, or RcvbufErrors. Use TCP/UDP buffer metrics as **outcomes** and eBPF + softirq as **cause** to conclude that the kernel path (including AF_PACKET) is increasing cost per packet and indirectly causing buffer drops. + +--- + +## 8. Summary + +- **Cost per packet** in the kernel = CPU time and wall-clock time spent per packet in the stack (interrupt/NAPI → protocol → socket delivery → free). +- **AF_PACKET** (and similar) **increase** that cost by adding per-packet work (e.g. `packet_rcv()`, copies, `consume_skb()`) in softirq. +- **Exported metrics** that matter: + - **eBPF**: `node_ebpf_pmd_jitter_latency_*_ns`, `node_ebpf_pmd_jitter_pmd_jitter_ns`, `node_ebpf_pmd_jitter_latency_histogram`, global totals → **direct** kernel stack cost and jitter. + - **CPU**: `node_cpu_seconds_total{mode="softirq"}`, `node_cpu_isolated` → **where** the cost shows up. + - **Softirqs**: `node_softirqs_functions_total{type="NET_RX"}` → **receive path** (AF_PACKET-heavy). + - **Nodeconfig**: `node_nodeconfig_cores_dedicated`, `node_nodeconfig_pcie_slot_ok`, etc. → **runbook context** (isolated CPUs, PCIe, memory banks). + - **Meminfo**: `node_memory_HugePages_*`, `node_memory_Hugepagesize_bytes` → **runbook context** (hugepages / DPDK). + - **Netdev**: `node_network_*` → **throughput and drops** (outcome of high cost). + - **Conntrack** (Example A): `node_nf_conntrack_entries`, `node_nf_conntrack_entries_limit`, `node_nf_conntrack_stat_drop`, `node_nf_conntrack_stat_early_drop` → **full table → packet drops**. + - **Netstat** (Example C): `node_netstat_TcpExt_TCPRcvQDrop`, `ListenOverflows`, `node_netstat_Udp_*bufErrors` → **TCP/UDP buffer drops**. + - **Sockstat**: `node_sockstat_TCP_mem_bytes`, `node_sockstat_TCP_inuse` → **socket/buffer usage**. + - **NUMA** (Example B): `node_zoneinfo_numa_*`, `node_memory_numa_*`, `node_pcidevice_numa_node` → **NUMA-unfriendly path → high latency**. + - **Pressure**: `node_pressure_memory_*`, `node_pressure_io_*` → **stall**; can correlate with buffer pressure. + +**Detailed examples:** **Example A** (full nf_conntrack → packet drops) uses conntrack metrics and drop counters. **Example B** (high packet latency from OVS not NUMA-optimized) uses NUMA hit/miss, pcidevice NUMA node, and eBPF latency. **Example C** (TCP buffer monitoring) uses netstat TCPRcvQDrop, ListenOverflows, UDP buffer errors, and sockstat; it ties buffer drops to cost per packet and AF_PACKET. + +By correlating **rising kernel stack latency/jitter** with **high softirq** (and optionally NET_RX, conntrack drops, TCP buffer drops, or NUMA miss), you can **determine that the kernel stack—and mechanisms like AF_PACKET—are one way the cost per packet is increased**, and then narrow down to AF_PACKET via known capture activity or system tooling. + +--- + +## 9. Functional test + +A **Linux-only functional test** (`kernel_stack_af_packet_functional_test.go`) generates the scenarios above in a **dedicated network namespace** at **Gbps-scale data rates** where possible, stresses the kernel stack, and checks that the corresponding metrics and **.pcap** captures are produced. The test logs effective throughput (e.g. Gbps) for traffic scenarios and documents limitations of what the metrics can show in this environment (§9.1). + +- **Scenario A (conntrack):** Lowers `nf_conntrack_max` in the netns, runs a TCP server that holds many connections, opens more than the limit from the host, then scrapes `node_nf_conntrack_entries`, `_entries_limit`, `_stat_drop`, `_stat_early_drop` and validates that table pressure and/or drops are reflected; traffic is captured to `scenario_a_conntrack.pcap`. **Limitation:** node_exporter may read `/proc` from the host, so conntrack metrics can reflect the host table, not the netns (§9.1). +- **Scenario C (listen overflow):** Server with listen backlog 1; many connections are opened quickly; asserts `node_netstat_TcpExt_ListenOverflows` / `ListenDrops` and captures to `scenario_c_listen.pcap`. +- **Scenario C (TCP rcvbuf):** Server with small `SO_RCVBUF` and slow read; client sends at high rate until rcvbuf fills; asserts `node_netstat_TcpExt_TCPRcvQDrop` and captures to `scenario_c_rcvq.pcap`. +- **Scenario B (traffic for NUMA):** High-throughput traffic over many connections to exercise the stack; on real NUMA hardware, `node_zoneinfo_numa_*` and `node_pcidevice_numa_node` would correlate with latency. **Limitation:** NUMA topology is not replicated in the netns (§9.1). +- **Traffic + pcap:** High data-rate traffic (target Gbps-scale) while capturing so netdev receive bytes, softirq, and drop metrics reflect per-packet cost; validates that the capture produces a valid `.pcap`. The test logs effective rate (Gbps) achieved. + +**Requirements:** Linux, root (for `ip netns`, `tcpdump`), `node_exporter` binary (e.g. `make build`), and the stress server built from `cmd/kernel_stack_stress_server`. + +**Run:** From the repo root, build the binary and run the test as root. If `go` is not in root’s PATH (e.g. it lives under `/usr/local/go/bin`), use the full path or preserve `PATH`: + +```bash +make build +# Option A: preserve your PATH so root can find go +sudo env "PATH=$PATH" go test -v . -run TestKernelStackAFPacketScenarios -timeout 120s + +# Option B: use full path to go +sudo $(which go) test -v . -run TestKernelStackAFPacketScenarios -timeout 120s +``` + +The test creates netns `kernel_stack_ftest`, a veth pair, starts node_exporter inside the netns on port 9192, runs each scenario, scrapes metrics from the host, and writes `.pcap` files under `/tmp/node_exporter_kernel_stack_pcaps_/`. It skips if not root or if `node_exporter` / `tcpdump` / `ip` are unavailable. + +### 9.1 Limitations of the functional test (what the metrics can and cannot show) + +The test runs in a **network namespace** with **veth** pairs. The following limitations determine what the exported metrics can and cannot demonstrate in this environment: + +| Area | Limitation | What the metrics show in the test | +|------|------------|-----------------------------------| +| **Conntrack (Example A)** | node_exporter often shares the host’s `/proc` (or the test’s procfs is host-mounted). Conntrack counters and `nf_conntrack_max` are then from the **host**, not the netns. | The test can fill the netns conntrack table and cause drops in the netns; if the exporter reads host `/proc`, `node_nf_conntrack_entries_limit` will be the host limit (e.g. 262144), not the netns value (e.g. 50). The test logs this and only asserts conntrack pressure when the scraped limit looks like the netns value. | +| **NUMA (Example B)** | NUMA topology and PCI NUMA node are properties of real hardware. A netns has no separate NUMA topology; veth is not a physical NIC. | Traffic is generated at high rate to stress the stack. `node_zoneinfo_numa_*` and `node_pcidevice_numa_node` are **not** exercised by the test in a meaningful way; they are relevant on multi-NUMA machines with real NICs. The test documents that NUMA correlation requires real hardware. | +| **eBPF kernel stack latency** | The ebpf-pmd-jitter collector attaches to real devices (XDP/TC). In the test, interfaces are veth; the eBPF object may not be loaded or may not attach. | The test does **not** require eBPF to pass. Kernel stack latency and jitter metrics are meaningful when the collector is enabled and attached on real NICs under Gbps-scale load. | +| **Throughput (Gbps)** | veth is loopback-like; achievable rate is high but not representative of a physical 1/10/25 Gbps NIC. | The test aims for **Gbps-scale** data rates (many connections, large transfers) so that `node_network_receive_bytes_total`, softirq, and drop metrics reflect real stress. Achieved rate is logged (e.g. effective Gbps) so limitations of the environment are clear. | +| **AF_PACKET cost** | Per-packet cost (softirq, NET_RX) increases when tcpdump or another capture is active. Correlation of softirq with capture requires sustained high packet rate. | The test runs tcpdump in the netns during traffic and validates pcaps. It generates enough traffic so that netdev and (on real hardware) softirq metrics would show the effect of capture; in the test environment, the main outcome is valid metrics and pcaps. | +| **TCPRcvQDrop / ListenOverflows** | These are global kernel counters (netstat/sockstat). When node_exporter runs in the netns and reads netns `/proc`, the counters are netns-local. | The test reliably triggers ListenOverflows and TCPRcvQDrop in the netns and asserts the corresponding metrics, demonstrating that the **metrics correctly reflect** buffer and listen-queue drops under load. | diff --git a/docs/node-exporter-new-features.excalidraw b/docs/node-exporter-new-features.excalidraw new file mode 100644 index 0000000000..a6ac0872a9 --- /dev/null +++ b/docs/node-exporter-new-features.excalidraw @@ -0,0 +1,1160 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "title", + "type": "text", + "x": 320, + "y": 20, + "width": 460, + "height": 36, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 1, + "version": 1, + "versionNonce": 1001, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Node Exporter: New Features & Linux / eBPF / DPDK Interaction", + "fontSize": 24, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Node Exporter: New Features & Linux / eBPF / DPDK Interaction" + }, + { + "id": "box-node-exporter", + "type": "rectangle", + "x": 340, + "y": 70, + "width": 420, + "height": 60, + "angle": 0, + "strokeColor": "#1971c2", + "backgroundColor": "#a5d8ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 2, + "version": 1, + "versionNonce": 1002, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-node-exporter", + "type": "text", + "x": 450, + "y": 85, + "width": 200, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 3, + "version": 1, + "versionNonce": 1003, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Node Exporter (process)", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "Node Exporter (process)" + }, + { + "id": "box-nodeconfig", + "type": "rectangle", + "x": 120, + "y": 160, + "width": 280, + "height": 100, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 4, + "version": 1, + "versionNonce": 1004, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-nodeconfig", + "type": "text", + "x": 160, + "y": 165, + "width": 200, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 5, + "version": 1, + "versionNonce": 1005, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Nodeconfig collector\n• pcie_nic_min_link_width\n• pcie_slot_ok\n• cores_dedicated\n• memory_banks_full", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Nodeconfig collector\n• pcie_nic_min_link_width\n• pcie_slot_ok\n• cores_dedicated\n• memory_banks_full" + }, + { + "id": "box-ebpf-collector", + "type": "rectangle", + "x": 700, + "y": 160, + "width": 300, + "height": 120, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 6, + "version": 1, + "versionNonce": 1006, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-ebpf-collector", + "type": "text", + "x": 720, + "y": 165, + "width": 260, + "height": 110, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 7, + "version": 1, + "versionNonce": 1007, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "eBPF PMD Jitter collector\n• Loads latency.o\n• Pins maps → /sys/fs/bpf\n• Attaches XDP to interfaces\n• Reads: latency, jitter, histogram", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "eBPF PMD Jitter collector\n• Loads latency.o\n• Pins maps → /sys/fs/bpf\n• Attaches XDP to interfaces\n• Reads: latency, jitter, histogram" + }, + { + "id": "box-sysfs", + "type": "rectangle", + "x": 40, + "y": 320, + "width": 220, + "height": 140, + "angle": 0, + "strokeColor": "#495057", + "backgroundColor": "#e9ecef", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 8, + "version": 1, + "versionNonce": 1008, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-sysfs", + "type": "text", + "x": 60, + "y": 325, + "width": 180, + "height": 130, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 9, + "version": 1, + "versionNonce": 1009, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Sysfs (Linux)\n• PCI: link width\n (network class)\n• /sys/devices/system/\n cpu/isolated\n→ Nodeconfig", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Sysfs (Linux)\n• PCI: link width\n (network class)\n• /sys/devices/system/\n cpu/isolated\n→ Nodeconfig" + }, + { + "id": "box-dmi", + "type": "rectangle", + "x": 40, + "y": 500, + "width": 220, + "height": 80, + "angle": 0, + "strokeColor": "#495057", + "backgroundColor": "#e9ecef", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 10, + "version": 1, + "versionNonce": 1010, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-dmi", + "type": "text", + "x": 60, + "y": 510, + "width": 180, + "height": 60, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 11, + "version": 1, + "versionNonce": 1011, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "DMI/SMBIOS\n/sys/firmware/dmi/entries/\n(Type 16/17) → memory_banks_full", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "DMI/SMBIOS\n/sys/firmware/dmi/entries/\n(Type 16/17) → memory_banks_full" + }, + { + "id": "box-bpffs", + "type": "rectangle", + "x": 700, + "y": 320, + "width": 200, + "height": 80, + "angle": 0, + "strokeColor": "#5f3dc4", + "backgroundColor": "#d0bfff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 12, + "version": 1, + "versionNonce": 1012, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-bpffs", + "type": "text", + "x": 720, + "y": 335, + "width": 160, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 13, + "version": 1, + "versionNonce": 1013, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "/sys/fs/bpf\nMap pinning (eBPF)", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "/sys/fs/bpf\nMap pinning (eBPF)" + }, + { + "id": "box-kernel", + "type": "rectangle", + "x": 320, + "y": 320, + "width": 340, + "height": 200, + "angle": 0, + "strokeColor": "#f08c00", + "backgroundColor": "#ffe066", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 14, + "version": 1, + "versionNonce": 1014, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-kernel", + "type": "text", + "x": 340, + "y": 325, + "width": 300, + "height": 190, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 15, + "version": 1, + "versionNonce": 1015, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Linux kernel\n• XDP hook ← latency.c (timestamp start)\n• Network stack (NAPI, netif_receive_skb)\n• AF_PACKET / sockets (cost per packet)\n• TC hook ← latency.c (timestamp end)\n• Latency = XDP→TC time", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Linux kernel\n• XDP hook ← latency.c (timestamp start)\n• Network stack (NAPI, netif_receive_skb)\n• AF_PACKET / sockets (cost per packet)\n• TC hook ← latency.c (timestamp end)\n• Latency = XDP→TC time" + }, + { + "id": "box-interfaces", + "type": "rectangle", + "x": 320, + "y": 560, + "width": 200, + "height": 80, + "angle": 0, + "strokeColor": "#087f5b", + "backgroundColor": "#c3fae8", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 16, + "version": 1, + "versionNonce": 1016, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-interfaces", + "type": "text", + "x": 340, + "y": 575, + "width": 160, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 17, + "version": 1, + "versionNonce": 1017, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Interfaces (lo, eth0…)\nXDP attached by collector", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Interfaces (lo, eth0…)\nXDP attached by collector" + }, + { + "id": "box-dpdk", + "type": "rectangle", + "x": 580, + "y": 560, + "width": 220, + "height": 100, + "angle": 0, + "strokeColor": "#c92a2a", + "backgroundColor": "#ffd8a8", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 18, + "version": 1, + "versionNonce": 1018, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-dpdk", + "type": "text", + "x": 600, + "y": 565, + "width": 180, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 19, + "version": 1, + "versionNonce": 1019, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "DPDK (optional)\n• PMD on isolated CPUs\n• Hugepages\n• PCIe NIC (nodeconfig\n pcie_slot_ok / link width)", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "DPDK (optional)\n• PMD on isolated CPUs\n• Hugepages\n• PCIe NIC (nodeconfig\n pcie_slot_ok / link width)" + }, + { + "id": "box-hugepages", + "type": "rectangle", + "x": 860, + "y": 560, + "width": 160, + "height": 70, + "angle": 0, + "strokeColor": "#862e9c", + "backgroundColor": "#e5dbff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 20, + "version": 1, + "versionNonce": 1020, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-hugepages", + "type": "text", + "x": 880, + "y": 575, + "width": 120, + "height": 40, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 21, + "version": 1, + "versionNonce": 1021, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Hugepages\n(DPDK / tuning)", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Hugepages\n(DPDK / tuning)" + }, + { + "id": "arrow-exporter-nodeconfig", + "type": "arrow", + "x": 340, + "y": 130, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 22, + "version": 1, + "versionNonce": 1022, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [-120, 30]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-exporter-ebpf", + "type": "arrow", + "x": 760, + "y": 130, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 23, + "version": 1, + "versionNonce": 1023, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [60, 30]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-nodeconfig-sysfs", + "type": "arrow", + "x": 200, + "y": 260, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#495057", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 24, + "version": 1, + "versionNonce": 1024, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [-80, 60]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-nodeconfig-dmi", + "type": "arrow", + "x": 200, + "y": 260, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#495057", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 25, + "version": 1, + "versionNonce": 1025, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [-80, 250]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-ebpf-bpffs", + "type": "arrow", + "x": 800, + "y": 280, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#5f3dc4", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 26, + "version": 1, + "versionNonce": 1026, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [0, 40]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-ebpf-kernel", + "type": "arrow", + "x": 700, + "y": 220, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#f08c00", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 27, + "version": 1, + "versionNonce": 1027, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [-260, 100]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "arrow-kernel-interfaces", + "type": "arrow", + "x": 490, + "y": 520, + "width": 0, + "height": 0, + "angle": 0, + "strokeColor": "#087f5b", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 28, + "version": 1, + "versionNonce": 1028, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "points": [[0, 0], [0, 40]], + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "startBinding": null, + "endBinding": null + }, + { + "id": "label-reads", + "type": "text", + "x": 100, + "y": 268, + "width": 80, + "height": 25, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 29, + "version": 1, + "versionNonce": 1029, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "reads", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "reads" + }, + { + "id": "label-load-pin-attach", + "type": "text", + "x": 430, + "y": 218, + "width": 120, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 30, + "version": 1, + "versionNonce": 1030, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "load / attach / read", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "load / attach / read" + }, + { + "id": "label-packets", + "type": "text", + "x": 455, + "y": 535, + "width": 100, + "height": 25, + "angle": 0, + "strokeColor": "#087f5b", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 31, + "version": 1, + "versionNonce": 1031, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "packets (XDP/TC)", + "fontSize": 14, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "packets (XDP/TC)" + }, + { + "id": "scenario-title", + "type": "text", + "x": 80, + "y": 660, + "width": 520, + "height": 28, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 100, + "version": 1, + "versionNonce": 1100, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Scenarios (KERNEL_STACK_AF_PACKET_METRICS.md): Example A/B/C + AF_PACKET cost", + "fontSize": 18, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Scenarios (KERNEL_STACK_AF_PACKET_METRICS.md): Example A/B/C + AF_PACKET cost" + }, + { + "id": "box-example-a", + "type": "rectangle", + "x": 40, + "y": 700, + "width": 240, + "height": 100, + "angle": 0, + "strokeColor": "#c92a2a", + "backgroundColor": "#ffec99", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 101, + "version": 1, + "versionNonce": 1101, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-example-a", + "type": "text", + "x": 55, + "y": 705, + "width": 210, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 102, + "version": 1, + "versionNonce": 1102, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Example A: Full nf_conntrack → packet drops\n• node_nf_conntrack_entries / _entries_limit\n• node_nf_conntrack_stat_drop, _stat_early_drop\n• Source: /proc/sys/net/netfilter, conntrack collector", + "fontSize": 12, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Example A: Full nf_conntrack → packet drops\n• node_nf_conntrack_entries / _entries_limit\n• node_nf_conntrack_stat_drop, _stat_early_drop\n• Source: /proc/sys/net/netfilter, conntrack collector" + }, + { + "id": "box-example-b", + "type": "rectangle", + "x": 300, + "y": 700, + "width": 260, + "height": 100, + "angle": 0, + "strokeColor": "#5f3dc4", + "backgroundColor": "#e5dbff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 103, + "version": 1, + "versionNonce": 1103, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-example-b", + "type": "text", + "x": 315, + "y": 705, + "width": 230, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 104, + "version": 1, + "versionNonce": 1104, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Example B: OVS / path not NUMA-optimized → high latency\n• node_zoneinfo_numa_miss_total, numa_other_total\n• node_pcidevice_numa_node, node_ebpf_pmd_jitter_latency_*_ns\n• Source: /proc/zoneinfo, sysfs PCI, zoneinfo + pcidevice collectors", + "fontSize": 12, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Example B: OVS / path not NUMA-optimized → high latency\n• node_zoneinfo_numa_miss_total, numa_other_total\n• node_pcidevice_numa_node, node_ebpf_pmd_jitter_latency_*_ns\n• Source: /proc/zoneinfo, sysfs PCI, zoneinfo + pcidevice collectors" + }, + { + "id": "box-example-c", + "type": "rectangle", + "x": 580, + "y": 700, + "width": 260, + "height": 100, + "angle": 0, + "strokeColor": "#087f5b", + "backgroundColor": "#c3fae8", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 105, + "version": 1, + "versionNonce": 1105, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-example-c", + "type": "text", + "x": 595, + "y": 705, + "width": 230, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 106, + "version": 1, + "versionNonce": 1106, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Example C: TCP buffer monitoring\n• node_netstat_TcpExt_TCPRcvQDrop, ListenOverflows\n• node_netstat_Udp_*bufErrors, node_sockstat_TCP_mem_bytes\n• Source: /proc/net/netstat, snmp, sockstat collector", + "fontSize": 12, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Example C: TCP buffer monitoring\n• node_netstat_TcpExt_TCPRcvQDrop, ListenOverflows\n• node_netstat_Udp_*bufErrors, node_sockstat_TCP_mem_bytes\n• Source: /proc/net/netstat, snmp, sockstat collector" + }, + { + "id": "box-afpacket-cost", + "type": "rectangle", + "x": 860, + "y": 700, + "width": 220, + "height": 100, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 107, + "version": 1, + "versionNonce": 1107, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-afpacket-cost", + "type": "text", + "x": 875, + "y": 705, + "width": 190, + "height": 90, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 108, + "version": 1, + "versionNonce": 1108, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "AF_PACKET cost per packet\n• node_cpu_seconds_total{mode=softirq}\n• node_softirqs_* type=NET_RX\n• node_ebpf_pmd_jitter_*_ns (kernel stack latency)\n• High → throughput drop, correlate with netdev drops", + "fontSize": 12, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "AF_PACKET cost per packet\n• node_cpu_seconds_total{mode=softirq}\n• node_softirqs_* type=NET_RX\n• node_ebpf_pmd_jitter_*_ns (kernel stack latency)\n• High → throughput drop, correlate with netdev drops" + }, + { + "id": "box-proc-sources", + "type": "rectangle", + "x": 320, + "y": 830, + "width": 380, + "height": 70, + "angle": 0, + "strokeColor": "#495057", + "backgroundColor": "#f1f3f5", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": { "type": 3 }, + "seed": 109, + "version": 1, + "versionNonce": 1109, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false + }, + { + "id": "text-proc-sources", + "type": "text", + "x": 335, + "y": 835, + "width": 350, + "height": 60, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roundness": null, + "seed": 110, + "version": 1, + "versionNonce": 1110, + "isDeleted": false, + "boundElements": null, + "updated": 1, + "link": null, + "locked": false, + "text": "Data sources for scenarios: /proc/sys/net/netfilter (conntrack), /proc/net/netstat & snmp (netstat), /proc/net/sockstat (sockstat), /proc/zoneinfo (zoneinfo), /proc/pressure/* (pressure). All read by Node Exporter → localhost:9192/metrics", + "fontSize": 12, + "fontFamily": 1, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Data sources for scenarios: /proc/sys/net/netfilter (conntrack), /proc/net/netstat & snmp (netstat), /proc/net/sockstat (sockstat), /proc/zoneinfo (zoneinfo), /proc/pressure/* (pressure). All read by Node Exporter → localhost:9192/metrics" + } + ], + "appState": { + "gridSize": 20, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} diff --git a/go.mod b/go.mod index 056a340030..d636e1e7f7 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.24.0 require ( github.com/alecthomas/kingpin/v2 v2.4.0 github.com/beevik/ntp v1.5.0 + github.com/cilium/ebpf v0.19.0 github.com/coreos/go-systemd/v22 v22.7.0 github.com/dennwc/btrfs v0.0.0-20241002142654-12ae127e0bf6 github.com/ema/qdisc v1.0.0 diff --git a/go.sum b/go.sum index 1ef3b5a97a..05b3448a06 100644 --- a/go.sum +++ b/go.sum @@ -25,6 +25,8 @@ github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/ema/qdisc v1.0.0 h1:EHLG08FVRbWLg8uRICa3xzC9Zm0m7HyMHfXobWFnXYg= github.com/ema/qdisc v1.0.0/go.mod h1:FhIc0fLYi7f+lK5maMsesDqwYojIOh3VfRs8EVd5YJQ= +github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s= +github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI= github.com/godbus/dbus/v5 v5.2.2 h1:TUR3TgtSVDmjiXOgAAyaZbYmIeP3DPkld3jgKGV8mXQ= github.com/godbus/dbus/v5 v5.2.2/go.mod h1:3AAv2+hPq5rdnr5txxxRwiGjPXamgoIHgz9FPBfOp3c= github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= @@ -90,8 +92,8 @@ github.com/prometheus/exporter-toolkit v0.15.0 h1:Pcle5sSViwR1x0gdPd0wtYrPQENBie github.com/prometheus/exporter-toolkit v0.15.0/go.mod h1:OyRWd2iTo6Xge9Kedvv0IhCrJSBu36JCfJ2yVniRIYk= github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/safchain/ethtool v0.7.0 h1:rlJzfDetsVvT61uz8x1YIcFn12akMfuPulHtZjtb7Is= github.com/safchain/ethtool v0.7.0/go.mod h1:MenQKEjXdfkjD3mp2QdCk8B/hwvkrlOTm/FD4gTpFxQ= github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 h1:GfSdC6wKfTGcgCS7BtzF5694Amne1pGCSTY252WhlEY= diff --git a/kernel_stack_af_packet_functional_test.go b/kernel_stack_af_packet_functional_test.go new file mode 100644 index 0000000000..897ca67809 --- /dev/null +++ b/kernel_stack_af_packet_functional_test.go @@ -0,0 +1,528 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"). +// Functional test for scenarios in docs/KERNEL_STACK_AF_PACKET_METRICS.md: +// Example A (conntrack full → drops), Example C (TCP listen/buffer drops), +// traffic generation and pcap capture in a dedicated network namespace. +// +//go:build linux + +package main + +import ( + "bufio" + "bytes" + "fmt" + "io" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "sync" + "testing" + "time" +) + +const pcapDirPrefix = "/tmp/node_exporter_kernel_stack_pcaps_" + +const ( + netnsName = "kernel_stack_ftest" + vethHost = "veth_ftest_host" + vethNetns = "veth_ftest_ns" + netnsAddr = "10.0.0.1" + hostAddr = "10.0.0.2" + exporterPort = "9192" + stressPort = "9999" + conntrackMax = 50 // low limit to replicate Example A: table full → drops + conntrackBurst = 100 // connection attempts to exceed limit and trigger drops + listenBurst = 100 // parallel SYNs to overflow backlog=1 (Example C) + rcvqSendBytes = 512 * 1024 // send volume to fill rcvbuf and trigger TCPRcvQDrop (Example C) + // Gbps-scale traffic: target high data rate to stress stack and produce measurable metrics. + trafficScenarioBConns = 80 // Scenario B (NUMA/traffic): parallel connections + trafficScenarioBBytes = 1 << 20 * 32 // 32 MB total (~Gbps-scale over a few seconds) + trafficScenarioPcapConns = 100 // TrafficAndPcap: connections for AF_PACKET cost + trafficScenarioPcapBytes = 1 << 20 * 64 // 64 MB total per run +) + +func TestKernelStackAFPacketScenarios(t *testing.T) { + if os.Getuid() != 0 { + t.Skip("functional test requires root (for network namespace and tcpdump)") + } + binary, err := findNodeExporterBinary() + if err != nil { + t.Skipf("node_exporter binary not found: %v", err) + } + stressBin, err := buildStressServer(t) + if err != nil { + t.Skipf("stress server build failed: %v", err) + } + // Preserve .pcap files in /tmp for inspection after the test. + pcapDir := pcapDirPrefix + strconv.FormatInt(time.Now().Unix(), 10) + if err := os.MkdirAll(pcapDir, 0755); err != nil { + t.Fatalf("create pcap dir %s: %v", pcapDir, err) + } + t.Logf("preserving .pcap files in %s", pcapDir) + t.Logf("replication uses Gbps-scale data rates where possible; limitations (conntrack/NUMA/eBPF/veth) documented in docs/KERNEL_STACK_AF_PACKET_METRICS.md §9.1") + pcaps := map[string]string{ + "scenario_a_conntrack.pcap": "", + "scenario_c_listen.pcap": "", + "scenario_c_rcvq.pcap": "", + "scenario_traffic.pcap": "", + } + for k := range pcaps { + pcaps[k] = filepath.Join(pcapDir, k) + } + + // Create network namespace and veth pair. + if err := createNetnsAndVeth(t); err != nil { + t.Fatalf("create netns and veth: %v", err) + } + defer cleanupNetns(t) + + // Start node_exporter inside the netns. + exporterCmd := exec.Command("ip", "netns", "exec", netnsName, binary, + "--web.listen-address=0.0.0.0:"+exporterPort, + "--path.procfs=/proc", "--path.sysfs=/sys") + if err := exporterCmd.Start(); err != nil { + t.Fatalf("start node_exporter: %v", err) + } + defer func() { + if exporterCmd.Process != nil { + exporterCmd.Process.Kill() + } + }() + metricsURL := "http://" + netnsAddr + ":" + exporterPort + "/metrics" + if err := waitForExporter(metricsURL); err != nil { + t.Fatalf("exporter not ready: %v", err) + } + + // Scenario A: Replicates KERNEL_STACK_AF_PACKET_METRICS.md Example A — full nf_conntrack + // table leads to packet drops (new flows cannot be allocated, stat_drop/stat_early_drop). + t.Run("ScenarioA_ConntrackDrops", func(t *testing.T) { + _ = runInNetns(t, "sysctl", "-w", "net.netfilter.nf_conntrack_max="+strconv.Itoa(conntrackMax)) + _ = runInNetns(t, "sysctl", "-w", "net.netfilter.nf_conntrack_tcp_timeout_established=60") + // Server holds many connections so conntrack table fills; we then exceed limit with a burst. + serverCmd := exec.Command("ip", "netns", "exec", netnsName, stressBin, "-port", stressPort, "-hold", strconv.Itoa(conntrackBurst)) + if err := serverCmd.Start(); err != nil { + t.Skipf("start stress server: %v", err) + } + defer serverCmd.Process.Kill() + time.Sleep(500 * time.Millisecond) + pcapPath := pcaps["scenario_a_conntrack.pcap"] + stopCapture := startPcapInNetns(t, pcapPath, 250) + // Burst 1: exceed conntrack limit so new connections are dropped (replicate the issue). + ok1, fail1 := openManyConnectionsWithStats(t, netnsAddr+":"+stressPort, conntrackBurst) + t.Logf("Scenario A burst 1: %d connected, %d failed (conntrack limit=%d)", ok1, fail1, conntrackMax) + // Burst 2: more attempts to stress table and trigger early_drop. + ok2, fail2 := openManyConnectionsWithStats(t, netnsAddr+":"+stressPort, conntrackBurst) + t.Logf("Scenario A burst 2: %d connected, %d failed", ok2, fail2) + stopCapture() + // Scrape and assert conntrack metrics show table pressure and/or drops. + before := scrapeMetricValues(t, metricsURL, "node_nf_conntrack_entries", "node_nf_conntrack_entries_limit", "node_nf_conntrack_stat_drop", "node_nf_conntrack_stat_early_drop") + entries := before["node_nf_conntrack_entries"] + limit := before["node_nf_conntrack_entries_limit"] + drop := before["node_nf_conntrack_stat_drop"] + earlyDrop := before["node_nf_conntrack_stat_early_drop"] + if limit > 0 && entries >= limit*0.8 { + t.Logf("conntrack entries %.0f / limit %.0f (table pressure)", entries, limit) + } + if drop > 0 || earlyDrop > 0 { + t.Logf("conntrack drops: stat_drop=%.0f stat_early_drop=%.0f", drop, earlyDrop) + } + // Node_exporter in netns still reads host /proc (same mount ns), so conntrack + // metrics are from the host, not the netns we stressed. Only assert when limit + // looks like our netns value (e.g. we set 50). + if limit > 0 && limit < 1000 { + if entries < limit*0.5 && drop == 0 && earlyDrop == 0 { + t.Logf("conntrack may not have been stressed (entries=%.0f limit=%.0f)", entries, limit) + } + } else { + t.Logf("conntrack metrics from host (limit=%.0f); netns stress not visible in metrics", limit) + } + validatePcap(t, pcapPath) + }) + + // Scenario C (1): Replicates doc Example C — listen queue overflow (SYN backlog full) + // → ListenOverflows / ListenDrops when backlog=1 and many SYNs arrive. + t.Run("ScenarioC_ListenOverflow", func(t *testing.T) { + serverCmd := exec.Command("ip", "netns", "exec", netnsName, stressBin, "-port", stressPort, "-backlog", "1", "-hold", "10") + if err := serverCmd.Start(); err != nil { + t.Skipf("start stress server: %v", err) + } + defer serverCmd.Process.Kill() + time.Sleep(300 * time.Millisecond) + pcapPath := pcaps["scenario_c_listen.pcap"] + stopCapture := startPcapInNetns(t, pcapPath, 200) + // Many parallel SYNs to overflow the listen queue (replicate the issue). + ok, fail := openManyConnectionsWithStats(t, netnsAddr+":"+stressPort, listenBurst) + t.Logf("Scenario C listen: %d connected, %d failed (backlog=1)", ok, fail) + stopCapture() + metrics := scrapeMetricValues(t, metricsURL, "node_netstat_TcpExt_ListenOverflows", "node_netstat_TcpExt_ListenDrops", "node_netstat_TcpExt_TCPRcvQDrop") + if v := metrics["node_netstat_TcpExt_ListenOverflows"]; v > 0 { + t.Logf("ListenOverflows=%.0f", v) + } + if v := metrics["node_netstat_TcpExt_ListenDrops"]; v > 0 { + t.Logf("ListenDrops=%.0f", v) + } + validatePcap(t, pcapPath) + }) + + // Scenario C (2): Replicates doc Example C — TCP receive queue full (application not + // reading fast enough or rcvbuf too small) → TCPRcvQDrop. Small rcvbuf + slow read fills + // kernel queue; sender runs with write deadline to avoid blocking. + t.Run("ScenarioC_TCPRcvQDrop", func(t *testing.T) { + serverCmd := exec.Command("ip", "netns", "exec", netnsName, stressBin, + "-port", stressPort, "-hold", "1", "-rcvbuf", "512", "-read-delay", "1ms") + if err := serverCmd.Start(); err != nil { + t.Skipf("start stress server: %v", err) + } + defer serverCmd.Process.Kill() + time.Sleep(300 * time.Millisecond) + pcapPath := pcaps["scenario_c_rcvq.pcap"] + stopCapture := startPcapInNetns(t, pcapPath, 150) + // Send much more than rcvbuf (512 B) and faster than server reads to fill queue. + sendFasterThanRead(t, netnsAddr+":"+stressPort, rcvqSendBytes) + stopCapture() + metrics := scrapeMetricValues(t, metricsURL, "node_netstat_TcpExt_TCPRcvQDrop", "node_sockstat_TCP_mem_bytes") + if v := metrics["node_netstat_TcpExt_TCPRcvQDrop"]; v > 0 { + t.Logf("TCPRcvQDrop=%.0f", v) + } + validatePcap(t, pcapPath) + }) + + // Scenario B: Replicates doc Example B conditions — Gbps-scale traffic so that on NUMA + // hardware zoneinfo_numa_miss/other and pcidevice numa_node would correlate with latency. + // Limitation: NUMA topology is not replicated in the netns (see doc §9.1). + t.Run("ScenarioB_TrafficForNUMA", func(t *testing.T) { + serverCmd := exec.Command("ip", "netns", "exec", netnsName, stressBin, "-port", stressPort, "-hold", strconv.Itoa(trafficScenarioBConns)) + if err := serverCmd.Start(); err != nil { + t.Skipf("start stress server: %v", err) + } + defer serverCmd.Process.Kill() + time.Sleep(300 * time.Millisecond) + pcapPath := filepath.Join(pcapDir, "scenario_b_numa_traffic.pcap") + stopCapture := startPcapInNetns(t, pcapPath, 200) + bytesPerConn := trafficScenarioBBytes / trafficScenarioBConns + total, elapsed := openConnectionsAndSendDataTimed(t, netnsAddr+":"+stressPort, trafficScenarioBConns, bytesPerConn) + stopCapture() + logEffectiveGbps(t, total, elapsed, "Scenario B (traffic for NUMA)") + t.Logf("Scenario B: on real NUMA hardware check node_zoneinfo_numa_* and node_pcidevice_numa_node; netns has no NUMA topology") + validatePcap(t, pcapPath) + }) + + // Replicates doc §1.2 / §4 — AF_PACKET cost: Gbps-scale traffic while capturing (tcpdump) + // so softirq and netdev metrics reflect per-packet cost; pcap captures the same traffic. + t.Run("TrafficAndPcap", func(t *testing.T) { + serverCmd := exec.Command("ip", "netns", "exec", netnsName, stressBin, "-port", stressPort, "-hold", strconv.Itoa(trafficScenarioPcapConns)) + if err := serverCmd.Start(); err != nil { + t.Skipf("start stress server: %v", err) + } + defer serverCmd.Process.Kill() + time.Sleep(300 * time.Millisecond) + pcapPath := pcaps["scenario_traffic.pcap"] + stopCapture := startPcapInNetns(t, pcapPath, 300) + bytesPerConn := trafficScenarioPcapBytes / trafficScenarioPcapConns + total, elapsed := openConnectionsAndSendDataTimed(t, netnsAddr+":"+stressPort, trafficScenarioPcapConns, bytesPerConn) + stopCapture() + logEffectiveGbps(t, total, elapsed, "TrafficAndPcap (AF_PACKET cost)") + metrics := scrapeMetricValues(t, metricsURL, "node_network_receive_bytes_total", "node_network_receive_drop_total") + t.Logf("traffic metrics (receive bytes/drops; correlate with softirq when capture active): %v", metrics) + validatePcap(t, pcapPath) + }) + t.Logf("pcap files preserved in %s", pcapDir) +} + +func findNodeExporterBinary() (string, error) { + wd, _ := os.Getwd() + moduleRoot := findModuleRoot(wd) + gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + candidates := []string{ + filepath.Join(moduleRoot, "node_exporter"), // make build in repo root + filepath.Join(gopath, "bin", "node_exporter"), + "./node_exporter", + "node_exporter", + } + for _, c := range candidates { + if c == "" { + continue + } + if path, err := exec.LookPath(c); err == nil { + return path, nil + } + if _, err := os.Stat(c); err == nil { + abs, _ := filepath.Abs(c) + return abs, nil + } + } + return "", fmt.Errorf("node_exporter binary not found (run 'make build' from repo root)") +} + +// findModuleRoot returns the directory containing go.mod by walking up from dir. +func findModuleRoot(dir string) string { + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + return "" + } + dir = parent + } +} + +func buildStressServer(t *testing.T) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + bin := filepath.Join(t.TempDir(), "kernel_stack_stress_server") + cmd := exec.Command("go", "build", "-o", bin, "./cmd/kernel_stack_stress_server") + cmd.Dir = wd + if out, err := cmd.CombinedOutput(); err != nil { + return "", fmt.Errorf("%w: %s", err, out) + } + return bin, nil +} + +func createNetnsAndVeth(t *testing.T) error { + // Create netns and veth; move one end into netns; assign IPs. + cleanupNetns(t) + for _, c := range [][]string{ + {"ip", "netns", "add", netnsName}, + {"ip", "link", "add", vethHost, "type", "veth", "peer", "name", vethNetns}, + {"ip", "link", "set", vethNetns, "netns", netnsName}, + {"ip", "addr", "add", hostAddr + "/24", "dev", vethHost}, + {"ip", "link", "set", vethHost, "up"}, + } { + if out, err := exec.Command(c[0], c[1:]...).CombinedOutput(); err != nil { + return fmt.Errorf("%v: %s", err, out) + } + } + if out, err := runInNetnsOut(t, "ip", "addr", "add", netnsAddr+"/24", "dev", vethNetns); err != nil { + return fmt.Errorf("netns addr: %s %v", out, err) + } + if out, err := runInNetnsOut(t, "ip", "link", "set", vethNetns, "up"); err != nil { + return fmt.Errorf("netns link up: %s %v", out, err) + } + if out, err := runInNetnsOut(t, "ip", "link", "set", "lo", "up"); err != nil { + return fmt.Errorf("netns lo up: %s %v", out, err) + } + return nil +} + +func cleanupNetns(t *testing.T) { + exec.Command("ip", "netns", "del", netnsName).Run() + exec.Command("ip", "link", "del", vethHost).Run() +} + +func runInNetns(t *testing.T, name string, args ...string) error { + out, err := runInNetnsOut(t, name, args...) + if err != nil { + t.Logf("runInNetns %s: %s", name, out) + } + return err +} + +func runInNetnsOut(t *testing.T, name string, args ...string) ([]byte, error) { + cmd := exec.Command("ip", "netns", "exec", netnsName, name) + cmd.Args = append(cmd.Args, args...) + return cmd.CombinedOutput() +} + +func waitForExporter(metricsURL string) error { + for i := 0; i < 25; i++ { + resp, err := http.Get(metricsURL) + if err == nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("exporter at %s did not become ready", metricsURL) +} + +func scrapeMetricValues(t *testing.T, metricsURL string, names ...string) map[string]float64 { + resp, err := http.Get(metricsURL) + if err != nil { + t.Fatalf("scrape: %v", err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + return parsePrometheusMetrics(body, names) +} + +// parsePrometheusMetrics extracts gauge/counter values for the given metric names. +func parsePrometheusMetrics(body []byte, names []string) map[string]float64 { + want := make(map[string]bool) + for _, n := range names { + want[n] = true + } + out := make(map[string]float64) + scanner := bufio.NewScanner(bytes.NewReader(body)) + // Match lines like: node_nf_conntrack_entries 42 or node_netstat_TcpExt_ListenOverflows{foo="bar"} 1 + re := regexp.MustCompile(`^(node_[a-zA-Z0-9_]+)(?:\{[^}]*\})?\s+([0-9.eE+-]+)`) + for scanner.Scan() { + line := scanner.Text() + if len(line) == 0 || line[0] == '#' { + continue + } + m := re.FindStringSubmatch(line) + if m == nil { + continue + } + name := m[1] + if !want[name] { + continue + } + v, err := strconv.ParseFloat(m[2], 64) + if err != nil { + continue + } + out[name] = v + } + return out +} + +// startPcapInNetns starts tcpdump in the netns and returns a function that waits for +// it to exit after capturing maxPackets. Do not kill tcpdump—letting it exit naturally +// ensures the pcap file is flushed to disk. +func startPcapInNetns(t *testing.T, pcapPath string, maxPackets int) func() { + cmd := exec.Command("ip", "netns", "exec", netnsName, "tcpdump", "-i", "any", "-w", pcapPath, "-c", strconv.Itoa(maxPackets)) + if err := cmd.Start(); err != nil { + t.Skipf("tcpdump not available: %v", err) + } + time.Sleep(300 * time.Millisecond) + return func() { + done := make(chan struct{}) + go func() { + cmd.Wait() + close(done) + }() + select { + case <-done: + return + case <-time.After(15 * time.Second): + cmd.Process.Kill() + <-done + } + } +} + +// openManyConnectionsWithStats opens n parallel TCP connections; returns how many succeeded vs failed. +// Used to replicate conntrack full (many fail) or listen overflow (many fail when backlog=1). +func openManyConnectionsWithStats(t *testing.T, addr string, n int) (ok, fail int) { + var wg sync.WaitGroup + var mu sync.Mutex + for i := 0; i < n; i++ { + wg.Add(1) + go func() { + defer wg.Done() + c, err := net.DialTimeout("tcp", addr, 3*time.Second) + mu.Lock() + if err != nil { + fail++ + } else { + ok++ + c.Close() + } + mu.Unlock() + }() + } + wg.Wait() + return ok, fail +} + +// openConnectionsAndSendData opens n connections, each sends size bytes, then closes. +// Replicates traffic volume for AF_PACKET cost (netdev receive bytes, softirq). +func openConnectionsAndSendData(t *testing.T, addr string, n int, size int) { + _, _ = openConnectionsAndSendDataTimed(t, addr, n, size) +} + +// openConnectionsAndSendDataTimed does the same but returns total bytes sent and duration +// so callers can log effective rate (e.g. Gbps). Used for Gbps-scale stress scenarios. +func openConnectionsAndSendDataTimed(t *testing.T, addr string, n int, size int) (totalBytes int, elapsed time.Duration) { + data := make([]byte, 64*1024) // 64 KB chunks for higher throughput + var wg sync.WaitGroup + var totalMu sync.Mutex + var sharedTotal int + start := time.Now() + for i := 0; i < n; i++ { + wg.Add(1) + go func() { + defer wg.Done() + c, err := net.DialTimeout("tcp", addr, 5*time.Second) + if err != nil { + return + } + defer c.Close() + sent := 0 + for sent < size { + chunk := size - sent + if chunk > len(data) { + chunk = len(data) + } + nw, err := c.Write(data[:chunk]) + if err != nil { + return + } + sent += nw + totalMu.Lock() + sharedTotal += nw + totalMu.Unlock() + } + }() + } + wg.Wait() + elapsed = time.Since(start) + return sharedTotal, elapsed +} + +func logEffectiveGbps(t *testing.T, totalBytes int, elapsed time.Duration, label string) { + if elapsed <= 0 { + return + } + gbps := (float64(totalBytes) * 8) / (elapsed.Seconds() * 1e9) + t.Logf("%s: %d bytes in %s → effective rate %.3f Gbps (metrics: node_network_*_bytes_total, softirq)", label, totalBytes, elapsed.Round(time.Millisecond), gbps) +} + +func sendFasterThanRead(t *testing.T, addr string, size int) { + c, err := net.DialTimeout("tcp", addr, 3*time.Second) + if err != nil { + t.Skipf("dial: %v", err) + } + defer c.Close() + // Server reads very slowly; TCP window fills and Write would block forever. + // Use a short write deadline so we send enough to fill rcvbuf and trigger TCPRcvQDrop, then exit. + if tcp, ok := c.(*net.TCPConn); ok { + _ = tcp.SetWriteDeadline(time.Now().Add(10 * time.Second)) + } + data := make([]byte, 4096) + for sent := 0; sent < size; sent += len(data) { + if _, err := c.Write(data); err != nil { + break + } + } +} + +func validatePcap(t *testing.T, pcapPath string) { + info, err := os.Stat(pcapPath) + if err != nil { + t.Errorf("pcap missing: %v", err) + return + } + if info.Size() < 24 { + t.Errorf("pcap too small (%d bytes)", info.Size()) + } + t.Logf("pcap %s: %d bytes", pcapPath, info.Size()) +}