diff --git a/Cargo.lock b/Cargo.lock index 66ce7bd3..d408fb0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -708,7 +708,7 @@ dependencies = [ "ipc-channel", "itertools 0.14.0", "libc", - "linux-perf-data 0.12.0", + "linux-perf-data 0.12.0 (git+https://github.com/mstange/linux-perf-data.git?rev=da5bce4b9fb724e84b1eea0cb6ab9c8a291bc676)", "log", "md5", "memmap2", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "fxprof-processed-profile" version = "0.8.1" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "bitflags", "debugid", @@ -2177,6 +2177,20 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" +[[package]] +name = "linux-perf-data" +version = "0.12.0" +source = "git+https://github.com/CodSpeedHQ/linux-perf-data?rev=87308af9cad1#87308af9cad13fc93d7e7100652cc05e531c37a4" +dependencies = [ + "byteorder", + "linear-map", + "linux-perf-event-reader 0.10.2 (git+https://github.com/AvalancheHQ/linux-perf-event-reader?rev=908775c8b5bd)", + "memchr", + "prost", + "prost-derive", + "thiserror 2.0.18", +] + [[package]] name = "linux-perf-data" version = "0.12.0" @@ -2184,7 +2198,7 @@ source = "git+https://github.com/mstange/linux-perf-data.git?rev=da5bce4b9fb724e dependencies = [ "byteorder", "linear-map", - "linux-perf-event-reader", + "linux-perf-event-reader 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)", "memchr", "prost", "prost-derive", @@ -2200,12 +2214,11 @@ checksum = "79544deaf2626fe2d10e5f87af7b7fca93c0d0062fc6ec84fc24e463039c6750" dependencies = [ "byteorder", "linear-map", - "linux-perf-event-reader", + "linux-perf-event-reader 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)", 
"memchr", "prost", "prost-derive", "thiserror 2.0.18", - "zstd-safe", ] [[package]] @@ -2220,6 +2233,17 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "linux-perf-event-reader" +version = "0.10.2" +source = "git+https://github.com/AvalancheHQ/linux-perf-event-reader?rev=908775c8b5bd#908775c8b5bd620ea4ec767d7e56afb3b2f232ac" +dependencies = [ + "bitflags", + "byteorder", + "memchr", + "thiserror 2.0.18", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -3560,6 +3584,7 @@ dependencies = [ "codspeed-divan-compat", "itertools 0.14.0", "libc", + "linux-perf-event-reader 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)", "log", "rand 0.8.6", "rmp", @@ -3749,7 +3774,7 @@ dependencies = [ [[package]] name = "samply" version = "0.13.1" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "bitflags", "byteorder", @@ -3771,7 +3796,7 @@ dependencies = [ "indexmap", "lazy_static", "libc", - "linux-perf-data 0.13.0", + "linux-perf-data 0.12.0 (git+https://github.com/CodSpeedHQ/linux-perf-data?rev=87308af9cad1)", "log", "mach2", "memchr", @@ -3818,7 +3843,7 @@ dependencies = [ [[package]] name = "samply-api" version = "0.24.0" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "samply-debugid", "samply-symbols", @@ -3834,7 +3859,7 @@ dependencies = [ [[package]] name = "samply-debugid" version = "0.1.0" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = 
"git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "debugid", "uuid", @@ -3843,7 +3868,7 @@ dependencies = [ [[package]] name = "samply-object" version = "0.1.0" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "debugid", "object", @@ -3854,7 +3879,7 @@ dependencies = [ [[package]] name = "samply-quota-manager" version = "0.1.0" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "bytesize", "futures", @@ -3868,7 +3893,7 @@ dependencies = [ [[package]] name = "samply-symbols" version = "0.24.1" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "addr2line", "bitflags", @@ -5247,7 +5272,7 @@ dependencies = [ [[package]] name = "wholesym" version = "0.8.1" -source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=ec97a70c0667098f8607f30a607ddd031a15a8b8#ec97a70c0667098f8607f30a607ddd031a15a8b8" +source = "git+https://github.com/CodSpeedHQ/samply-codspeed?rev=81ba2c346e71#81ba2c346e71d05aaef94448e2962961d29cb4c8" dependencies = [ "bytes", "core-foundation 0.10.1", diff --git a/Cargo.toml b/Cargo.toml index ba368974..4a82e7b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,7 +69,7 @@ rmp-serde = "1.3.1" uuid = { version = "1.23.1", features = ["v4"] } which = "8.0.2" crc32fast = "1.5.0" -samply = { 
git = "https://github.com/CodSpeedHQ/samply-codspeed", rev = "ec97a70c0667098f8607f30a607ddd031a15a8b8" } +samply = { git = "https://github.com/CodSpeedHQ/samply-codspeed", rev = "81ba2c346e71" } [target.'cfg(target_os = "linux")'.dependencies] procfs = "0.18" @@ -103,6 +103,7 @@ serde_json = "1.0" serde = { version = "1.0.228", features = ["derive"] } ipc-channel = "0.20" itertools = "0.14.0" +linux-perf-event-reader = "0.10.2" # matches the version linux-perf-data resolves to env_logger = "0.11.10" tempfile = "3.27.0" object = { version = "0.39", default-features = false, features = ["read_core", "elf"] } diff --git a/crates/runner-shared/Cargo.toml b/crates/runner-shared/Cargo.toml index ea007198..d0c67f77 100644 --- a/crates/runner-shared/Cargo.toml +++ b/crates/runner-shared/Cargo.toml @@ -11,6 +11,7 @@ serde_json = { workspace = true } # Pinned to 1.x: 2.0 changes the wire format and serde integration bincode = "1.3" itertools = { workspace = true } +linux-perf-event-reader = { workspace = true } log = { workspace = true } rmp = "0.8.15" rmp-serde = "1.3.1" diff --git a/crates/runner-shared/src/perf_event.rs b/crates/runner-shared/src/perf_event.rs index 1dfe2af4..b93692bd 100644 --- a/crates/runner-shared/src/perf_event.rs +++ b/crates/runner-shared/src/perf_event.rs @@ -1,14 +1,28 @@ +use linux_perf_event_reader::constants::{ + PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_TYPE_RAW, +}; + /// Subset of perf events that CodSpeed supports. +/// +/// Each variant is a semantic slot of the cache/execution model, named by +/// [`Self::to_perf_string`] and backed by a concrete PMU event resolved for +/// the current CPU (see [`Self::to_samply_spec`]). #[derive(Debug, Clone, Copy)] pub enum PerfEvent { CpuCycles, + /// L1 data cache accesses. L1DCache, + /// Accesses one level below L1: what L1 misses spill into. Hits in L1 are + /// derived as `L1DCache - L2DCache`. L2DCache, + /// Misses out of the last profiled cache level (i.e. 
trips to memory). + /// Hits below L1 are derived as `L2DCache - CacheMisses`. CacheMisses, Instructions, } impl PerfEvent { + /// The event name backing this slot. pub fn to_perf_string(&self) -> &'static str { match self { PerfEvent::CpuCycles => "cpu-cycles", @@ -28,6 +42,108 @@ impl PerfEvent { PerfEvent::Instructions, ] } + + /// The `<name>:<type>:<config>` spec for samply's `--perf-events`, + /// resolving this slot to a concrete PMU event of the CPU we are running + /// on. + /// + /// `None` when the slot has no suitable backing event on this CPU. + /// The column is labelled with [`Self::to_perf_string`] so samply profiles + /// carry the same event names as perf ones and parse through one path. + pub fn to_samply_spec(&self) -> Option<String> { + let (event_type, config) = self.perf_event_attr()?; + Some(format!( + "{}:{}:{:#x}", + self.to_perf_string(), + event_type, + config + )) + } + + /// The `perf_event_attr` `(type, config)` encoding backing this slot on + /// the current CPU. + fn perf_event_attr(&self) -> Option<(u32, u64)> { + match self { + // Generalized hardware events, portable across architectures. + PerfEvent::CpuCycles => Some((PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES.into())), + PerfEvent::Instructions => { + Some((PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS.into())) + } + _ => Some((PERF_TYPE_RAW, self.raw_cache_config()?)), + } + } + + /// Raw PMU encoding of this cache slot on x86_64: `umask << 8 | event`, + /// the layout the kernel expects in `perf_event_attr.config` for + /// `PERF_TYPE_RAW`. + /// + /// Only Intel has a vetted selection; other vendors get no cache events. + /// EventCode/UMask come from Intel's perfmon tables, listed per mnemonic in + /// the Skylake-X core event file + /// (), + /// stable since Skylake. 
+ #[cfg(target_arch = "x86_64")] + fn raw_cache_config(&self) -> Option<u64> { + if !is_genuine_intel() { + // Not tested on AMD or other x86_64 vendors yet + return None; + } + // Retired load instructions, by the cache level that served them + // (demand loads only; stores and prefetches don't count). + match self { + // MEM_INST_RETIRED.ALL_LOADS: 0xD0 | 0x81 << 8 + PerfEvent::L1DCache => Some(0x81d0), + // MEM_LOAD_RETIRED.L1_MISS: 0xD1 | 0x08 << 8 + PerfEvent::L2DCache => Some(0x08d1), + // MEM_LOAD_RETIRED.L3_MISS: 0xD1 | 0x20 << 8 + PerfEvent::CacheMisses => Some(0x20d1), + _ => None, + } + } + + /// Raw PMU encoding of this cache slot on arm64: the architected PMU event + /// number, used directly as `perf_event_attr.config` for `PERF_TYPE_RAW`. + /// + /// These are common (architected) event numbers, listed per mnemonic in + /// Arm's PMU event table for the Cortex-A72 fleet + /// (). + #[cfg(target_arch = "aarch64")] + fn raw_cache_config(&self) -> Option<u64> { + match self { + // L1D_CACHE (0x04): L1 data cache accesses, loads and stores. + PerfEvent::L1DCache => Some(0x04), + // L1D_CACHE_REFILL (0x03): L1D line fills. Defined against the same + // access population as L1D_CACHE — unlike L2D_CACHE, which also + // counts L1 write-backs, instruction-side refills and table + // walks, and counts lines where L1D_CACHE counts operations — + // so the `L1DCache - L2DCache` hit derivation stays sound. + PerfEvent::L2DCache => Some(0x03), + // L2D_CACHE_REFILL (0x17): refills of L2 or L1 from outside those + // caches. On the Cortex-A72 macro-runner fleet (a1.metal) there + // is no L3, so these are trips to DRAM. Includes instruction-side + // refills, so it can exceed L1D_CACHE_REFILL in icache-missing + // code; the derived hit counts saturate against that. 
+ PerfEvent::CacheMisses => Some(0x17), + _ => None, + } + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + fn raw_cache_config(&self) -> Option<u64> { + None + } +} + +#[cfg(target_arch = "x86_64")] +fn is_genuine_intel() -> bool { + use std::arch::x86_64::__cpuid; + // SAFETY: CPUID is always available on x86_64. Leaf 0: vendor string in EBX,EDX,ECX. + let leaf0 = unsafe { __cpuid(0) }; + let mut vendor = [0u8; 12]; + vendor[0..4].copy_from_slice(&leaf0.ebx.to_le_bytes()); + vendor[4..8].copy_from_slice(&leaf0.edx.to_le_bytes()); + vendor[8..12].copy_from_slice(&leaf0.ecx.to_le_bytes()); + &vendor == b"GenuineIntel" } impl std::fmt::Display for PerfEvent { @@ -35,3 +151,38 @@ impl std::fmt::Display for PerfEvent { write!(f, "{}", self.to_perf_string()) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn portable_slots_have_samply_specs() { + assert_eq!( + PerfEvent::CpuCycles.to_samply_spec().unwrap(), + "cpu-cycles:0:0x0" + ); + assert_eq!( + PerfEvent::Instructions.to_samply_spec().unwrap(), + "instructions:0:0x1" + ); + } + + #[test] + fn event_names_are_unique() { + let mut names: Vec<_> = PerfEvent::all_events() + .iter() + .map(|event| event.to_perf_string()) + .collect(); + names.sort(); + names.dedup(); + assert_eq!(names.len(), PerfEvent::all_events().len()); + } + + #[test] + fn print_specs_for_this_host() { + for event in PerfEvent::all_events() { + println!("{event:?} -> {:?}", event.to_samply_spec()); + } + } +} diff --git a/scripts/samply-dev.sh b/scripts/samply-dev.sh new file mode 100755 index 00000000..f99a9c72 --- /dev/null +++ b/scripts/samply-dev.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# Toggle "samply dev mode" for the runner. +# +# Dev mode redirects the runner's `samply`, `framehop`, and +# `linux-perf-event-reader` dependencies to local sibling checkouts by +# appending a `[patch]` block to the relevant `Cargo.toml` files. This lets you +# iterate on all three crates in place and have the runner pick the changes up +# immediately. 
+# +# Nothing is hidden from git: the appended blocks and the resulting `Cargo.lock` +# edits show up in `git status` like any other change. It is on you not to commit +# them — run `off` to remove the blocks when you're done. +# +# The block is delimited by sentinel comments so `off` can strip it cleanly: +# - <runner>/Cargo.toml patches samply + framehop + reader -> local +# - <samply-codspeed>/Cargo.toml patches framehop + reader -> local (so samply +# standalone builds also use them) +# +# Usage: +# scripts/samply-dev.sh on enable dev mode (append patch blocks) +# scripts/samply-dev.sh off disable dev mode (remove patch blocks) +set -eu + +# Resolve repo roots relative to this script, not the cwd. +RUNNER_ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd) +SAMPLY_ROOT=$(CDPATH= cd -- "$RUNNER_ROOT/../samply-codspeed" 2>/dev/null && pwd || true) + +SAMPLY_URL="https://github.com/CodSpeedHQ/samply-codspeed" +FRAMEHOP_URL="https://github.com/CodSpeedHQ/framehop" + +RUNNER_MANIFEST="$RUNNER_ROOT/Cargo.toml" +SAMPLY_MANIFEST="$SAMPLY_ROOT/Cargo.toml" + +BEGIN="# >>> samply-dev (do not commit) >>>" +END="# <<< samply-dev <<<" + +usage() { + echo "Usage: $0 {on|off}" >&2 + echo " on enable dev mode (append patch blocks)" >&2 + echo " off disable dev mode (remove patch blocks)" >&2 + exit 2 +} + +# Remove the sentinel-delimited block from a manifest, if present. +strip_block() { + manifest=$1 + [ -f "$manifest" ] || return 0 + sed "/^$BEGIN\$/,/^$END\$/d" "$manifest" > "$manifest.tmp" && mv "$manifest.tmp" "$manifest" +} + +# Append a sentinel-delimited block to a manifest, replacing any existing one. 
+append_block() { + manifest=$1 + body=$2 + strip_block "$manifest" + printf '%s\n%s%s\n' "$BEGIN" "$body" "$END" >> "$manifest" +} + +enable() { + if [ -z "$SAMPLY_ROOT" ]; then + echo "error: ../samply-codspeed not found next to the runner repo" >&2 + exit 1 + fi + + append_block "$RUNNER_MANIFEST" "\ +[patch.\"$SAMPLY_URL\"] +samply = { path = \"../samply-codspeed/samply\" } + +[patch.\"$FRAMEHOP_URL\"] +framehop = { path = \"../framehop\" } + +[patch.crates-io] +linux-perf-event-reader = { path = \"../linux-perf-event-reader\" } +" + + append_block "$SAMPLY_MANIFEST" "\ +[patch.\"$FRAMEHOP_URL\"] +framehop = { path = \"../framehop\" } + +[patch.crates-io] +linux-perf-event-reader = { path = \"../linux-perf-event-reader\" } +" + + echo "samply dev mode: ON" + echo " patched $RUNNER_MANIFEST" + echo " patched $SAMPLY_MANIFEST" +} + +disable() { + strip_block "$RUNNER_MANIFEST" + echo " cleaned $RUNNER_MANIFEST" + if [ -n "$SAMPLY_ROOT" ]; then + strip_block "$SAMPLY_MANIFEST" + echo " cleaned $SAMPLY_MANIFEST" + fi + echo "samply dev mode: OFF" +} + +[ $# -eq 1 ] || usage + +case "$1" in + on) enable ;; + off) disable ;; + *) usage ;; +esac diff --git a/src/executor/wall_time/profiler/samply/mod.rs b/src/executor/wall_time/profiler/samply/mod.rs index 34283cf0..c6e31897 100644 --- a/src/executor/wall_time/profiler/samply/mod.rs +++ b/src/executor/wall_time/profiler/samply/mod.rs @@ -46,10 +46,6 @@ impl SamplyProfiler { #[async_trait(?Send)] impl Profiler for SamplyProfiler { - fn requires_isolation(&self) -> bool { - false - } - async fn setup( &self, _system_info: &SystemInfo, @@ -117,6 +113,21 @@ impl Profiler for SamplyProfiler { ), ]); + // Extra hardware events to capture alongside the sampling event, + // stored by samply as per-sample delta columns in the profile, as + // `<name>:<type>:<config>` specs resolved for the CPU we run on. + // samply degrades gracefully to cycles-only sampling when the PMU + // can't deliver them, so this is safe to request unconditionally. 
+ // Linux only: the events go through perf_event_open. + #[cfg(target_os = "linux")] + cmd_builder.env( + "SAMPLY_PERF_EVENTS", + runner_shared::perf_event::PerfEvent::all_events() + .iter() + .filter_map(|event| event.to_samply_spec()) + .join(","), + ); + // If `setup` decided the bash on PATH is Apple-signed, prepend brew's // bin so samply's spawned shell resolves to the ad-hoc-signed brew bash // instead. Only the samply child's PATH is touched.