From d38f924274085e9705c920de852013b3b7d48d3e Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 29 May 2026 05:08:32 +0300 Subject: [PATCH 1/3] test(bench): add encode_loop_z000033 example for clean encoder profiles Mirrors decode_loop_z000033 for the encode side: reads a raw corpus, loops `compress_to_vec` at a given level N times, no criterion and no FFI. The `compare_ffi` compress benchmark runs the donor in the same process, so its flamegraph mixes `ZSTD_*` donor symbols with ours; this binary isolates the pure-Rust encoder hot path for perf-record. black_box + len-sum sink defeats dead-code elimination of the compress call. --- zstd/examples/encode_loop_z000033.rs | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 zstd/examples/encode_loop_z000033.rs diff --git a/zstd/examples/encode_loop_z000033.rs b/zstd/examples/encode_loop_z000033.rs new file mode 100644 index 00000000..8d1311ee --- /dev/null +++ b/zstd/examples/encode_loop_z000033.rs @@ -0,0 +1,54 @@ +//! Standalone encode-loop binary for clean perf-record profiles of the +//! ENCODER hot path. Reads a raw corpus, then loops `compress_to_vec` at +//! the given level for N iters. No criterion, no FFI side — the perf +//! samples land purely in our encoder (the `compare_ffi` compress bench +//! runs the donor in the same process, so its flamegraph mixes +//! `ZSTD_*` donor symbols with ours; this binary does not). +//! +//! Build: cargo build --profile flamegraph -p structured-zstd \ +//! --example encode_loop_z000033 --features dict_builder +//! Run: cargo flamegraph --example encode_loop_z000033 --features dict_builder \ +//! 
--profile flamegraph -- + +use std::env; + +use structured_zstd::encoding::{CompressionLevel, compress_to_vec}; + +fn main() { + let args: Vec<String> = env::args().collect(); + let level: i32 = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(-1); + let iters: u32 = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(2000); + let corpus_path: Option<&str> = args.get(3).map(|s| s.as_str()); + + let src: Vec<u8> = if let Some(path) = corpus_path { + std::fs::read(path).expect("read corpus file") + } else { + // Deterministic 1 MiB LCG synthetic fallback. + let n = 1_048_576usize; + let mut src = Vec::with_capacity(n); + let mut state: u64 = 0x517cc1b727220a95; + while src.len() < n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + src.push((state >> 56) as u8); + } + src + }; + + let mut sink: usize = 0; + for _ in 0..iters { + let out = compress_to_vec(src.as_slice(), CompressionLevel::Level(level)); + // Defeat dead-code elimination of the compress call. + sink = sink.wrapping_add(out.len()); + core::hint::black_box(&out); + } + + eprintln!( + "encoded {} bytes × {} iters at level {}; last-out-sum={}", + src.len(), + iters, + level, + sink + ); +} From 8412995e322ea92d10c86bc6f03ae49dc37bf99b Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 29 May 2026 06:41:51 +0300 Subject: [PATCH 2/3] test(bench): use compress_slice_to_vec in encode_loop to avoid per-iter copy The input is a contiguous `&[u8]`; `compress_to_vec` takes `impl Read` and re-buffers it via `read_to_end` into a fresh `Vec` every iteration, adding per-iter input allocation + memmove that pollutes the encoder flamegraph. `compress_slice_to_vec` consumes the slice directly, keeping the profile focused on the encode hot path. 
--- zstd/examples/encode_loop_z000033.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/zstd/examples/encode_loop_z000033.rs b/zstd/examples/encode_loop_z000033.rs index 8d1311ee..6321fc46 100644 --- a/zstd/examples/encode_loop_z000033.rs +++ b/zstd/examples/encode_loop_z000033.rs @@ -12,7 +12,7 @@ use std::env; -use structured_zstd::encoding::{CompressionLevel, compress_to_vec}; +use structured_zstd::encoding::{CompressionLevel, compress_slice_to_vec}; fn main() { let args: Vec<String> = env::args().collect(); @@ -38,7 +38,13 @@ fn main() { let mut sink: usize = 0; for _ in 0..iters { - let out = compress_to_vec(src.as_slice(), CompressionLevel::Level(level)); + // `compress_slice_to_vec` (NOT `compress_to_vec`): the input is + // already a contiguous `&[u8]`. `compress_to_vec` takes `impl + // Read` and re-buffers via `read_to_end` into a fresh `Vec` + // every iteration — that per-iter input allocation + copy would + // pollute the encoder flamegraph with `memmove` / alloc traffic + // that isn't part of the encode hot path. + let out = compress_slice_to_vec(src.as_slice(), CompressionLevel::Level(level)); // Defeat dead-code elimination of the compress call. sink = sink.wrapping_add(out.len()); core::hint::black_box(&out); From 58f3a75f8466c81dc7e3def9595ee164c8870555 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 29 May 2026 11:09:43 +0300 Subject: [PATCH 3/3] perf(bench): reuse encode_loop output buffer across iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocate the output Vec once and clear()-reuse it every iteration so steady-state iters do zero output-buffer allocation — keeps the flamegraph on the encoder hot path instead of per-iter Vec growth + first-touch page faults. Drive FrameCompressor directly over the contiguous slice. Sync the module doc to the actual API used. 
--- zstd/examples/encode_loop_z000033.rs | 49 ++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/zstd/examples/encode_loop_z000033.rs b/zstd/examples/encode_loop_z000033.rs index 6321fc46..7c8a8752 100644 --- a/zstd/examples/encode_loop_z000033.rs +++ b/zstd/examples/encode_loop_z000033.rs @@ -1,9 +1,18 @@ //! Standalone encode-loop binary for clean perf-record profiles of the -//! ENCODER hot path. Reads a raw corpus, then loops `compress_to_vec` at -//! the given level for N iters. No criterion, no FFI side — the perf -//! samples land purely in our encoder (the `compare_ffi` compress bench -//! runs the donor in the same process, so its flamegraph mixes -//! `ZSTD_*` donor symbols with ours; this binary does not). +//! ENCODER hot path. Reads a raw corpus, then loops a `FrameCompressor` +//! over a contiguous `&[u8]` source at the given level for N iters. No +//! criterion, no FFI side — the perf samples land purely in our encoder +//! (the `compare_ffi` compress bench runs the donor in the same process, +//! so its flamegraph mixes `ZSTD_*` donor symbols with ours; this binary +//! does not). +//! +//! The output buffer is allocated ONCE and `clear()`-reused every +//! iteration, so steady-state iters do zero output-buffer allocation — +//! the flamegraph stays on the encoder hot path instead of per-iter +//! `Vec` growth + first-touch page faults. A fresh `FrameCompressor` per +//! iter mirrors a real per-frame encode (there is no compressor-reset +//! API; the matcher-table init is inherent encode cost, unlike the +//! pure-noise output realloc). //! //! Build: cargo build --profile flamegraph -p structured-zstd \ //! 
--example encode_loop_z000033 --features dict_builder @@ -12,7 +21,7 @@ use std::env; -use structured_zstd::encoding::{CompressionLevel, compress_slice_to_vec}; +use structured_zstd::encoding::{CompressionLevel, FrameCompressor}; fn main() { let args: Vec<String> = env::args().collect(); @@ -36,15 +45,29 @@ fn main() { src }; + // Output buffer reused across iterations. Generous capacity + // (src + 1/8 + 4 KiB) exceeds any frame's compressed size — even the + // incompressible worst case (raw blocks + frame/block headers stays + // well under src * 1.125) — so no iteration ever reallocates. We + // can't call the crate-internal `compress_bound` from an example, so + // this closed-form bound stands in for it. + let cap = src.len() + (src.len() >> 3) + 4096; + let mut out: Vec<u8> = Vec::with_capacity(cap); + let mut sink: usize = 0; for _ in 0..iters { - // `compress_slice_to_vec` (NOT `compress_to_vec`): the input is - // already a contiguous `&[u8]`. `compress_to_vec` takes `impl - // Read` and re-buffers via `read_to_end` into a fresh `Vec` - // every iteration — that per-iter input allocation + copy would - // pollute the encoder flamegraph with `memmove` / alloc traffic - // that isn't part of the encode hot path. - let out = compress_slice_to_vec(src.as_slice(), CompressionLevel::Level(level)); + // Reuse the buffer: `clear()` resets len to 0 but keeps the + // capacity, so the drain writes into already-faulted-in pages. + // Drive the low-level `FrameCompressor` directly (the input is + // already a contiguous `&[u8]`) instead of `compress_to_vec`, + // which takes `impl Read` and re-buffers via `read_to_end` into + // a fresh `Vec` every iteration. + out.clear(); + let mut frame_enc = FrameCompressor::new(CompressionLevel::Level(level)); + frame_enc.set_source_size_hint(src.len() as u64); + frame_enc.set_source(src.as_slice()); + frame_enc.set_drain(&mut out); + frame_enc.compress(); // Defeat dead-code elimination of the compress call. 
sink = sink.wrapping_add(out.len()); core::hint::black_box(&out);