From add7ea6a68df3985266b5f22c413985bb180d263 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Thu, 28 May 2026 23:39:37 +0300 Subject: [PATCH 1/2] perf(decode): unroll HUF 4-stream burst inner loop via const-generic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HUF literal burst inner loop used a runtime `for _ in 0..symbols_per_burst` trip count, leaving an induction variable + per-iteration check that LLVM could not eliminate. Donor's `HUF_decompress4X1_usingDTable_internal_fast_c_loop` hardcodes `for symbol in 0..5`, fully unrolling the 5×4 decode into straight-line code. Pure-decode profiling (decode_loop, i9, z000033 L2 dfast) showed the burst body (`decompress_literals_avx2`) at 27.6% self-time vs donor's ~4-5% for the entire HUF literal decode — on literal-heavy weak-compression frames (the worst dashboard r/f points: 0.12-0.29) this is the single largest decode gap. Extract the inner symbol loop into `burst_decode_symbols::<SPB>`, monomorphised on the compile-time symbol count, and dispatch on the loop-invariant `symbols_per_burst`. SPB=5 covers max_num_bits ∈ {10,11} (large-alphabet literal-heavy — the dominant cost), 6 covers {8,9}, 7 covers {7}. Rarer small-max tables (few symbols, cheap overall) keep the dynamic loop. The match is loop-invariant so it unswitches out of the `while`; each arm gets a fully-unrolled body. 653/653 nextest pass. 
--- zstd/src/decoding/literals_section_decoder.rs | 123 ++++++++++++++---- 1 file changed, 99 insertions(+), 24 deletions(-) diff --git a/zstd/src/decoding/literals_section_decoder.rs b/zstd/src/decoding/literals_section_decoder.rs index fbc2b1f5..e90287fb 100644 --- a/zstd/src/decoding/literals_section_decoder.rs +++ b/zstd/src/decoding/literals_section_decoder.rs @@ -508,6 +508,57 @@ struct LoopBounds { alloc_upper_bound: usize, } +/// One burst iteration's inner symbol loop, monomorphised on the +/// compile-time symbol count `SPB` so LLVM fully unrolls the +/// `SPB × 4` decode steps into straight-line code — matching donor's +/// `for symbol in 0..5` hardcoded unroll in +/// `HUF_decompress4X1_usingDTable_internal_fast_c_loop`. A runtime +/// `for _ in 0..symbols_per_burst` bound leaves an induction variable +/// and per-iteration trip check that the unrolled form eliminates; +/// on the literal-heavy decode path this inner loop is the dominant +/// decode cost, so the unroll is the single biggest burst win. +/// +/// # Safety +/// Same preconditions as the caller's burst body: every `idx` is +/// `< packed.len()` (table-shift bounded), and every written +/// `cursors[s]` is `< alloc_upper_bound` (caller's lockstep gate + +/// `debug_assert!`). `target_ptr` backs an allocation covering those +/// indices. 
+#[inline(always)] +unsafe fn burst_decode_symbols<const SPB: usize>( + bits: &mut [u64; 4], + cursors: &mut [usize; 4], + target_ptr: *mut u8, + packed: &[u16], + table_shift: u32, +) { + for _ in 0..SPB { + let idx0 = (bits[0] >> table_shift) as usize; + let entry0 = unsafe { *packed.get_unchecked(idx0) }; + unsafe { target_ptr.add(cursors[0]).write((entry0 & 0xFF) as u8) }; + cursors[0] += 1; + bits[0] <<= (entry0 >> 8) & 0xFF; + + let idx1 = (bits[1] >> table_shift) as usize; + let entry1 = unsafe { *packed.get_unchecked(idx1) }; + unsafe { target_ptr.add(cursors[1]).write((entry1 & 0xFF) as u8) }; + cursors[1] += 1; + bits[1] <<= (entry1 >> 8) & 0xFF; + + let idx2 = (bits[2] >> table_shift) as usize; + let entry2 = unsafe { *packed.get_unchecked(idx2) }; + unsafe { target_ptr.add(cursors[2]).write((entry2 & 0xFF) as u8) }; + cursors[2] += 1; + bits[2] <<= (entry2 >> 8) & 0xFF; + + let idx3 = (bits[3] >> table_shift) as usize; + let entry3 = unsafe { *packed.get_unchecked(idx3) }; + unsafe { target_ptr.add(cursors[3]).write((entry3 & 0xFF) as u8) }; + cursors[3] += 1; + bits[3] <<= (entry3 >> 8) & 0xFF; + } +} + /// Donor-parity 4-stream HUF decode burst loop. Single code path — /// no kernel dispatch, no SIMD-fallback hybrid. Mirrors /// `huf_decompress.c:HUF_decompress4X1_usingDTable_internal_fast_c_loop`: @@ -691,30 +742,54 @@ unsafe fn run_4stream_burst_loop( // `.write()` (raw store) is used instead of `&mut [u8]` // indexing so no Rust reference is ever formed to the // uninitialised tail before its byte is written. 
- for _ in 0..symbols_per_burst { - let idx0 = (bits[0] >> table_shift) as usize; - let entry0 = unsafe { *packed.get_unchecked(idx0) }; - unsafe { target_ptr.add(cursors[0]).write((entry0 & 0xFF) as u8) }; - cursors[0] += 1; - bits[0] <<= (entry0 >> 8) & 0xFF; - - let idx1 = (bits[1] >> table_shift) as usize; - let entry1 = unsafe { *packed.get_unchecked(idx1) }; - unsafe { target_ptr.add(cursors[1]).write((entry1 & 0xFF) as u8) }; - cursors[1] += 1; - bits[1] <<= (entry1 >> 8) & 0xFF; - - let idx2 = (bits[2] >> table_shift) as usize; - let entry2 = unsafe { *packed.get_unchecked(idx2) }; - unsafe { target_ptr.add(cursors[2]).write((entry2 & 0xFF) as u8) }; - cursors[2] += 1; - bits[2] <<= (entry2 >> 8) & 0xFF; - - let idx3 = (bits[3] >> table_shift) as usize; - let entry3 = unsafe { *packed.get_unchecked(idx3) }; - unsafe { target_ptr.add(cursors[3]).write((entry3 & 0xFF) as u8) }; - cursors[3] += 1; - bits[3] <<= (entry3 >> 8) & 0xFF; + // + // Dispatch the inner loop on the compile-time symbol count so + // the dominant cases get a fully-unrolled body (donor's + // hardcoded `for symbol in 0..5`). `symbols_per_burst` is + // loop-invariant, so the match is hoisted out of the `while` + // by loop-unswitching; each arm monomorphises + // `burst_decode_symbols::<SPB>` to straight-line code. SPB=5 + // covers `max_num_bits ∈ {10, 11}` — the large-alphabet + // literal-heavy case that dominates decode cost; 6 covers + // {8, 9}, 7 covers {7}. Rarer small-max tables (few symbols, + // cheap overall) fall through to the dynamic loop. 
+ match symbols_per_burst { + 5 => unsafe { + burst_decode_symbols::<5>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + }, + 6 => unsafe { + burst_decode_symbols::<6>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + }, + 7 => unsafe { + burst_decode_symbols::<7>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + }, + _ => { + for _ in 0..symbols_per_burst { + let idx0 = (bits[0] >> table_shift) as usize; + let entry0 = unsafe { *packed.get_unchecked(idx0) }; + unsafe { target_ptr.add(cursors[0]).write((entry0 & 0xFF) as u8) }; + cursors[0] += 1; + bits[0] <<= (entry0 >> 8) & 0xFF; + + let idx1 = (bits[1] >> table_shift) as usize; + let entry1 = unsafe { *packed.get_unchecked(idx1) }; + unsafe { target_ptr.add(cursors[1]).write((entry1 & 0xFF) as u8) }; + cursors[1] += 1; + bits[1] <<= (entry1 >> 8) & 0xFF; + + let idx2 = (bits[2] >> table_shift) as usize; + let entry2 = unsafe { *packed.get_unchecked(idx2) }; + unsafe { target_ptr.add(cursors[2]).write((entry2 & 0xFF) as u8) }; + cursors[2] += 1; + bits[2] <<= (entry2 >> 8) & 0xFF; + + let idx3 = (bits[3] >> table_shift) as usize; + let entry3 = unsafe { *packed.get_unchecked(idx3) }; + unsafe { target_ptr.add(cursors[3]).write((entry3 & 0xFF) as u8) }; + cursors[3] += 1; + bits[3] <<= (entry3 >> 8) & 0xFF; + } + } } // Reload all 4 streams (donor `HUF_4X1_RELOAD_STREAM`). 
From 55b70f103eec0a827a8360e8aef9427bb4b494f6 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 29 May 2026 10:38:48 +0300 Subject: [PATCH 2/2] style: rustfmt burst_decode_symbols call sites to multiline --- zstd/src/decoding/literals_section_decoder.rs | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/zstd/src/decoding/literals_section_decoder.rs b/zstd/src/decoding/literals_section_decoder.rs index e90287fb..714009f1 100644 --- a/zstd/src/decoding/literals_section_decoder.rs +++ b/zstd/src/decoding/literals_section_decoder.rs @@ -755,13 +755,31 @@ unsafe fn run_4stream_burst_loop( // cheap overall) fall through to the dynamic loop. match symbols_per_burst { 5 => unsafe { - burst_decode_symbols::<5>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + burst_decode_symbols::<5>( + &mut bits, + &mut *cursors, + target_ptr, + packed, + table_shift, + ); }, 6 => unsafe { - burst_decode_symbols::<6>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + burst_decode_symbols::<6>( + &mut bits, + &mut *cursors, + target_ptr, + packed, + table_shift, + ); }, 7 => unsafe { - burst_decode_symbols::<7>(&mut bits, &mut *cursors, target_ptr, packed, table_shift); + burst_decode_symbols::<7>( + &mut bits, + &mut *cursors, + target_ptr, + packed, + table_shift, + ); }, _ => { for _ in 0..symbols_per_burst {