diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index 857e853d2..d7f8bfe8a 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -55,6 +55,7 @@ namespace snmalloc LogRange<3>, GlobalRange, CommitRange, + DecayRange, StatsRange>; // Controls the padding around the meta-data range. diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 78609ed2d..675cee516 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -37,8 +37,14 @@ namespace snmalloc LogRange<2>, GlobalRange>; - // Track stats of the committed memory - using Stats = Pipe, StatsRange>; + // Decay range caches deallocated memory and gradually releases it + // back to the parent, avoiding expensive repeated decommit/recommit + // cycles for transient allocation patterns. + using DecayR = Pipe, DecayRange>; + + // Track stats of the memory handed out (outside decay so stats + // methods are directly visible to StatsCombiner). 
+ using Stats = Pipe; private: static constexpr size_t page_size_bits = diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index ee339337b..8b643ca6b 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -5,6 +5,7 @@ #include "buddy.h" #include "commitrange.h" #include "commonconfig.h" +#include "decayrange.h" #include "defaultpagemapentry.h" #include "empty_range.h" #include "globalrange.h" diff --git a/src/snmalloc/backend_helpers/decayrange.h b/src/snmalloc/backend_helpers/decayrange.h new file mode 100644 index 000000000..99b9e22cb --- /dev/null +++ b/src/snmalloc/backend_helpers/decayrange.h @@ -0,0 +1,418 @@ +#pragma once + +#include "../ds/ds.h" +#include "../mem/mem.h" +#include "empty_range.h" +#include "largebuddyrange.h" +#include "range_helpers.h" + +namespace snmalloc +{ + /** + * Intrusive singly-linked list using pagemap entries for storage. + * + * This uses BuddyChunkRep's pagemap entry access (direction=false, i.e. + * Word::Two) to store the "next" pointer for each node. 
+   */
+  template
+  class DecayList
+  {
+    /** Accessor used to read and write the per-node link field. */
+    using Rep = BuddyChunkRep;
+
+    /** Address of the first node; 0 denotes the empty list. */
+    uintptr_t head = 0;
+
+    /** Wrap a raw node address as a list; only used internally. */
+    DecayList(uintptr_t head) : head(head) {}
+
+  public:
+    /** Construct the empty list (head == 0). */
+    constexpr DecayList() = default;
+
+    /** Returns true when the list contains no nodes. */
+    [[nodiscard]] bool is_empty() const
+    {
+      return head == 0;
+    }
+
+    /**
+     * Returns the list that starts at the successor of the head node.
+     *
+     * The successor is read from the head's link field via
+     * Rep::ref(false, head) and Rep::get. Must not be called on an empty
+     * list (asserted).
+     */
+    DecayList get_next()
+    {
+      SNMALLOC_ASSERT(!is_empty());
+      auto next_field = Rep::ref(false, head);
+      auto next = Rep::get(next_field);
+      return {next};
+    }
+
+    /**
+     * Returns the head address reinterpreted as a pointer and wrapped with
+     * capptr::Arena::unsafe_from. No emptiness check is performed here.
+     */
+    capptr::Arena get_capability()
+    {
+      return capptr::Arena::unsafe_from(reinterpret_cast(head));
+    }
+
+    /**
+     * Returns a new list with new_head_cap prepended to this one.
+     *
+     * The current head is written into the new node's link field; this
+     * list object itself is left unchanged.
+     */
+    DecayList cons(capptr::Arena new_head_cap)
+    {
+      auto new_head = new_head_cap.unsafe_uintptr();
+      auto field = Rep::ref(false, new_head);
+      Rep::set(field, head);
+      return {new_head};
+    }
+
+    /**
+     * Calls f with the capability of every node, from head to tail.
+     *
+     * Iteration works on a copy of this list, and the successor of each
+     * node is fetched before f is invoked, so the traversal never re-reads
+     * a node after it has been handed to f.
+     */
+    template
+    void forall(F f)
+    {
+      auto curr = *this;
+      while (!curr.is_empty())
+      {
+        auto next = curr.get_next();
+        f(curr.get_capability());
+        curr = next;
+      }
+    }
+  };
+
+  /**
+   * Concurrent stack for caching deallocated ranges.
+   *
+   * Supports the following concurrency pattern:
+   *   (push|pop)* || pop_all* || ... || pop_all*
+   *
+   * That is, a single thread can do push and pop, and other threads
+   * can do pop_all. pop_all returns all of the stack if it doesn't
+   * race, or empty if it does.
+   *
+   * The primary use case is single-threaded access, where other threads
+   * can attempt to drain all values (via the timer callback).
+   */
+  template
+  class DecayStack
+  {
+    /** The empty list; stored in `stack` whenever the contents are detached. */
+    static constexpr auto empty = DecayList{};
+
+    /** Current contents of the stack, aligned to CACHELINE_SIZE. */
+    alignas(CACHELINE_SIZE) stl::Atomic> stack{};
+
+    /**
+     * Detach and return the whole list, leaving the stack empty.
+     *
+     * A relaxed load is tried first so that a stack already observed as
+     * empty is returned as `empty` without performing the exchange.
+     */
+    DecayList take()
+    {
+      if (stack.load(stl::memory_order_relaxed).is_empty())
+        return empty;
+      return stack.exchange(empty, stl::memory_order_acquire);
+    }
+
+    /**
+     * Store new_head as the stack contents with release ordering.
+     *
+     * Asserts that the stack is empty at this point, i.e. this is meant
+     * to follow a take() by the same caller.
+     */
+    void replace(DecayList new_head)
+    {
+      SNMALLOC_ASSERT(stack.load().is_empty());
+      stack.store(new_head, stl::memory_order_release);
+    }
+
+  public:
+    /** Construct an empty stack. */
+    constexpr DecayStack() = default;
+
+    /**
+     * Push new_head_cap onto the stack.
+     *
+     * The whole list is detached, the new node is prepended, and the
+     * result is stored back. Between those two steps the atomic holds the
+     * empty list.
+     */
+    void push(capptr::Arena new_head_cap)
+    {
+      auto old_head = take();
+      auto new_head = old_head.cons(new_head_cap);
+      replace(new_head);
+    }
+
+    /**
+     * Pop the most recently pushed entry.
+     *
+     * Returns nullptr when take() yields an empty list. Otherwise the
+     * remainder of the list is stored back and the old head is returned.
+     */
+    capptr::Arena pop()
+    {
+      auto old_head = take();
+      if (old_head.is_empty())
+        return nullptr;
+
+      auto next = old_head.get_next();
+      replace(next);
+
+      return old_head.get_capability();
+    }
+
+    /**
+     * Detach and return everything currently on the stack; the result is
+     * the empty list when take() finds nothing to detach.
+     */
+    DecayList pop_all()
+    {
+      return take();
+    }
+  };
+
+  /**
+   * A range that provides temporal caching of deallocated ranges.
+   *
+   * Instead of immediately releasing deallocated memory back to the parent
+   * range (which would decommit it), this range caches it locally and
+   * uses PAL timers to gradually release it. This avoids expensive
+   * repeated decommit/recommit cycles for transient allocation patterns
+   * (e.g. repeatedly allocating and deallocating ~800KB objects).
+   *
+   * The range uses an epoch-based rotation scheme:
+   * - Deallocated ranges are placed in the current epoch's stack
+   * - A timer periodically advances the epoch
+   * - When the epoch advances, the oldest epoch's entries are flushed
+   *   to the parent range
+   *
+   * The parent range MUST be ConcurrencySafe, as the timer callback may
+   * flush entries from a different thread context.
+ * + * PAL - Platform abstraction layer (for timer support) + * Pagemap - Used for storing linked list nodes in pagemap entries + */ + template + struct DecayRange + { + template> + class Type : public ContainsParent + { + using ContainsParent::parent; + + public: + static constexpr bool Aligned = ParentRange::Aligned; + + static constexpr bool ConcurrencySafe = false; + + using ChunkBounds = typename ParentRange::ChunkBounds; + + private: + /** + * Maximum chunk size bits we cache (4 MiB = 2^22). + */ + static constexpr size_t MAX_CACHEABLE_BITS = 22; + + /** + * Maximum chunk size we cache (4 MiB). + * Larger allocations bypass the cache and go directly to/from parent. + */ + static constexpr size_t MAX_CACHEABLE_SIZE = + bits::one_at_bit(MAX_CACHEABLE_BITS); + + /** + * How many slab sizes that can be cached. + * Only covers sizes from MIN_CHUNK_SIZE up to MAX_CACHEABLE_SIZE. + */ + static constexpr size_t NUM_SLAB_SIZES = + MAX_CACHEABLE_BITS - MIN_CHUNK_BITS + 1; + + /** + * Number of epoch slots for cached ranges. + * + * Ranges not used within (NUM_EPOCHS - 1) timer periods will be + * released to the parent. E.g., with period=500ms and NUM_EPOCHS=4, + * memory not reused within 1500-2000ms will be released. + * + * Must be a power of 2. + */ + static constexpr size_t NUM_EPOCHS = 4; + static_assert(bits::is_pow2(NUM_EPOCHS), "NUM_EPOCHS must be power of 2"); + + /** + * Per-sizeclass, per-epoch stacks of cached ranges. + */ + ModArray>> + chunk_stack; + + /** + * Current epoch index. + */ + static inline stl::Atomic epoch{0}; + + /** + * Flag to ensure one-shot timer registration with the PAL. + */ + static inline stl::AtomicBool registered_timer{false}; + + /** + * Flag indicating this instance has been registered in the global list. + */ + stl::AtomicBool registered_local{false}; + + /** + * Global list of all activated DecayRange instances. + * Used by the timer to iterate and flush old entries. 
+ */ + static inline stl::Atomic all_local{nullptr}; + + /** + * Next pointer for the global intrusive list. + */ + Type* all_local_next{nullptr}; + + /** + * Flush the oldest epoch's entries across all instances + * and advance the epoch. + */ + static void handle_decay_tick() + { + static_assert( + ParentRange::ConcurrencySafe, + "Parent range must be concurrency safe, as dealloc_range is called " + "from the timer callback on a potentially different thread."); + + auto new_epoch = + (epoch.load(stl::memory_order_relaxed) + 1) % NUM_EPOCHS; + + // Flush the epoch that is about to become current + // across all registered instances. + auto curr = all_local.load(stl::memory_order_acquire); + while (curr != nullptr) + { + for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++) + { + auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all(); + + old_stack.forall([curr, sc](auto cap) { + size_t size = MIN_CHUNK_SIZE << sc; +#ifdef SNMALLOC_TRACING + message<1024>( + "DecayRange::tick flushing {} size {} to parent", + cap.unsafe_ptr(), + size); +#endif + curr->parent.dealloc_range(cap, size); + }); + } + curr = curr->all_local_next; + } + + // Advance the epoch + epoch.store(new_epoch, stl::memory_order_release); + } + + /** + * Timer callback object for periodic decay. + */ + class DecayMemoryTimerObject : public PalTimerObject + { + static void process(PalTimerObject*) + { +#ifdef SNMALLOC_TRACING + message<1024>("DecayRange::handle_decay_tick timer"); +#endif + handle_decay_tick(); + } + + /// Timer fires every 500ms. + static constexpr size_t PERIOD = 500; + + public: + constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {} + }; + + static inline DecayMemoryTimerObject timer_object; + + void ensure_registered() + { + // Register the global timer if this is the first instance. 
+        if (
+          !registered_timer.load(stl::memory_order_relaxed) &&
+          !registered_timer.exchange(true, stl::memory_order_acq_rel))
+        {
+          // Reached only by the caller whose exchange changed the flag
+          // from false to true.
+          PAL::register_timer(&timer_object);
+        }
+
+        // Register this instance in the global list.
+        if (
+          !registered_local.load(stl::memory_order_relaxed) &&
+          !registered_local.exchange(true, stl::memory_order_acq_rel))
+        {
+          // Prepend this instance to all_local. On a failed
+          // compare-exchange `head` is refreshed, so all_local_next is
+          // rewritten before the next attempt.
+          auto* head = all_local.load(stl::memory_order_relaxed);
+          do
+          {
+            all_local_next = head;
+          } while (!all_local.compare_exchange_weak(
+            head, this, stl::memory_order_release, stl::memory_order_relaxed));
+        }
+      }
+
+    public:
+      constexpr Type() = default;
+
+      /**
+       * Allocate a range of `size` bytes, preferring locally cached ranges.
+       *
+       * `size` is asserted to be a power of two and at least MIN_CHUNK_SIZE.
+       *
+       * Order of attempts:
+       *  1. Requests whose size class index is NUM_SLAB_SIZES or more are
+       *     forwarded straight to the parent.
+       *  2. Where the `pal_supports` check holds, each epoch slot of this
+       *     size class is popped, starting from the current epoch and
+       *     working backwards; the first hit is returned.
+       *  3. Otherwise the parent is asked. Every time it returns nullptr,
+       *     handle_decay_tick() is called to flush one epoch of cached
+       *     ranges to the parent before retrying, for up to NUM_EPOCHS
+       *     rounds, followed by one last request to the parent.
+       *
+       * Note that handle_decay_tick() is static: it walks all_local and
+       * advances the shared epoch, so the out-of-memory path flushes every
+       * registered instance, not just this one.
+       */
+      CapPtr alloc_range(size_t size)
+      {
+        SNMALLOC_ASSERT(bits::is_pow2(size));
+        SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
+
+        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
+
+        // Bypass cache for sizes beyond what we track.
+        if (slab_sizeclass >= NUM_SLAB_SIZES)
+          return parent.alloc_range(size);
+
+        if constexpr (pal_supports)
+        {
+          // Try local cache across all epochs, starting from current.
+          auto current_epoch = epoch.load(stl::memory_order_relaxed);
+          for (size_t e = 0; e < NUM_EPOCHS; e++)
+          {
+            // NOTE(review): (current_epoch - e) may wrap below zero; this
+            // presumably relies on unsigned arithmetic and NUM_EPOCHS
+            // being a power of two so the slot index stays correct.
+            auto p =
+              chunk_stack[slab_sizeclass][(current_epoch - e) % NUM_EPOCHS]
+                .pop();
+
+            if (p != nullptr)
+            {
+#ifdef SNMALLOC_TRACING
+              message<1024>(
+                "DecayRange::alloc_range returning {} from local cache",
+                p.unsafe_ptr());
+#endif
+              return p;
+            }
+          }
+        }
+
+        // Try parent. If OOM, flush decay caches and retry.
+        CapPtr result;
+        for (size_t i = NUM_EPOCHS; i > 0; i--)
+        {
+          result = parent.alloc_range(size);
+          if (result != nullptr)
+          {
+#ifdef SNMALLOC_TRACING
+            message<1024>(
+              "DecayRange::alloc_range returning {} from parent",
+              result.unsafe_ptr());
+#endif
+            return result;
+          }
+
+          // OOM: force-flush decay caches to free memory.
+#ifdef SNMALLOC_TRACING
+          message<1024>("DecayRange::alloc_range OOM, flushing decay caches");
+#endif
+          handle_decay_tick();
+        }
+
+        // Final attempt after flushing all epochs.
+ result = parent.alloc_range(size); +#ifdef SNMALLOC_TRACING + message<1024>( + "DecayRange::alloc_range final attempt: {}", result.unsafe_ptr()); +#endif + return result; + } + + void dealloc_range(CapPtr base, size_t size) + { + SNMALLOC_ASSERT(bits::is_pow2(size)); + SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); + + auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS; + + // Bypass cache for sizes beyond what we track. + if (slab_sizeclass >= NUM_SLAB_SIZES) + { + parent.dealloc_range(base, size); + return; + } + + if constexpr (pal_supports) + { + ensure_registered(); + +#ifdef SNMALLOC_TRACING + message<1024>( + "DecayRange::dealloc_range caching {} size {}", + base.unsafe_ptr(), + size); +#endif + auto current_epoch = epoch.load(stl::memory_order_relaxed); + chunk_stack[slab_sizeclass][current_epoch].push(base); + } + else + { + // No timer support: pass through directly. + parent.dealloc_range(base, size); + } + } + }; + }; +} // namespace snmalloc diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 5ec7bf1f3..d60908659 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -3,6 +3,7 @@ #include "../ds/ds.h" #include "check_init.h" #include "freelist.h" +#include "largecache.h" #include "metadata.h" #include "pool.h" #include "remotecache.h" @@ -181,6 +182,13 @@ namespace snmalloc */ Ticker ticker; + /** + * Cache for large object allocations. + * Avoids pagemap manipulation and backend buddy tree operations + * for recently freed large allocations. + */ + LargeObjectCache large_object_cache; + /** * The message queue needs to be accessible from other threads * @@ -695,14 +703,79 @@ namespace snmalloc return Conts::success(result, size, true); } + auto chunk_size = large_size_to_chunk_size(size); + auto sizeclass = size_to_sizeclass_full(size); + + // Check the frontend large object cache first. + // This avoids all pagemap and backend manipulation. 
+ auto* cached_meta = self->large_object_cache.try_alloc( + chunk_size, [self](BackendSlabMetadata* fmeta) { + self->flush_large_cache_entry(fmeta); + }); + if (cached_meta != nullptr) + { + // Cache hit: pagemap still valid, recover address from meta. + auto slab_addr = + cached_meta->get_slab_interior(freelist::Object::key_root); + cached_meta->initialise_large( + slab_addr, freelist::Object::key_root); + self->laden.insert(cached_meta); + + // Reconstruct the capptr from the address. + auto p = Config::Backend::capptr_rederive_alloc( + capptr::Alloc::unsafe_from( + reinterpret_cast(slab_addr)), + chunk_size); + return Conts::success(capptr_reveal(p), size); + } + + // Cache miss: go to backend. // Grab slab of correct size // Set remote as large allocator remote. auto [chunk, meta] = Config::Backend::alloc_chunk( self->get_backend_local_state(), - large_size_to_chunk_size(size), - PagemapEntry::encode( - self->public_state(), size_to_sizeclass_full(size)), - size_to_sizeclass_full(size)); + chunk_size, + PagemapEntry::encode(self->public_state(), sizeclass), + sizeclass); + + // If backend OOM, try staged cache flush and retry. + // First flush smaller sizes — they coalesce upward in the + // buddy. If that's not enough, flush one larger entry — + // the buddy can split it. + if (meta == nullptr) + { + auto flush_fn = [self](BackendSlabMetadata* fmeta) { + self->flush_large_cache_entry(fmeta); + }; + + // Stage 1: flush all smaller sizeclasses. + if (self->large_object_cache.flush_smaller( + chunk_size, flush_fn)) + { + auto retry = Config::Backend::alloc_chunk( + self->get_backend_local_state(), + chunk_size, + PagemapEntry::encode(self->public_state(), sizeclass), + sizeclass); + chunk = retry.first; + meta = retry.second; + } + + // Stage 2: flush a single larger-or-equal entry. 
+ if ( + meta == nullptr && + self->large_object_cache.flush_one_larger( + chunk_size, flush_fn)) + { + auto retry = Config::Backend::alloc_chunk( + self->get_backend_local_state(), + chunk_size, + PagemapEntry::encode(self->public_state(), sizeclass), + sizeclass); + chunk = retry.first; + meta = retry.second; + } + } #ifdef SNMALLOC_TRACING message<1024>( @@ -1086,6 +1159,7 @@ namespace snmalloc const PagemapEntry& entry, BackendSlabMetadata* meta) noexcept { + UNUSED(p); // TODO: Handle message queue on this path? if (meta->is_large()) @@ -1100,15 +1174,21 @@ namespace snmalloc #ifdef SNMALLOC_TRACING message<1024>("Large deallocation: {}", size); -#else - UNUSED(size); #endif // Remove from set of fully used slabs. meta->node.remove(); - Config::Backend::dealloc_chunk( - get_backend_local_state(), *meta, p, size, entry.get_sizeclass()); + // Cache in the frontend large object cache. + // The meta's free_queue already holds the chunk address (from + // initialise_large), and the pagemap entry retains the sizeclass + // and remote allocator info. No data is stored in the freed object. + // Epoch sync happens internally; stale entries are flushed via the + // callback. + large_object_cache.cache( + meta, size, [this](BackendSlabMetadata* fmeta) { + flush_large_cache_entry(fmeta); + }); return; } @@ -1117,6 +1197,24 @@ namespace snmalloc dealloc_local_object_meta(entry, meta); } + /** + * Flush a single cached large object back to the backend. + * Recovers the chunk address from the metadata and size from the pagemap. 
+ */ + void flush_large_cache_entry(BackendSlabMetadata* meta) + { + auto slab_addr = meta->get_slab_interior(freelist::Object::key_root); + const PagemapEntry& entry = Config::Backend::get_metaentry(slab_addr); + size_t entry_sizeclass = entry.get_sizeclass().as_large(); + size_t size = bits::one_at_bit(entry_sizeclass); + + auto p = + capptr::Alloc::unsafe_from(reinterpret_cast(slab_addr)); + + Config::Backend::dealloc_chunk( + get_backend_local_state(), *meta, p, size, entry.get_sizeclass()); + } + /** * Very slow path for object deallocation. * @@ -1427,6 +1525,10 @@ namespace snmalloc dealloc_local_slabs(sizeclass); } + // Flush the large object cache back to the backend. + large_object_cache.flush_all( + [this](BackendSlabMetadata* fmeta) { flush_large_cache_entry(fmeta); }); + if constexpr (mitigations(freelist_teardown_validate)) { laden.iterate( diff --git a/src/snmalloc/mem/largecache.h b/src/snmalloc/mem/largecache.h new file mode 100644 index 000000000..bcb095c44 --- /dev/null +++ b/src/snmalloc/mem/largecache.h @@ -0,0 +1,417 @@ +#pragma once + +#include "../ds/ds.h" +#include "../pal/pal_ds.h" +#include "metadata.h" +#include "sizeclasstable.h" + +namespace snmalloc +{ + /** + * Frontend cache for large object allocations. + * + * This cache sits in the per-thread Allocator and intercepts large + * alloc/dealloc before they reach the backend. By caching recently freed + * large objects, we avoid: + * - Pagemap writes on dealloc (clearing N entries) and alloc (setting N + * entries) + * - Metadata allocation/deallocation + * - Buddy allocator tree operations + * - Decommit/recommit syscalls (if DecayRange is also in the pipeline) + * + * The cache uses the slab metadata's SeqSet node to link cached entries, + * storing no data inside the freed object itself. The chunk address is + * recovered from the metadata's free_queue, and the chunk size from the + * pagemap entry's sizeclass. 
+ * + * Epoch rotation is driven by a PAL timer (DecayMemoryTimerObject). + * A global epoch counter is advanced periodically by the timer. Each + * cache instance tracks the last epoch it observed and self-flushes + * stale epochs on its next operation. This means no concurrent access + * to the per-thread SeqSets is needed. + * + * Each sizeclass has an adaptive budget that bounds how many items can + * be cached. The budget starts at 1 and adjusts on each epoch rotation: + * - If stale entries were flushed (surplus), decrease budget. + * - If no entries were flushed and the cache was actively drained by + * allocations (not just empty from startup), increase budget. + * This allows the cache to grow to match the working set while shrinking + * when the workload subsides. + * + * Template parameter Config provides Backend, PagemapEntry, Pal, etc. + */ + template + class LargeObjectCache + { + using PAL = typename Config::Pal; + using BackendSlabMetadata = typename Config::Backend::SlabMetadata; + using PagemapEntry = typename Config::PagemapEntry; + + /** + * Maximum chunk size bits we cache (4 MiB = 2^22). + */ + static constexpr size_t MAX_CACHEABLE_BITS = 22; + + /** + * Maximum chunk size we cache (4 MiB). + * Larger allocations bypass the cache and go directly to/from backend. + */ + static constexpr size_t MAX_CACHEABLE_SIZE = + bits::one_at_bit(MAX_CACHEABLE_BITS); + + /** + * Number of chunk sizeclasses we actually cache. + * Only covers sizes from MIN_CHUNK_SIZE up to MAX_CACHEABLE_SIZE. + */ + static constexpr size_t NUM_SIZECLASSES = + MAX_CACHEABLE_BITS - MIN_CHUNK_BITS + 1; + + /** + * Number of epoch slots for cached ranges. + * Must be a power of 2. + */ + static constexpr size_t NUM_EPOCHS = 4; + static_assert(bits::is_pow2(NUM_EPOCHS)); + + /** + * Global epoch counter, advanced by the timer callback. + * All LargeObjectCache instances read this to detect when epochs + * have advanced and stale entries need flushing. 
+ */ + static inline stl::Atomic global_epoch{0}; + + /** + * Timer callback that advances the global epoch. + */ + class DecayMemoryTimerObject : public PalTimerObject + { + static void process(PalTimerObject*) + { + auto e = global_epoch.load(stl::memory_order_relaxed); + global_epoch.store(e + 1, stl::memory_order_release); + } + + /// Timer fires every 500ms. + static constexpr size_t PERIOD = 500; + + public: + constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {} + }; + + static inline DecayMemoryTimerObject timer_object; + + /** + * Flag to ensure one-shot timer registration. + */ + static inline stl::AtomicBool registered_timer{false}; + + /** + * Per-sizeclass adaptive budget state. + */ + struct SizeclassState + { + /// Maximum number of items allowed in the cache for this sizeclass. + /// Starts at 1 so the first deallocation is always cached. + size_t budget{1}; + + /// Current number of cached items across all epoch slots. + size_t count{0}; + + /// Number of cache misses since last cache insert. + /// Reset to 0 each time we successfully add to the cache. + size_t misses{0}; + + /// Peak value of misses this epoch. + /// This is what we use for budget growth - it captures the maximum + /// "depth" of consecutive misses, not cumulative misses. + size_t peak_misses{0}; + }; + + /** + * Per-sizeclass budget tracking. + */ + ModArray sc_state; + + /** + * Per-sizeclass, per-epoch SeqSets of cached metadata. + * Indexed as lists[sizeclass_index][epoch % NUM_EPOCHS]. + */ + ModArray>> + lists; + + /** + * The epoch this instance last synced to. + * Used to detect when new epochs have passed and old ones need flushing. + */ + size_t local_epoch{0}; + + /** + * Convert a chunk size to a sizeclass index. 
+ */ + static size_t to_sizeclass(size_t chunk_size) + { + SNMALLOC_ASSERT(bits::is_pow2(chunk_size)); + SNMALLOC_ASSERT(chunk_size >= MIN_CHUNK_SIZE); + return bits::next_pow2_bits(chunk_size) - MIN_CHUNK_BITS; + } + + /** + * Register the global timer if not already done. + */ + void ensure_registered() + { + if constexpr (pal_supports) + { + if ( + !registered_timer.load(stl::memory_order_relaxed) && + !registered_timer.exchange(true, stl::memory_order_acq_rel)) + { + PAL::register_timer(&timer_object); + } + } + } + + /** + * Catch up to the global epoch, flushing any stale epochs and + * adjusting per-sizeclass budgets. + */ + template + void sync_epoch(FlushFn&& flush_fn) + { + if constexpr (pal_supports) + { + auto current = global_epoch.load(stl::memory_order_acquire); + + auto behind = current - local_epoch; + if (behind == 0) + return; + + if (behind > NUM_EPOCHS) + behind = NUM_EPOCHS; + + // Snapshot counts before flushing. + size_t before_count[NUM_SIZECLASSES]; + for (size_t sc = 0; sc < NUM_SIZECLASSES; sc++) + before_count[sc] = sc_state[sc].count; + + // Flush stale epoch slots. + for (size_t i = 0; i < behind; i++) + { + auto epoch_to_flush = (local_epoch + 1 + i) % NUM_EPOCHS; + flush_epoch_slot(epoch_to_flush, flush_fn); + } + + // Adjust budgets based on what happened. + // Net out misses against flushed items to determine direction. + for (size_t sc = 0; sc < NUM_SIZECLASSES; sc++) + { + auto& state = sc_state[sc]; + size_t flushed = before_count[sc] - state.count; + + if (state.peak_misses > flushed) + { + // More misses than surplus: grow budget by the difference. + state.budget += state.peak_misses - flushed; + } + else if (flushed > state.peak_misses) + { + // More surplus than misses: shrink budget smoothly. + state.budget -= (flushed - state.peak_misses) / 2; + } + // If equal, budget stays the same. + + state.misses = 0; + state.peak_misses = 0; + } + + local_epoch = current; + } + } + + /** + * Flush all entries in a single epoch slot. 
+ * Decrements per-sizeclass counts. + */ + template + void flush_epoch_slot(size_t epoch_slot, FlushFn&& flush_fn) + { + for (size_t sc = 0; sc < NUM_SIZECLASSES; sc++) + { + auto& list = lists[sc][epoch_slot]; + while (!list.is_empty()) + { + sc_state[sc].count--; + flush_fn(list.pop_front()); + } + } + } + + public: + constexpr LargeObjectCache() = default; + + /** + * Try to satisfy a large allocation from the cache. + * + * @param chunk_size The power-of-2 chunk size needed. + * @param flush_fn Callback to flush stale entries during epoch sync. + * @return Metadata for a cached chunk, or nullptr on cache miss. + */ + template + BackendSlabMetadata* try_alloc(size_t chunk_size, FlushFn&& flush_fn) + { + // Don't cache very large allocations. + if (chunk_size > MAX_CACHEABLE_SIZE) + return nullptr; + + sync_epoch(flush_fn); + + auto sc = to_sizeclass(chunk_size); + auto current = local_epoch; + + // Check current epoch first, then older ones. + for (size_t age = 0; age < NUM_EPOCHS; age++) + { + auto& list = lists[sc][(current - age) % NUM_EPOCHS]; + if (!list.is_empty()) + { + sc_state[sc].count--; + return list.pop_front(); + } + } + + // Cache miss - track for budget growth. + sc_state[sc].misses++; + if (sc_state[sc].misses > sc_state[sc].peak_misses) + sc_state[sc].peak_misses = sc_state[sc].misses; + return nullptr; + } + + /** + * Cache a large deallocation. + * + * If the sizeclass is at its budget, the entry is flushed immediately + * instead of being cached. + * + * @param meta The slab metadata for the chunk. + * @param chunk_size The power-of-2 chunk size. + * @param flush_fn Callback to flush stale entries during epoch sync, + * and to flush this entry if over budget. + */ + template + void cache(BackendSlabMetadata* meta, size_t chunk_size, FlushFn&& flush_fn) + { + // Don't cache very large allocations - flush directly to backend. 
+ if (chunk_size > MAX_CACHEABLE_SIZE) + { + flush_fn(meta); + return; + } + + ensure_registered(); + sync_epoch(flush_fn); + + auto sc = to_sizeclass(chunk_size); + + if (sc_state[sc].count >= sc_state[sc].budget) + { + // Over budget: flush immediately rather than caching. + flush_fn(meta); + return; + } + + sc_state[sc].count++; + sc_state[sc].misses = 0; // Reset miss counter on successful cache. + lists[sc][local_epoch % NUM_EPOCHS].insert(meta); + } + + /** + * Flush all cached entries back to the backend. + * Called during allocator teardown/flush. + */ + template + void flush_all(FlushFn&& flush_fn) + { + for (size_t e = 0; e < NUM_EPOCHS; e++) + { + flush_epoch_slot(e, flush_fn); + } + } + + /** + * Flush all cached entries with sizeclass strictly smaller than + * the given chunk_size. These can coalesce in the buddy allocator + * to form the needed size. + * + * @return true if any entries were flushed. + */ + template + bool flush_smaller(size_t chunk_size, FlushFn&& flush_fn) + { + // If chunk_size > MAX_CACHEABLE_SIZE, all cached entries are smaller. + size_t target_sc = (chunk_size > MAX_CACHEABLE_SIZE) ? + NUM_SIZECLASSES : + to_sizeclass(chunk_size); + bool flushed = false; + for (size_t sc = 0; sc < target_sc; sc++) + { + for (size_t e = 0; e < NUM_EPOCHS; e++) + { + auto& list = lists[sc][e]; + while (!list.is_empty()) + { + sc_state[sc].count--; + flush_fn(list.pop_front()); + flushed = true; + } + } + } + return flushed; + } + + /** + * Flush a single cached entry with sizeclass >= the given chunk_size. + * The buddy allocator can split this to satisfy the request. + * + * @return true if an entry was flushed. + */ + template + bool flush_one_larger(size_t chunk_size, FlushFn&& flush_fn) + { + // Nothing in cache can satisfy requests larger than MAX_CACHEABLE_SIZE. 
+ if (chunk_size > MAX_CACHEABLE_SIZE) + return false; + + auto target_sc = to_sizeclass(chunk_size); + for (size_t sc = target_sc; sc < NUM_SIZECLASSES; sc++) + { + for (size_t e = 0; e < NUM_EPOCHS; e++) + { + auto& list = lists[sc][e]; + if (!list.is_empty()) + { + sc_state[sc].count--; + flush_fn(list.pop_front()); + return true; + } + } + } + return false; + } + + /** + * Check if the cache is completely empty. + */ + bool is_empty() const + { + for (size_t sc = 0; sc < NUM_SIZECLASSES; sc++) + { + for (size_t e = 0; e < NUM_EPOCHS; e++) + { + if (!lists[sc][e].is_empty()) + return false; + } + } + return true; + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/pal/pal_windows.h b/src/snmalloc/pal/pal_windows.h index a44079dea..602749aad 100644 --- a/src/snmalloc/pal/pal_windows.h +++ b/src/snmalloc/pal/pal_windows.h @@ -592,7 +592,7 @@ namespace snmalloc # ifdef PLATFORM_HAS_VIRTUALALLOC2 template - void* PALWindows::reserve_aligned(size_t size) noexcept + inline void* PALWindows::reserve_aligned(size_t size) noexcept { SNMALLOC_ASSERT(bits::is_pow2(size)); SNMALLOC_ASSERT(size >= minimum_alloc_size); @@ -622,7 +622,7 @@ namespace snmalloc } # endif - void* PALWindows::reserve(size_t size) noexcept + inline void* PALWindows::reserve(size_t size) noexcept { void* ret = VirtualAlloc(nullptr, size, MEM_RESERVE, PAGE_READWRITE);