diff --git a/benchmark/misc/compile-cache-timing.js b/benchmark/misc/compile-cache-timing.js
new file mode 100644
index 00000000000000..bdeae8c5cd2e93
--- /dev/null
+++ b/benchmark/misc/compile-cache-timing.js
@@ -0,0 +1,88 @@
+'use strict';
+
+// Startup benchmark for the compile cache (including the zstd dictionary).
+// Compares no-cache / cold-cache / warm-cache for two workloads:
+//   big  - one large module (the typescript.js fixture)
+//   many - many small modules (generated here, side-effect-free)
+// The modules are generated into a temp dir so the benchmark is self-contained
+// and reproducible, and never executes unrelated code.
+// For cache=cold the cache directory is wiped inside the timed loop, so that
+// configuration also includes the cost of the removal.
+
+const common = require('../common.js');
+const { spawnSync } = require('child_process');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+
+const bench = common.createBenchmark(main, {
+  workload: ['big', 'many'],
+  cache: ['none', 'cold', 'warm'],
+  n: [30],
+});
+
+const BIG = path.resolve(__dirname, '../../test/fixtures/snapshot/typescript.js');
+
+// Generate `count` small, side-effect-free modules and return the require()
+// code that loads them all in one child.
+function makeManyModules(dir, count) {
+  fs.mkdirSync(dir, { recursive: true });
+  const reqs = [];
+  for (let i = 0; i < count; i++) {
+    const file = path.join(dir, `mod-${i}.js`);
+    fs.writeFileSync(
+      file,
+      `'use strict';\n` +
+      `module.exports = function value${i}(a, b) {\n` +
+      `  const sum = a + b + ${i};\n` +
+      `  return { id: ${i}, sum, label: 'module-${i}' };\n` +
+      `};\n`);
+    reqs.push(`require(${JSON.stringify(file)});`);
+  }
+  return reqs.join('');
+}
+
+// Run `cmd args` once in a child process. NODE_COMPILE_CACHE is set to
+// `cacheDir`, or removed from the inherited environment when `cacheDir` is
+// falsy. Throws if the child cannot be spawned or does not exit with status
+// 0, so that a failing child is never timed as if it were a valid sample.
+function run(cmd, args, cacheDir) {
+  const env = { ...process.env };
+  if (cacheDir) env.NODE_COMPILE_CACHE = cacheDir;
+  else delete env.NODE_COMPILE_CACHE;
+  const child = spawnSync(cmd, args, { env, stdio: 'ignore' });
+  if (child.error) throw child.error;
+  if (child.status !== 0) {
+    throw new Error(
+      `${cmd} exited with status ${child.status} (signal ${child.signal})`);
+  }
+}
+
+// Benchmark entry point: times `n` child startups for one workload/cache
+// combination and reports them through `bench`.
+function main({ n, workload, cache }) {
+  const cmd = process.execPath || process.argv[0];
+  const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'cc-bench-'));
+
+  // Everything that can throw once the temp dir exists runs inside the try,
+  // so the directory is removed even if module generation or a child fails.
+  try {
+    const args = workload === 'big' ?
+      [BIG] :
+      ['-e', makeManyModules(path.join(tmp, 'mods'), 120)];
+    const cacheDir = cache === 'none' ? null : path.join(tmp, 'cache');
+
+    if (cache === 'warm') run(cmd, args, cacheDir);  // populate once
+    bench.start();
+    for (let i = 0; i < n; i++) {
+      // cacheDir is non-null whenever cache is not 'none'.
+      if (cache === 'cold') {
+        fs.rmSync(cacheDir, { recursive: true, force: true });
+      }
+      run(cmd, args, cacheDir);
+    }
+    bench.end(n);
+  } finally {
+    fs.rmSync(tmp, { recursive: true, force: true });
+  }
+}
diff --git a/node.gyp b/node.gyp
index d2dbce19992b10..ab54cc27437e72 100644
--- a/node.gyp
+++ b/node.gyp
@@ -1110,6 +1110,22 @@
             '<@(linked_module_files)',
           ],
         },
+        {
+          'action_name': 'generate_compile_cache_zstd_dict',
+          'inputs': [
+            'src/compile_cache_zstd.dict',
+            'tools/generate_compile_cache_dict.py',
+          ],
+          'outputs': [
+            '<(SHARED_INTERMEDIATE_DIR)/compile_cache_zstd_dict.h',
+          ],
+          'action': [
+            '<(python)',
+            'tools/generate_compile_cache_dict.py',
+            'src/compile_cache_zstd.dict',
+            '<@(_outputs)',
+          ],
+        },
       ],
     }, # node_base
     {
@@ -1123,6 +1139,7 @@
         'src',
         'deps/v8/include',
         'deps/uv/include',
+        '<(SHARED_INTERMEDIATE_DIR)', # for compile_cache_zstd_dict.h etc.
       ],
 
       'dependencies': [
diff --git a/src/compile_cache.cc b/src/compile_cache.cc
index 9e11793aa3388f..35c4b02cc70d9a 100644
--- a/src/compile_cache.cc
+++ b/src/compile_cache.cc
@@ -11,6 +11,10 @@
 #include "util.h"
 #include "zlib.h"
 #include "zstd.h"
+// kCompileCacheZstdDict + kCompileCacheZstdDictSize come from the header
+// generated at build time by the GYP action (from src/compile_cache_zstd.dict).
+// The include directory (SHARED_INTERMEDIATE_DIR) is added by node.gyp.
+#include "compile_cache_zstd_dict.h"
 
 #ifdef NODE_IMPLEMENTS_POSIX_CREDENTIALS
 #include <unistd.h>  // getuid
@@ -28,6 +32,29 @@ using v8::ScriptCompiler;
 using v8::String;
 
 namespace {
+// The compile-cache zstd dictionary is immutable and embedded in the binary,
+// so the prepared CDict/DDict are created once and shared across all handlers
+// (and all Environments/Workers) instead of per handler. They live for the
+// lifetime of the process. Returns nullptr if preparation fails, in which
+// case callers fall back to plain (dictionary-less) zstd.
+ZSTD_CDict* GetCompileCacheCDict() {
+  static ZSTD_CDict* cdict =
+      ZSTD_createCDict(kCompileCacheZstdDict, kCompileCacheZstdDictSize, 1);
+  return cdict;
+}
+
+ZSTD_DDict* GetCompileCacheDDict() {
+  static ZSTD_DDict* ddict =
+      ZSTD_createDDict(kCompileCacheZstdDict, kCompileCacheZstdDictSize);
+  return ddict;
+}
+
+// The dictionary only helps small/medium caches; for larger inputs zstd's own
+// adaptive model dominates and the dictionary never wins, so we skip the
+// (otherwise wasted) second compression above this raw size. Decompression is
+// unaffected: a single DDict decodes both dict-assisted and plain frames.
+constexpr uint32_t kCompileCacheDictMaxRawSize = 256 * 1024;
+
 std::string Uint32ToHex(uint32_t crc) {
   std::string str;
   str.reserve(8);
@@ -266,10 +293,20 @@ void CompileCacheHandler::ReadCacheFile(CompileCacheEntry* entry) {
       Debug("failed to create zstd context\n");
       return;
     }
-    // Decompress directly into the buffer handed to V8.
+    // Decompress directly into the buffer handed to V8. The embedded
+    // dictionary is referenced via a shared, prepared DDict; plain frames
+    // (which carry no dictID) decompress correctly with it as well.
     std::unique_ptr<uint8_t[]> raw_data(new uint8_t[raw_size]);
-    size_t decompressed_size = ZSTD_decompressDCtx(
-        zstd_dctx_, raw_data.get(), raw_size, disk_data.get(), cache_size);
+    ZSTD_DDict* ddict = GetCompileCacheDDict();
+    size_t decompressed_size;
+    if (ddict != nullptr) {
+      decompressed_size = ZSTD_decompress_usingDDict(
+          zstd_dctx_, raw_data.get(), raw_size, disk_data.get(), cache_size,
+          ddict);
+    } else {
+      decompressed_size = ZSTD_decompressDCtx(
+          zstd_dctx_, raw_data.get(), raw_size, disk_data.get(), cache_size);
+    }
     if (ZSTD_isError(decompressed_size)) {
       Debug("decompression failed: %s\n", ZSTD_getErrorName(decompressed_size));
       return;
@@ -508,16 +545,43 @@ void CompileCacheHandler::Persist() {
     // shutdown and should add as little overhead as possible. If the data
     // is not compressible, store it uncompressed, which is indicated by
     // the cache size being equal to the uncompressed size in the headers.
+    //
+    // We also try the embedded trained dictionary and keep whichever frame is
+    // smaller (still subject to the "only store if < raw" policy). The
+    // dictionary mainly helps the small/medium caches that dominate real
+    // compile cache usage; for inputs where plain zstd already wins we keep
+    // the plain frame.
     char* cache_ptr = raw_ptr;
     uint32_t cache_size = raw_size;
     std::unique_ptr<uint8_t[]> compressed;
+    std::unique_ptr<uint8_t[]> compressed_dict;
     if (cctx != nullptr || (cctx = ZSTD_createCCtx()) != nullptr) {
       size_t compressed_bound = ZSTD_compressBound(raw_size);
       compressed.reset(new uint8_t[compressed_bound]);
       size_t compressed_size = ZSTD_compressCCtx(
           cctx, compressed.get(), compressed_bound, raw_ptr, raw_size, 1);
+      char* best_ptr = reinterpret_cast<char*>(compressed.get());
+      // Only attempt the dictionary for small/medium entries (see
+      // kCompileCacheDictMaxRawSize); for large blobs it never wins and the
+      // extra compression would be wasted work.
+      ZSTD_CDict* cdict = raw_size <= kCompileCacheDictMaxRawSize
+                              ? GetCompileCacheCDict()
+                              : nullptr;
+      if (cdict != nullptr) {
+        // Compress into a separate buffer so the selected frame's bytes and
+        // size always stay in sync (the plain buffer is left untouched).
+        compressed_dict.reset(new uint8_t[compressed_bound]);
+        size_t dict_size = ZSTD_compress_usingCDict(
+            cctx, compressed_dict.get(), compressed_bound, raw_ptr, raw_size,
+            cdict);
+        if (!ZSTD_isError(dict_size) &&
+            (ZSTD_isError(compressed_size) || dict_size < compressed_size)) {
+          compressed_size = dict_size;
+          best_ptr = reinterpret_cast<char*>(compressed_dict.get());
+        }
+      }
       if (!ZSTD_isError(compressed_size) && compressed_size < raw_size) {
-        cache_ptr = reinterpret_cast<char*>(compressed.get());
+        cache_ptr = best_ptr;
         cache_size = static_cast<uint32_t>(compressed_size);
       }
     }
diff --git a/src/compile_cache_zstd.dict b/src/compile_cache_zstd.dict
new file mode 100644
index 00000000000000..b64455d45b1d82
Binary files /dev/null and b/src/compile_cache_zstd.dict differ
diff --git a/test/parallel/test-compile-cache-success.js b/test/parallel/test-compile-cache-success.js
index c02a6243286972..9417b6bf63c490 100644
--- a/test/parallel/test-compile-cache-success.js
+++ b/test/parallel/test-compile-cache-success.js
@@ -64,3 +64,83 @@ const path = require('path');
       }
     });
 }
+
+// Exercise the dictionary-compressed path (added on top of #63861) for many
+// small modules, which is where the embedded dictionary helps most. We write
+// the cache, then read it back and assert every entry is accepted - this
+// proves each dict-compressed frame decompresses to exactly the bytes that
+// were persisted.
+{
+  tmpdir.refresh();
+  const dir = tmpdir.resolve('.compile_cache_dir');
+
+  // Generate a handful of small modules so the dictionary path is exercised.
+  const count = 8;
+  const modules = Array.from({ length: count }, (_, i) => {
+    const file = tmpdir.resolve(`mod-${i}.js`);
+    const source = [
+      `'use strict';`,
+      `module.exports = function value${i}(a, b) {`,
+      `  const sum = a + b + ${i};`,
+      `  return { id: ${i}, sum, label: 'module-${i}' };`,
+      '};\n',
+    ].join('\n');
+    fs.writeFileSync(file, source);
+    return file;
+  });
+  const reqCode = modules.map((m) => `require(${JSON.stringify(m)});`).join('');
+
+  // First run writes the cache for every module; its debug output must report
+  // a successful cache write for each generated file.
+  const writeEnv = {
+    ...process.env,
+    NODE_DEBUG_NATIVE: 'COMPILE_CACHE',
+    NODE_COMPILE_CACHE: dir
+  };
+  const expectAllWritten = (output) => {
+    for (const file of modules) {
+      const name = path.basename(file).replace(/[.]/g, '\\.');
+      assert.match(output, new RegExp(`writing cache for .*${name}.*success`));
+    }
+    return true;
+  };
+  spawnSyncAndAssert(
+    process.execPath,
+    ['-e', reqCode],
+    { env: writeEnv, cwd: tmpdir.path },
+    { stderr: expectAllWritten });
+
+  const cacheDirs = fs.readdirSync(dir);
+  assert.strictEqual(cacheDirs.length, 1);
+  // At least one entry per module (the `-e` runner is cached too).
+  const [onlyCacheDir] = cacheDirs;
+  const entries = fs.readdirSync(path.join(dir, onlyCacheDir));
+  const tooFew = `expected >= ${count} entries, got ${entries.length}`;
+  assert(entries.length >= count, tooFew);
+
+  // Second run reads every cached entry back; "was accepted" only happens when
+  // the decompressed bytes match the freshly produced in-memory cache, so this
+  // is a full roundtrip check of the dictionary-compressed entries.
+  spawnSyncAndAssert(
+    process.execPath,
+    ['-e', reqCode],
+    {
+      env: {
+        ...process.env,
+        NODE_DEBUG_NATIVE: 'COMPILE_CACHE',
+        NODE_COMPILE_CACHE: dir
+      },
+      cwd: tmpdir.path
+    },
+    {
+      stderr(output) {
+        for (const m of modules) {
+          const name = path.basename(m).replace(/[.]/g, '\\.');
+          assert.match(
+            output,
+            new RegExp(`cache for .*${name} was accepted, keeping the in-memory entry`));
+        }
+        return true;
+      }
+    });
+}
diff --git a/tools/generate_compile_cache_dict.py b/tools/generate_compile_cache_dict.py
new file mode 100644
index 00000000000000..8794889775bb63
--- /dev/null
+++ b/tools/generate_compile_cache_dict.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Generate compile_cache_zstd_dict.h from a trained zstd .dict file.
+
+Invoked by the GYP action in node.gyp at build time. Only the small binary
+.dict (src/compile_cache_zstd.dict) is checked into the repository; the C
+array it produces is generated into SHARED_INTERMEDIATE_DIR.
+"""
+import os
+import sys
+
+# Number of dictionary bytes emitted per line of the generated array.
+BYTES_PER_LINE = 12
+
+
+def main(dict_path, out_path):
+    """Write a C/C++ header embedding the bytes of `dict_path` to `out_path`.
+
+    The header defines kCompileCacheZstdDict (the bytes) and
+    kCompileCacheZstdDictSize (their count). Exits with an error message if
+    the dictionary file is empty, since a zero-length array initializer is
+    not valid standard C++ and an empty dictionary would be useless anyway.
+    """
+    with open(dict_path, 'rb') as f:
+        data = f.read()
+    if not data:
+        sys.exit('%s: dictionary file is empty' % dict_path)
+
+    lines = [
+        '// Generated by tools/generate_compile_cache_dict.py',
+        '// from %s' % os.path.basename(dict_path),
+        '// The .dict file is the source of truth; do not edit by hand.',
+        '',
+        '#include <stddef.h>  // size_t',
+        '',
+        'static const unsigned char kCompileCacheZstdDict[] = {',
+    ]
+    for i in range(0, len(data), BYTES_PER_LINE):
+        chunk = data[i:i + BYTES_PER_LINE]
+        lines.append('  %s,' % ', '.join('0x%02x' % b for b in chunk))
+    lines.append('};')
+    lines.append('static const size_t kCompileCacheZstdDictSize = %d;' %
+                 len(data))
+
+    with open(out_path, 'w') as f:
+        f.write('\n'.join(lines) + '\n')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        sys.exit('Usage: %s <dict_path> <out_path>' % sys.argv[0])
+    main(sys.argv[1], sys.argv[2])