From be95d4bf17abeca2a46be38cb60f61d36c624298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:48:59 +0200 Subject: [PATCH 01/23] Add parser experiments catalog and parse-only benchmark harness Consolidates the parser/lexer performance experiments explored alongside the shipped optimizations (PR #378, built on #373/#375/#376). One directory and commit per approach; each has code and/or a NOTES.md with idea, method, result, verdict. --- experiments/README.md | 54 +++++++ experiments/_harness/bench-parse-only.php | 151 ++++++++++++++++++++ experiments/_harness/bench-parser-split.php | 95 ++++++++++++ 3 files changed, 300 insertions(+) create mode 100644 experiments/README.md create mode 100644 experiments/_harness/bench-parse-only.php create mode 100644 experiments/_harness/bench-parser-split.php diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 000000000..3109b0a24 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,54 @@ +# MySQL parser performance experiments + +This branch consolidates and verifies the parser/lexer performance experiments +that were explored while optimizing the pure-PHP MySQL parser. The shipped +optimizations live in PR #378 (built on #373 / #375 / #376); the optional native +Rust extension is PR #381 (and #423). The work here is the catalog of *other* +approaches that were prototyped and measured along the way — most lived only in +throwaway local branches or ephemeral sessions and had no home until now. + +Everything was re-measured on a MacBook Pro M4, PHP 8.5.5, PCRE2 10.47. +Numbers drift ~10–15% with thermal/load; treat them as orders of magnitude and +ratios, not exact constants. + +## How to run +Warm tracing JIT (the production-relevant config): +``` +-d memory_limit=2G -d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing +``` +No opcache: `-d opcache.enable_cli=0`. opcache without JIT: `-d opcache.enable_cli=1 -d opcache.jit=disable`. 
+Always put `-d` flags BEFORE the script path. The corpus is the 69,577-query +MySQL server-suite CSV at `packages/mysql-on-sqlite/tests/mysql/data/`. + +Verified parse-only baselines (best-of-N, reuse one parser, warm JIT): +trunk ≈ 27,700 QPS; the optimized parser (#378) ≈ 56,500 QPS (≈2.0×); +pure-regex recognition ≈ 98K; the parser in validate-only mode ≈ 246K. +AST construction is ≈77% of parse time. + +## Experiments (one per directory, one per commit) +`_harness/` holds the parse-only benchmark harnesses used throughout. Each +experiment directory has a `NOTES.md` with the idea, how it was measured, the +result, and a verdict; see each for origin (PR or local branch). + +- `whole-grammar-compilation/` — compile every rule to a dedicated PHP method. +- `method-size-capping/` — cap compiled method size, stub the rest to the interpreter. +- `ast-data-structures/` — object vs validate-only vs flat-int-tape vs array node. +- `pratt-expression-cascade/` — Pratt operator-precedence parser for the expr chain. +- `ll2-selectors/` — 2-token-lookahead proposal + the rule/call-split analysis behind it. +- `lalr-table-driven/` — kmyacc/nikic-style action-goto table interpreter. +- `packed-table-lookups/` — pack/unpack vs PHP-array action-table lookups. +- `full-pcre-recognizer/` — fold the whole grammar into one recursive PCRE pattern. +- `regex-prevalidate-hybrid/` — regex yes/no gate in front of the AST parser. +- `multishape-fast-parser/` — per-query-shape regex → direct AST construction. +- `pcre2-capture-trace/` — extract a parse tree from PCRE2 captures. +- `pcre2-callouts-ffi/` — PCRE2 callouts via FFI to emit a structural trace. +- `preg-replace-callback-shiftreduce/` — iterative mega-pattern reduction. +- `binary-bottomup-reduction/` — the same, with fixed-width binary encodings. +- `oniguruma-capture-trees/` — `(?@...)` capture trees (31-group cap; unreachable in PHP). +- `strtr-blind-reduction/` — strtr iterate-to-stable reduction (toy grammar). 
+- `native-tree-builders/` — json_decode/unserialize/DOMDocument (circular). +- `parle-extension/` — the `parle` PECL LALR(1) extension. +- `other-php-parser-libs/` — PHP-PEG / Hoa\Compiler / Phlexy. +- `sqlite-as-parser/` — use SQLite's own parser as a classifier. +- `ast-cache/` — cache the AST on a parameterized token-stream signature. +- `native-rust-extension/` — the optional Rust extension (PR #381/#423/#378). diff --git a/experiments/_harness/bench-parse-only.php b/experiments/_harness/bench-parse-only.php new file mode 100644 index 000000000..888c3c719 --- /dev/null +++ b/experiments/_harness/bench-parse-only.php @@ -0,0 +1,151 @@ += $limit ) { + break; + } +} +fclose( $handle ); + +// Pre-lex all queries (excluded from timing). +$all_tokens = array(); +foreach ( $queries as $query ) { + $lexer = new WP_MySQL_Lexer( $query ); + $all_tokens[] = $lexer instanceof WP_MySQL_Native_Lexer + ? $lexer->native_token_stream() + : $lexer->remaining_tokens(); +} +$n = count( $queries ); + +$run_once = function () use ( $grammar, $all_tokens, $reuse ) { + $failures = 0; + $parser = null; + $start = microtime( true ); + foreach ( $all_tokens as $tokens ) { + if ( $reuse ) { + if ( null === $parser ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + } else { + $parser->reset_tokens( $tokens ); + } + } else { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + } + $ast = $parser->parse(); + if ( null === $ast ) { + ++$failures; + } + } + return array( microtime( true ) - $start, $failures ); +}; + +for ( $i = 0; $i < $warmup; $i++ ) { + $run_once(); +} + +$qpss = array(); +$fail = 0; +for ( $r = 0; $r < $runs; $r++ ) { + list( $duration, $failures ) = $run_once(); + $qpss[] = $n / $duration; + $fail = $failures; +} +sort( $qpss ); +$best = $qpss[ count( $qpss ) - 1 ]; +$median = $qpss[ intdiv( count( $qpss ), 2 ) ]; + +$jit_on = false; +$status = opcache_get_status( false ); +if ( is_array( $status ) && isset( $status['jit']['on'] ) ) { + $jit_on = (bool) 
$status['jit']['on']; +} + +if ( $json ) { + echo json_encode( + array( + 'queries' => $n, + 'failures' => $fail, + 'qps_best' => $best, + 'qps_med' => $median, + 'jit' => $jit_on, + 'php' => PHP_VERSION, + ) + ), "\n"; + exit; +} + +printf( + "queries=%d failures=%d best=%d QPS median=%d QPS jit=%s php=%s\n", + $n, + $fail, + $best, + $median, + $jit_on ? 'on' : 'off', + PHP_VERSION +); diff --git a/experiments/_harness/bench-parser-split.php b/experiments/_harness/bench-parser-split.php new file mode 100644 index 000000000..107f3cbe1 --- /dev/null +++ b/experiments/_harness/bench-parser-split.php @@ -0,0 +1,95 @@ += $limit ) { + break; + } +} +fclose( $handle ); +echo 'Loaded ', count( $queries ), " queries\n"; + +// Pre-tokenize all queries once. The tokens are reused across runs, so the +// parser starts from a cold AST cache each iteration but a warm token cache. +$lex_start = microtime( true ); +$all_tokens = array(); +foreach ( $queries as $query ) { + $lexer = new WP_MySQL_Lexer( $query ); + $all_tokens[] = $lexer->remaining_tokens(); +} +$lex_duration = microtime( true ) - $lex_start; +printf( "Lex: %.4fs, %d QPS\n", $lex_duration, count( $queries ) / $lex_duration ); + +// Parse benchmark. 
+$results = array(); +for ( $r = 0; $r < $runs; $r++ ) { + $failures = 0; + $start = microtime( true ); + foreach ( $all_tokens as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + $ast = $parser->parse(); + if ( null === $ast ) { + ++$failures; + } + } + $duration = microtime( true ) - $start; + $qps = count( $queries ) / $duration; + $results[] = array( $duration, $qps, $failures ); + printf( "Run %d: %.4fs, %d QPS, %d failures\n", $r + 1, $duration, $qps, $failures ); +} + +if ( $runs > 1 ) { + $durations = array_column( $results, 0 ); + sort( $durations ); + $best = $durations[0]; + printf( "Best: %.4fs, %d QPS\n", $best, count( $queries ) / $best ); + $avg = array_sum( $durations ) / count( $durations ); + printf( "Avg: %.4fs, %d QPS\n", $avg, count( $queries ) / $avg ); +} From c9b364f2b469243d5915657a3da2be9e6717367b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 02/23] Add whole-grammar PHP compilation experiment Compile every rule to a dedicated method. +18-20% without JIT, -6-8% under tracing JIT (huge methods exceed the JIT trace-length limit). From local branch _parser_perf. 
--- .../whole-grammar-compilation/NOTES.md | 28 ++ .../bench-compiled-parser.php | 92 ++++ .../compare-asts.php | 67 +++ .../compile-grammar.php | 416 ++++++++++++++++++ .../dump-inflated-grammar.php | 27 ++ 5 files changed, 630 insertions(+) create mode 100644 experiments/whole-grammar-compilation/NOTES.md create mode 100644 experiments/whole-grammar-compilation/bench-compiled-parser.php create mode 100644 experiments/whole-grammar-compilation/compare-asts.php create mode 100644 experiments/whole-grammar-compilation/compile-grammar.php create mode 100644 experiments/whole-grammar-compilation/dump-inflated-grammar.php diff --git a/experiments/whole-grammar-compilation/NOTES.md b/experiments/whole-grammar-compilation/NOTES.md new file mode 100644 index 000000000..4c79ce174 --- /dev/null +++ b/experiments/whole-grammar-compilation/NOTES.md @@ -0,0 +1,28 @@ +# Whole-grammar → PHP compilation + +**Origin:** local branch `_parser_perf` (commits b5959e8, bf1b1fea). No PR. + +**Idea:** compile every grammar rule to a dedicated PHP method with switch-on-token +dispatch and inlined symbol matching, instead of interpreting the grammar at +runtime. `compile-grammar.php` emits `WP_MySQL_Compiled_Parser` (extends `WP_Parser`). + +**Run** (first point the scripts' `require` paths at the optimized parser tree — PR #378): +``` +php compile-grammar.php > /tmp/compiled.php # 2.48 MB, 50,918 lines, 1427 methods +php -d ...jit... 
bench-compiled-parser.php --runs=5 # interpreter vs compiled +php compare-asts.php # AST identity check vs interpreter +``` + +**Result (best-of-N, fresh parser per query):** + +| config | interpreter | compiled | Δ | +|------------------|-------------|----------|--------| +| no opcache | ~35K QPS | ~42K | +20% | +| opcache, no JIT | ~39K QPS | ~46K | +18% | +| opcache + JIT | ~62K QPS | ~57K | −8% | + +**Verdict:** the compiled parser wins ~18–20% WITHOUT JIT but loses ~6–8% under +tracing JIT — the huge generated methods exceed `opcache.jit_max_trace_length`, so +JIT abandons them while the small interpreted hot loop traces tightly. Wrong shape +for tracing JIT (the production default). See `method-size-capping/` for the +attempt to fix this. diff --git a/experiments/whole-grammar-compilation/bench-compiled-parser.php b/experiments/whole-grammar-compilation/bench-compiled-parser.php new file mode 100644 index 000000000..6d375416e --- /dev/null +++ b/experiments/whole-grammar-compilation/bench-compiled-parser.php @@ -0,0 +1,92 @@ += $limit ) { + break; + } +} +fclose( $handle ); + +$all_tokens = array(); +foreach ( $queries as $q ) { + $all_tokens[] = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); +} +echo 'Loaded ', count( $queries ), " queries\n"; + +function bench( $label, callable $factory, array $tokens_list, $runs ) { + $results = array(); + for ( $r = 0; $r < $runs; $r++ ) { + $fail = 0; + $start = microtime( true ); + foreach ( $tokens_list as $tokens ) { + $parser = $factory( $tokens ); + $ast = $parser->parse(); + if ( null === $ast ) { + ++$fail; + } + } + $dur = microtime( true ) - $start; + $results[] = $dur; + printf( "%-15s run %d: %.4fs, %d QPS, %d failures\n", $label, $r + 1, $dur, count( $tokens_list ) / $dur, $fail ); + } + sort( $results ); + $best = $results[0]; + $avg = array_sum( $results ) / count( $results ); + printf( "%-15s best %.4fs (%d QPS) avg %.4fs (%d QPS)\n", $label, $best, count( $tokens_list ) / $best, $avg, count( 
$tokens_list ) / $avg ); +} + +bench( + 'interpreted', + fn( $tokens ) => new WP_MySQL_Parser( $grammar, $tokens ), + $all_tokens, + $runs +); +bench( + 'compiled', + fn( $tokens ) => new WP_MySQL_Compiled_Parser( $grammar, $tokens ), + $all_tokens, + $runs +); diff --git a/experiments/whole-grammar-compilation/compare-asts.php b/experiments/whole-grammar-compilation/compare-asts.php new file mode 100644 index 000000000..097250096 --- /dev/null +++ b/experiments/whole-grammar-compilation/compare-asts.php @@ -0,0 +1,67 @@ +id . ',' . $n->start . ',' . $n->length . ')'; + } + $out = 'n(' . $n->rule_name; + foreach ( $n->get_children() as $c ) { + $out .= ',' . ast_signature( $c ); + } + return $out . ')'; +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$header = true; +$limit = (int) ( $argv[1] ?? PHP_INT_MAX ); +$n = 0; +$miss = 0; +while ( ( $row = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false && $n < $limit ) { + if ( $header ) { + $header = false; + continue; + } + if ( null === $row[0] ) { + continue; + } + ++$n; + $tokens1 = ( new WP_MySQL_Lexer( $row[0] ) )->remaining_tokens(); + $tokens2 = ( new WP_MySQL_Lexer( $row[0] ) )->remaining_tokens(); + $a1 = ( new WP_MySQL_Parser( $grammar, $tokens1 ) )->parse(); + $a2 = ( new WP_MySQL_Compiled_Parser( $grammar, $tokens2 ) )->parse(); + $s1 = ast_signature( $a1 ); + $s2 = ast_signature( $a2 ); + if ( $s1 !== $s2 ) { + ++$miss; + if ( $miss <= 5 ) { + echo "MISMATCH query #$n:\n"; + echo ' ', substr( $row[0], 0, 200 ), "\n"; + echo ' interpreter: ', substr( $s1, 0, 300 ), "\n"; + echo ' compiled: ', substr( $s2, 0, 300 ), "\n"; + } + } +} +echo "Checked $n queries, $miss mismatches.\n"; diff --git a/experiments/whole-grammar-compilation/compile-grammar.php b/experiments/whole-grammar-compilation/compile-grammar.php new file mode 100644 index 000000000..e82bc578e --- /dev/null 
+++ b/experiments/whole-grammar-compilation/compile-grammar.php @@ -0,0 +1,416 @@ + src/mysql/class-wp-mysql-compiled-parser.php + */ + +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser-node.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-parser.php'; + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$query_rid = $grammar->get_rule_id( 'query' ); +$select_rid = $grammar->get_rule_id( 'selectStatement' ); +$htid = $grammar->highest_terminal_id; +$into_symbol = WP_MySQL_Lexer::INTO_SYMBOL; + +// Reachability + fragment reference count. +$visited = array(); +$refs = array(); +$queue = array( $query_rid ); +while ( $queue ) { + $r = array_pop( $queue ); + if ( isset( $visited[ $r ] ) ) { + continue; + } + $visited[ $r ] = true; + foreach ( $grammar->rules[ $r ] as $branch ) { + foreach ( $branch as $sym ) { + if ( $sym > $htid ) { + $refs[ $sym ] = ( $refs[ $sym ] ?? 0 ) + 1; + if ( ! isset( $visited[ $sym ] ) ) { + $queue[] = $sym; + } + } + } + } +} + +// Decide which rules get inlined. +// Inline a fragment only if it is reachable AND single-branch (the simple +// case where we can splice its symbols into the parent branch). Multi-branch +// fragments require splatting which can explode parent branch counts; keep +// them as methods for now. +$inline_fragments = array(); +foreach ( $grammar->fragment_ids as $rid => $_ ) { + if ( + isset( $visited[ $rid ] ) + && isset( $grammar->rules[ $rid ] ) + && 1 === count( $grammar->rules[ $rid ] ) + ) { + $inline_fragments[ $rid ] = true; + } +} + +// Rules that will get a method. 
+$kept = array(); +foreach ( $visited as $rid => $_ ) { + if ( ! isset( $inline_fragments[ $rid ] ) ) { + $kept[ $rid ] = true; + } +} + +/** + * Compute the flattened symbol sequence for a branch, splicing any inlined + * single-use fragments in place. Cycles fall back to leaving the reference. + */ +$flatten = function ( array $branch ) use ( &$flatten, $grammar, $inline_fragments, $htid ) { + static $expanding = array(); + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym <= $htid ) { + $out[] = $sym; + continue; + } + if ( ! isset( $inline_fragments[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( count( $grammar->rules[ $sym ] ) !== 1 ) { + // Multi-branch single-use fragment: keep as call to avoid + // exponential parent-branch explosion. Future work could splat + // selected cases where branch count stays small. + $out[] = $sym; + continue; + } + if ( isset( $expanding[ $sym ] ) ) { + $out[] = $sym; + continue; + } + $expanding[ $sym ] = true; + foreach ( $flatten( $grammar->rules[ $sym ][0] ) as $s ) { + $out[] = $s; + } + unset( $expanding[ $sym ] ); + } + return $out; +}; + +/** + * PHP-safe method name for a rule id. + */ +$method_name = function ( $rid ) use ( $grammar ) { + $raw = $grammar->rule_names[ $rid ]; + // Fragment names start with "%" - turn that into "f_". + $clean = '%' === $raw[0] ? 'f_' . substr( $raw, 1 ) : $raw; + $clean = preg_replace( '/[^A-Za-z0-9_]/', '_', $clean ); + return 'r_' . $clean . '_' . $rid; +}; + +/** + * Emit code that matches a single symbol in a branch, appending on success + * and jumping to $fail_label (via `goto`) on failure. We use goto because + * PHP `break`/`continue` can only target immediate loops, and we want to + * roll back the position in a shared failure path. + * + * For single-branch rules there is no rollback label - failure just returns + * immediately so the label is reused inline. 
+ */ +$emit_symbol = function ( $sym, $indent, $fail_stmt, $skip_check = false ) use ( $grammar, $htid, $inline_fragments, &$method_name, &$flatten, &$emit_symbol ) { + $out = ''; + if ( $sym <= $htid ) { + // Inline terminal match. The caller may tell us the token at the + // current position is already known to match (via switch case + // dispatch), in which case the check is redundant. + if ( ! $skip_check ) { + $out .= $indent . "if (\$tokens[\$this->position]->id !== $sym) $fail_stmt\n"; + } + $out .= $indent . "\$children[] = \$tokens[\$this->position];\n"; + $out .= $indent . "++\$this->position;\n"; + return $out; + } + + $is_fragment = isset( $grammar->fragment_ids[ $sym ] ); + $method = $method_name( $sym ); + $out .= $indent . "\$sub = \$this->$method();\n"; + $out .= $indent . "if (false === \$sub) $fail_stmt\n"; + $nullable = isset( $grammar->nullable_branches[ $sym ] ); + if ( $is_fragment ) { + if ( $nullable ) { + $out .= $indent . "if (true !== \$sub) { foreach (\$sub as \$c) \$children[] = \$c; }\n"; + } else { + $out .= $indent . "foreach (\$sub as \$c) \$children[] = \$c;\n"; + } + } else { + if ( $nullable ) { + $out .= $indent . "if (true !== \$sub) \$children[] = \$sub;\n"; + } else { + $out .= $indent . "\$children[] = \$sub;\n"; + } + } + return $out; +}; + +/** + * Emit the body of a rule method. + */ +$emit_method = function ( $rid ) use ( $grammar, $htid, $select_rid, $into_symbol, $inline_fragments, &$method_name, &$flatten, &$emit_symbol ) { + $name = $method_name( $rid ); + $is_fragment = isset( $grammar->fragment_ids[ $rid ] ); + $is_select = $rid === $select_rid; + $rule_name = $grammar->rule_names[ $rid ]; + $nullable = isset( $grammar->nullable_branches[ $rid ] ); + + // Per-token selector. Entries are lists of branch symbol sequences (the + // runtime format). Group tokens whose branch list is identical so their + // switch cases share a body. + $selector = $grammar->branches_for_token[ $rid ] ?? 
array(); + $groups = array(); + foreach ( $selector as $tid => $branch_seqs ) { + $sig_parts = array(); + foreach ( $branch_seqs as $seq ) { + $sig_parts[] = implode( ',', $seq ); + } + $key = implode( '|', $sig_parts ); + $groups[ $key ]['branches'] = $branch_seqs; + $groups[ $key ]['tids'][] = $tid; + } + + $code = "\tprivate function $name() {\n"; + $code .= "\t\t\$tokens = \$this->tokens;\n"; + $code .= "\t\t\$position = \$this->position;\n"; + $code .= "\t\t\$tid = \$tokens[\$position]->id;\n"; + + // "One of N terminals" fast path. When every branch is a single + // terminal, the entire rule collapses to: check accept set, consume + // one token, return. A rule like `%f1282` (406 terminal choices) + // compiles to ~8 lines instead of ~2.8k. + $all_single_terminal = true; + $accept = array(); + foreach ( $grammar->rules[ $rid ] as $b ) { + if ( 1 !== count( $b ) || $b[0] > $htid || 0 === $b[0] ) { + $all_single_terminal = false; + break; + } + $accept[ $b[0] ] = true; + } + if ( $all_single_terminal && $accept ) { + $keys = array_keys( $accept ); + sort( $keys ); + $lookup = '[' . implode( '=>1,', $keys ) . '=>1]'; + $code .= "\t\tstatic \$ok = $lookup;\n"; + $code .= "\t\tif (!isset(\$ok[\$tid])) return " . ( $nullable ? 'true' : 'false' ) . ";\n"; + $code .= "\t\t\$t = \$tokens[\$position];\n"; + $code .= "\t\t\$this->position = \$position + 1;\n"; + if ( $is_select ) { + // selectStatement is never single-terminal, but guard anyway. + $code .= "\t\tif (\$tokens[\$position + 1]->id === $into_symbol) { \$this->position = \$position; return false; }\n"; + } + if ( $is_fragment ) { + $code .= "\t\treturn array(\$t);\n"; + } else { + $code .= "\t\treturn new WP_Parser_Node($rid, " . var_export( $rule_name, true ) . ", array(\$t));\n"; + } + $code .= "\t}\n"; + return $code; + } + + if ( count( $groups ) === 1 ) { + // All accepting tokens reach the same branch list. 
A bare isset() + // check against a shared lookup table is much smaller than the + // equivalent 200-way switch case list and lets PHP resolve + // dispatch in a single hash lookup. + $only = reset( $groups ); + $tids = $only['tids']; + sort( $tids ); + $lookup = '[' . implode( '=>1,', $tids ) . '=>1]'; + $code .= "\t\tstatic \$first = $lookup;\n"; + $code .= "\t\tif (!isset(\$first[\$tid])) return " . ( $nullable ? 'true' : 'false' ) . ";\n"; + // We cannot hand $known_tids here: the single-branch-group fast + // path covers many tokens, so the branch's first symbol may not be + // a specific one of them. + $code .= emit_group_body( $only['branches'], $grammar, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, false ); + // All branches failed; emit_group_body already reset the position. + $code .= "\t\treturn " . ( $nullable ? 'true' : 'false' ) . ";\n"; + } else { + $code .= "\t\tswitch (\$tid) {\n"; + foreach ( $groups as $g ) { + // Pack case labels onto as few lines as practical (~10 per + // line); single-label cases on their own line for readability. + $tids = $g['tids']; + $chunks = array_chunk( $tids, 10 ); + foreach ( $chunks as $chunk ) { + $code .= "\t\t\t" . implode( ' ', array_map( fn( $t ) => "case $t:", $chunk ) ) . "\n"; + } + $code .= emit_group_body( $g['branches'], $grammar, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, true, $g['tids'] ); + } + $code .= "\t\t}\n"; + $code .= "\t\treturn " . ( $nullable ? 'true' : 'false' ) . ";\n"; + } + $code .= "\t}\n"; + return $code; +}; + +function emit_group_body( array $branch_seqs, WP_Parser_Grammar $g, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, $in_switch = true, array $known_tids = array() ) { + $indent = $in_switch ? 
"\t\t\t\t" : "\t\t"; + $out = ''; + $count = count( $branch_seqs ); + + foreach ( $branch_seqs as $n => $raw_branch ) { + $branch = $flatten( $raw_branch ); + $is_last = ( $n === $count - 1 ); + + // The switch dispatch guarantees the current token matches a case + // label, so if there's exactly one label and the branch starts + // with that same terminal we can skip the redundant id check. + $first_is_known_terminal = false; + if ( count( $known_tids ) === 1 && $branch && $branch[0] === $known_tids[0] ) { + $first_is_known_terminal = true; + } + + if ( $count > 1 ) { + // Multi-branch: wrap each attempt in do-while(false). Break + // falls through to the next attempt; the final break falls + // through to the switch-level break / rule-level fall-through. + $out .= $indent . "do {\n"; + $inner_indent = $indent . "\t"; + $fail_stmt = 'break;'; + $out .= $inner_indent . "\$children = array();\n"; + $out .= $inner_indent . "\$this->position = \$position;\n"; + foreach ( $branch as $i => $sym ) { + $skip_check = ( 0 === $i && $first_is_known_terminal ); + $out .= $emit_symbol( $sym, $inner_indent, $fail_stmt, $skip_check ); + } + if ( $is_select ) { + $out .= $inner_indent . "if (\$tokens[\$this->position]->id === $into_symbol) break;\n"; + } + $out .= emit_branch_return( $inner_indent, $rid, $rule_name, $is_fragment ); + $out .= $indent . "} while (false);\n"; + } else { + // Single branch: no alternatives to try, just inline. + $out .= $indent . "\$children = array();\n"; + $fail_stmt = '{ $this->position = $position; return false; }'; + foreach ( $branch as $i => $sym ) { + $skip_check = ( 0 === $i && $first_is_known_terminal ); + $out .= $emit_symbol( $sym, $indent, $fail_stmt, $skip_check ); + } + if ( $is_select ) { + $out .= $indent . "if (\$tokens[\$this->position]->id === $into_symbol) { \$this->position = \$position; return false; }\n"; + } + $out .= emit_branch_return( $indent, $rid, $rule_name, $is_fragment ); + if ( $in_switch ) { + $out .= $indent . 
"break;\n"; + } + return $out; + } + } + // Multi-branch group fell through all do-while attempts: reset and + // break out of the switch (or return to the rule-level fallback). + $out .= $indent . "\$this->position = \$position;\n"; + if ( $in_switch ) { + $out .= $indent . "break;\n"; + } + return $out; +} + +function emit_branch_return( $indent, $rid, $rule_name, $is_fragment ) { + $out = ''; + $out .= $indent . "if (!\$children) return true;\n"; + if ( $is_fragment ) { + $out .= $indent . "return \$children;\n"; + } else { + $out .= $indent . 'return new WP_Parser_Node(' . $rid . ', ' . var_export( $rule_name, true ) . ", \$children);\n"; + } + return $out; +} + +// Emit the class. The generated parser is self-contained: it bakes every +// FIRST set, rule name, and branch structure into the emitted code, so no +// WP_Parser_Grammar has to be loaded at runtime. +echo " $_ ) { + $compiled[ $rid ] = $emit_method( $rid ); +} +$stub = array(); +foreach ( $kept as $rid => $_ ) { + $lines = substr_count( $compiled[ $rid ], "\n" ); + $is_stub = false; + if ( null !== $cap && $lines > $cap ) { + $is_stub = true; + } + if ( $stub_single && isset( $grammar->single_candidate_rules[ $rid ] ) ) { + $is_stub = true; + } + if ( null !== $only_rids && ! isset( $only_rids[ $rid ] ) ) { + $is_stub = true; + } + // Always keep the start rule compiled so parsing begins in compiled code. + if ( $rid === $query_rid ) { + $is_stub = false; + } + if ( $is_stub ) { + $stub[ $rid ] = true; + } +} + +echo "class WP_MySQL_Compiled_Parser extends WP_Parser {\n"; +echo "\tpublic function __construct( WP_Parser_Grammar \$grammar, array \$tokens ) {\n"; +echo "\t\tparent::__construct( \$grammar, \$tokens );\n"; +echo "\t}\n\n"; +echo "\tpublic function parse() {\n"; +echo "\t\t\$ast = \$this->" . $method_name( $query_rid ) . "();\n"; +echo "\t\treturn false === \$ast ? 
null : \$ast;\n"; +echo "\t}\n\n"; + +foreach ( $kept as $rid => $_ ) { + if ( isset( $stub[ $rid ] ) ) { + echo "\tprivate function " . $method_name( $rid ) . "() {\n"; + echo "\t\treturn \$this->parse_recursive( $rid );\n"; + echo "\t}\n\n"; + } else { + echo $compiled[ $rid ]; + echo "\n"; + } +} + +echo "}\n"; + +$compiled_count = count( $kept ) - count( $stub ); +fwrite( STDERR, sprintf( "compiled=%d stubbed=%d total=%d\n", $compiled_count, count( $stub ), count( $kept ) ) ); diff --git a/experiments/whole-grammar-compilation/dump-inflated-grammar.php b/experiments/whole-grammar-compilation/dump-inflated-grammar.php new file mode 100644 index 000000000..88b7f370c --- /dev/null +++ b/experiments/whole-grammar-compilation/dump-inflated-grammar.php @@ -0,0 +1,27 @@ + /tmp/mysql-grammar-inflated.php + */ + +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; + +$g = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); + +$data = array( + 'rules' => $g->rules, + 'rule_names' => $g->rule_names, + 'fragment_ids' => $g->fragment_ids ?? array(), + 'branches_for_token' => $g->branches_for_token, + 'nullable_branches' => $g->nullable_branches, + 'lowest_non_terminal_id' => $g->lowest_non_terminal_id, + 'highest_terminal_id' => $g->highest_terminal_id, +); + +echo " Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 03/23] Add method-size capping with runtime fallback experiment Cap compiled method size and stub the rest back to the interpreter. Rescues the no-cap JIT loss but plateaus ~0.92x; never reaches parity. From local branch _parser_perf. 
--- experiments/method-size-capping/NOTES.md | 37 +++++ .../method-size-capping/bench-hot-rules.php | 156 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 experiments/method-size-capping/NOTES.md create mode 100644 experiments/method-size-capping/bench-hot-rules.php diff --git a/experiments/method-size-capping/NOTES.md b/experiments/method-size-capping/NOTES.md new file mode 100644 index 000000000..6c0e7650c --- /dev/null +++ b/experiments/method-size-capping/NOTES.md @@ -0,0 +1,37 @@ +# Method-size capping with runtime fallback + +**Origin:** local branch `_parser_perf`. No PR. Uses the compiler in +`../whole-grammar-compilation/compile-grammar.php` with extra flags: +`--cap=N` (rules over N lines become stubs), `--stub-single-candidate`, +`--only-rids-file=FILE`. A stubbed rule is a 3-line method +`return $this->parse_recursive($rid);` that delegates back to the interpreter. +`bench-hot-rules.php` (here) ranks rules by call frequency (`DUMP_TOPN=50` writes +`/tmp/top50.txt` for `--only-rids-file`). + +**Idea:** keep only the hottest / smallest rules compiled (so each method stays +under the JIT trace-length limit) and let the cold tail run in the interpreter — +hoping for "small AND fast." + +**Run:** +``` +DUMP_TOPN=50 php bench-hot-rules.php +php ../whole-grammar-compilation/compile-grammar.php --cap=200 > /tmp/compiled.php +php ../whole-grammar-compilation/compile-grammar.php --only-rids-file=/tmp/top50.txt > /tmp/compiled.php +php -d ...jit... ../whole-grammar-compilation/bench-compiled-parser.php --runs=9 +``` + +**Result (warm JIT, compiled / interpreter):** + +| budget | size | speedup | +|-------------------------------|---------|---------| +| no cap | 2.48 MB | ~0.68× | +| cap 200 | 2.1 MB | ~0.92× | +| cap 100 + stub single-cand. | 868 KB | ~0.93× | +| top-50 hot rules only | ~280 KB | ~0.92× | + +All budgets verified AST-identical to the interpreter across the corpus. 
+ +**Verdict:** capping rescues the no-cap JIT disaster (0.68×) but then PLATEAUS just +below parity (~0.92×) — it approaches 1.0× from below as more rules are stubbed +(smaller == closer to the interpreter) and never exceeds it. A focused partial +compiler is not worth it for this grammar under tracing JIT. diff --git a/experiments/method-size-capping/bench-hot-rules.php b/experiments/method-size-capping/bench-hot-rules.php new file mode 100644 index 000000000..5a9dcaa6c --- /dev/null +++ b/experiments/method-size-capping/bench-hot-rules.php @@ -0,0 +1,156 @@ +grammar = $g; + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( 0, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $g->rule_names; + $this->fragment_ids = $g->fragment_ids ?? array(); + $this->branches_for_token = $g->branches_for_token; + $this->nullable_branches = $g->nullable_branches; + $this->highest_terminal_id = $g->highest_terminal_id; + $this->sel_rid = $g->get_rule_id( 'selectStatement' ); + } + public function parse() { + $rid = $this->grammar->get_rule_id( 'query' ); + return $this->r( $rid ); + } + private function r( $rid ) { + self::$counts[ $rid ] = ( self::$counts[ $rid ] ?? 
0 ) + 1; + $tokens = $this->tokens; + $position = $this->position; + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rid ][ $tid ] ) ) { + $cb = $this->branches_for_token[ $rid ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rid ] ) ) { + return true; + } else { + return false; + } + $htid = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rid ] ); + $is_sel = $rid === $this->sel_rid; + $ok = false; + $kids = array(); + foreach ( $cb as $branch ) { + $this->position = $position; + $kids = array(); + $ok = true; + foreach ( $branch as $sid ) { + if ( $sid <= $htid ) { + if ( $tokens[ $this->position ]->id === $sid ) { + $kids[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $ok = false; + break; + } + $sn = $this->r( $sid ); + if ( false === $sn ) { + $ok = false; + break; + } + if ( true === $sn ) { + continue; + } + if ( is_array( $sn ) ) { + foreach ( $sn as $c ) { + $kids[] = $c; + } + } else { + $kids[] = $sn; + } + } + if ( $ok && $is_sel && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $ok = false; + } + if ( $ok ) { + break; + } + } + if ( ! $ok ) { + $this->position = $position; + return false; + } + if ( ! $kids ) { + return true; + } + if ( $is_fragment ) { + return $kids; + } + return new WP_Parser_Node( $rid, $this->rule_names[ $rid ], $kids ); + } +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; + } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 
10000 ) ); +$all_tokens = array(); +foreach ( $queries as $q ) { + $all_tokens[] = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); +} + +foreach ( $all_tokens as $t ) { + ( new HR_Parser( $grammar, $t ) )->parse(); +} +arsort( HR_Parser::$counts ); +if ( getenv( 'DUMP_TOPN' ) ) { + $topn = (int) getenv( 'DUMP_TOPN' ); + file_put_contents( "/tmp/top{$topn}.txt", implode( "\n", array_slice( array_keys( HR_Parser::$counts ), 0, $topn ) ) . "\n" ); + fwrite( STDERR, "Wrote /tmp/top{$topn}.txt\n" ); +} +$total = array_sum( HR_Parser::$counts ); +$cumsum = 0; +$covered = array(); +$i = 0; +foreach ( HR_Parser::$counts as $rid => $cnt ) { + $cumsum += $cnt; + $covered[ $rid ] = true; + $pct = 100 * $cumsum / $total; + if ( in_array( ++$i, array( 10, 25, 50, 100, 200, 500 ), true ) || $pct >= 80 ) { + printf( "After top %d rules: cumulative %.1f%% (%s of %s calls)\n", $i, $pct, number_format( $cumsum ), number_format( $total ) ); + if ( $pct >= 95 ) { + break; + } + } +} From 0b998c11ed54b999b9286273425dbba62fa4605f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 04/23] Add alternative AST data-structure experiment object vs validate-only vs array vs flat-int-tape nodes. Validate-only ceiling ~246K (AST build is ~77% of parse time); array -5%; an in-place-truncation tape builds ~2.4x faster but is not a usable tree. 
--- experiments/ast-data-structures/NOTES.md | 34 + .../ast-data-structures/bench-variants.php | 170 +++++ .../ast-data-structures/microbench.php | 67 ++ .../ast-data-structures/parser-variants.php | 709 ++++++++++++++++++ 4 files changed, 980 insertions(+) create mode 100644 experiments/ast-data-structures/NOTES.md create mode 100644 experiments/ast-data-structures/bench-variants.php create mode 100644 experiments/ast-data-structures/microbench.php create mode 100644 experiments/ast-data-structures/parser-variants.php diff --git a/experiments/ast-data-structures/NOTES.md b/experiments/ast-data-structures/NOTES.md new file mode 100644 index 000000000..4aac3df8b --- /dev/null +++ b/experiments/ast-data-structures/NOTES.md @@ -0,0 +1,34 @@ +# Alternative AST data structures + +**Origin:** ephemeral exploration (rebuilt fresh from the optimized parser's hot +path). No PR/commit. + +**Idea:** the object AST (`WP_Parser_Node`) dominates parse time — try cheaper +node representations and measure the realistic ceiling. + +**Run (each variant in its own process):** +``` +php -d ...jit... bench-variants.php --src=<.../src> --variant=object|noast|array|tape --reuse +php -d ...jit... microbench.php # object vs array-literal construction +``` + +**Result (parse-only, full corpus, best-of-7, warm JIT):** + +| variant | what parse_recursive returns | QPS | vs object | +|----------|--------------------------------------|-------|-----------| +| object | `new WP_Parser_Node(...)` (current) | ~57K | 1.00× | +| noast | true/false (recognition only) | ~246K | +330% | +| array | `[$rid, $children]` | ~55K | −5% | +| tape | flat int tape + in-place rollback | ~140K | +144% | + +Microbench: `new WP_Parser_Node(...)` ≈ 27 ns; a realistic `[$rid,$child]` array +with a live child ≈ 10 ns ⇒ ~2× (NOT the ~12× a dead-store `[1]` literal suggests — +the JIT elides that literal). + +**Verdict:** Recognition is cheap; **AST materialization is ~77% of parse time** +(~246K validate-only ceiling). 
Swapping an object for an array barely moves the +needle (−5%) because the children-array work dominates and is constant. A *flat +int tape with in-place truncation* is ~2.4× faster to BUILD (the slowness of a +"tape" only appears with a naive `array_slice` copy-on-rollback) — but a tape is +not a usable tree; consumers would need a tape walker. No drop-in node-shape win +at scale; anything dramatic needs consumer cooperation (e.g. a lazy CST). diff --git a/experiments/ast-data-structures/bench-variants.php b/experiments/ast-data-structures/bench-variants.php new file mode 100644 index 000000000..9a017f79a --- /dev/null +++ b/experiments/ast-data-structures/bench-variants.php @@ -0,0 +1,170 @@ + 'WP_Variant_Parser_Object', + 'noast' => 'WP_Variant_Parser_NoAst', + 'array' => 'WP_Variant_Parser_Array', + 'tape' => 'WP_Variant_Parser_Tape', +); +if ( ! isset( $class_map[ $variant ] ) ) { + fwrite( STDERR, "Unknown --variant=$variant\n" ); + exit( 1 ); +} +$parser_class = $class_map[ $variant ]; + +$grammar_data = include "$src/mysql/mysql-grammar.php"; +$grammar = new WP_Parser_Grammar( $grammar_data ); + +$data_dir = dirname( __DIR__, 2 ) . '/corpus'; +$handle = fopen( "$data_dir/mysql-server-tests-queries.csv", 'r' ); +$queries = array(); +while ( ( $record = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + $query = $record[0] ?? 
null; + if ( null === $query || '' === $query ) { + continue; + } + $queries[] = $query; + if ( count( $queries ) >= $limit ) { + break; + } +} +fclose( $handle ); + +$all_tokens = array(); +foreach ( $queries as $query ) { + $lexer = new WP_MySQL_Lexer( $query ); + $all_tokens[] = $lexer->remaining_tokens(); +} +$n = count( $queries ); + +$run_once = function () use ( $grammar, $all_tokens, $reuse, $parser_class ) { + $failures = 0; + $parser = null; + $start = microtime( true ); + foreach ( $all_tokens as $tokens ) { + if ( $reuse ) { + if ( null === $parser ) { + $parser = new $parser_class( $grammar, $tokens ); + } else { + $parser->reset_tokens( $tokens ); + } + } else { + $parser = new $parser_class( $grammar, $tokens ); + } + $ast = $parser->parse(); + if ( null === $ast || false === $ast ) { + ++$failures; + } + } + return array( microtime( true ) - $start, $failures ); +}; + +for ( $i = 0; $i < $warmup; $i++ ) { + $run_once(); +} + +$qpss = array(); +$fail = 0; +for ( $r = 0; $r < $runs; $r++ ) { + list( $duration, $failures ) = $run_once(); + $qpss[] = $n / $duration; + $fail = $failures; +} +sort( $qpss ); +$best = $qpss[ count( $qpss ) - 1 ]; +$median = $qpss[ intdiv( count( $qpss ), 2 ) ]; + +$jit_on = false; +$status = opcache_get_status( false ); +if ( is_array( $status ) && isset( $status['jit']['on'] ) ) { + $jit_on = (bool) $status['jit']['on']; +} + +if ( $json ) { + echo json_encode( + array( + 'variant' => $variant, + 'queries' => $n, + 'failures' => $fail, + 'qps_best' => $best, + 'qps_med' => $median, + 'jit' => $jit_on, + 'php' => PHP_VERSION, + ) + ), "\n"; + exit; +} + +printf( + "variant=%-7s queries=%d failures=%d best=%d QPS median=%d QPS jit=%s php=%s\n", + $variant, + $n, + $fail, + (int) $best, + (int) $median, + $jit_on ? 
'on' : 'off', + PHP_VERSION +); diff --git a/experiments/ast-data-structures/microbench.php b/experiments/ast-data-structures/microbench.php new file mode 100644 index 000000000..01b998db6 --- /dev/null +++ b/experiments/ast-data-structures/microbench.php @@ -0,0 +1,67 @@ +grammar = $grammar; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = array(); + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = array(); + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + $this->set_tokens( $tokens ); + } + + public function reset_tokens( array $tokens ): void { + $this->set_tokens( $tokens ); + $this->current_ast = null; + } + + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + } + + public function parse() { + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); + return false === $ast ? 
null : $ast; + } + + private function parse_recursive( $rule_id ) { + $tokens = $this->tokens; + $position = $this->position; + + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + return isset( $this->nullable_branches[ $rule_id ] ); + } else { + $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); + } + } + + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; + + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position 
]->id ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); + } + + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $children = array(); + $branch_matches = true; + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $branch_matches = false; + break; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { + $branch_matches = false; + } + if ( $branch_matches ) { + break; + } + } + + if ( ! $branch_matches ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); + } +} + +/** + * V_NoAST: pure recognition. parse_recursive never accumulates children and + * returns true on success / false on failure, only advancing $position. + * This is the validation-only (recognition) ceiling. 
+ */ +class WP_Variant_Parser_NoAst { + protected $grammar; + protected $tokens; + protected $token_count; + protected $position; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + private $built_rules = array(); + + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { + $this->grammar = $grammar; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = array(); + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = array(); + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + $this->set_tokens( $tokens ); + } + + public function reset_tokens( array $tokens ): void { + $this->set_tokens( $tokens ); + } + + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + } + + public function parse() { + // Returns true/false; the harness counts false as a failure. 
+ return $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); + } + + private function parse_recursive( $rule_id ) { + $tokens = $this->tokens; + $position = $this->position; + + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + return isset( $this->nullable_branches[ $rule_id ] ); + } else { + $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); + } + } + + $highest_terminal_id = $this->highest_terminal_id; + $is_select_statement = $rule_id === $this->select_statement_rule_id; + + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + $ok = $this->parse_recursive( $subrule_id ); + if ( false === $ok ) { + $this->position = $position; + return false; + } + } + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; + return false; + } + return true; + } + + $branch_matches = false; + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $branch_matches = true; + foreach ( $branch as 
$subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $ok = $this->parse_recursive( $subrule_id ); + if ( false === $ok ) { + $branch_matches = false; + break; + } + } + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { + $branch_matches = false; + } + if ( $branch_matches ) { + break; + } + } + + if ( ! $branch_matches ) { + $this->position = $position; + return false; + } + return true; + } +} + +/** + * V_Array: return array($rule_id, $children) instead of a WP_Parser_Node. + * Children accumulation is unchanged; fragments still splice (return raw array + * of children, distinguishable from a node by shape: node is [int, array]). + */ +class WP_Variant_Parser_Array { + protected $grammar; + protected $tokens; + protected $token_count; + protected $position; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + private $built_rules = array(); + private $current_ast; + + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { + $this->grammar = $grammar; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = array(); + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = array(); + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + $this->set_tokens( $tokens ); + } + + public function reset_tokens( array $tokens ): void { + $this->set_tokens( $tokens ); + $this->current_ast = null; + } + + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( 
WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + } + + public function parse() { + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); + return false === $ast ? null : $ast; + } + + private function parse_recursive( $rule_id ) { + $tokens = $this->tokens; + $position = $this->position; + + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + return isset( $this->nullable_branches[ $rule_id ] ); + } else { + $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); + } + } + + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; + + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode 
) { + continue; + } + // A fragment returns a raw children array (list); a node returns + // the [rule_id, children] tuple. Distinguish by key 0 being int. + if ( is_array( $subnode ) && ! ( isset( $subnode[0] ) && is_int( $subnode[0] ) && isset( $subnode[1] ) && is_array( $subnode[1] ) ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return array( $rule_id, $children ); + } + + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $children = array(); + $branch_matches = true; + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $branch_matches = false; + break; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) && ! ( isset( $subnode[0] ) && is_int( $subnode[0] ) && isset( $subnode[1] ) && is_array( $subnode[1] ) ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { + $branch_matches = false; + } + if ( $branch_matches ) { + break; + } + } + + if ( ! $branch_matches ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return array( $rule_id, $children ); + } +} + +/** + * V_Tape: a flat, append-only int tape. 
Each matched node appends two ints: + * [ rule_id, child_count ] + * Terminals append [ -1, token_index ] (negative rule_id marks a token leaf). + * On branch failure the tape is rolled back by truncating to the saved length + * (array_splice), exercising the truncation cost on multi-candidate + * backtracking. parse_recursive returns an int child tally on success (callers + * roll back via saved tape_len marks), true for empty matches, false on failure. + * + * The tape is built faithfully (every node + terminal recorded) so that the + * rollback / truncation cost is exercised on real backtracking. + */ +class WP_Variant_Parser_Tape { + protected $grammar; + protected $tokens; + protected $token_count; + protected $position; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + private $built_rules = array(); + private $tape; + private $tape_len; + + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { + $this->grammar = $grammar; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = array(); + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = array(); + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); + $this->set_tokens( $tokens ); + } + + public function reset_tokens( array $tokens ): void { + $this->set_tokens( $tokens ); + } + + protected function set_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->tape = array(); + $this->tape_len = 0; + } + + public function parse() { + $this->tape = array(); + $this->tape_len = 0; + $res = $this->parse_recursive( $this->grammar->get_or_cache_rule_id(
'query' ) ); + // false => failure (harness counts as failure via false return). + return false === $res ? false : $this->tape; + } + + /** + * Returns false on failure, true on a matched-but-emitted-nothing rule, + * or an int (a child tally, not the number of tape entries) on a successful emit. + */ + private function parse_recursive( $rule_id ) { + $tokens = $this->tokens; + $position = $this->position; + + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->built_rules[ $rule_id ] ) ) { + return isset( $this->nullable_branches[ $rule_id ] ); + } else { + $this->built_rules[ $rule_id ] = true; + $this->grammar->ensure_rule_selector( $rule_id ); + if ( isset( $this->grammar->branches_for_token[ $rule_id ] ) ) { + $this->branches_for_token[ $rule_id ] = $this->grammar->branches_for_token[ $rule_id ]; + if ( isset( $this->grammar->single_candidate_rules[ $rule_id ] ) ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } else { + return isset( $this->nullable_branches[ $rule_id ] ); + } + } + + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; + + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $tape_mark = $this->tape_len; + $child_count = 0; + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $this->tape[ $this->tape_len++ ] = -1; + $this->tape[ $this->tape_len++ ] = $this->position; + ++$this->position; + ++$child_count; + continue; + } + $this->position = $position; + $this->rollback( $tape_mark ); + return
false; + } + $sub = $this->parse_recursive( $subrule_id ); + if ( false === $sub ) { + $this->position = $position; + $this->rollback( $tape_mark ); + return false; + } + if ( true === $sub ) { + continue; + } + // Fragment splice: a fragment's entries are already on the tape; + // they count toward this node's children. We approximate the + // child count by entries appended (each child = 2 ints). + $child_count += $sub; + } + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; + $this->rollback( $tape_mark ); + return false; + } + if ( 0 === $child_count ) { + return true; + } + if ( $is_fragment ) { + // Splice: children already on tape; report the count for parent. + return $child_count; + } + $this->tape[ $this->tape_len++ ] = $rule_id; + $this->tape[ $this->tape_len++ ] = $child_count; + return $child_count; + } + + $branch_matches = false; + $child_count = 0; + $tape_mark = $this->tape_len; + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $this->rollback( $tape_mark ); + $child_count = 0; + $branch_matches = true; + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $this->tape[ $this->tape_len++ ] = -1; + $this->tape[ $this->tape_len++ ] = $this->position; + ++$this->position; + ++$child_count; + continue; + } + $branch_matches = false; + break; + } + $sub = $this->parse_recursive( $subrule_id ); + if ( false === $sub ) { + $branch_matches = false; + break; + } + if ( true === $sub ) { + continue; + } + $child_count += $sub; + } + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { + $branch_matches = false; + } + if ( $branch_matches ) { + break; + } + } + + if ( ! 
$branch_matches ) { + $this->position = $position; + $this->rollback( $tape_mark ); + return false; + } + if ( 0 === $child_count ) { + return true; + } + if ( $is_fragment ) { + return $child_count; + } + $this->tape[ $this->tape_len++ ] = $rule_id; + $this->tape[ $this->tape_len++ ] = $child_count; + return $child_count; + } + + private function rollback( $mark ) { + if ( $this->tape_len > $mark ) { + array_splice( $this->tape, $mark ); + $this->tape_len = $mark; + } + } +} From 4fb602736de715d8c8ed004df2220725f0e93598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 05/23] Add Pratt expression-cascade proposal Operator-precedence inner loop for the expr->...->simpleExpr chain. Estimated 5-25% on expression-heavy queries; not prototyped. --- experiments/pratt-expression-cascade/NOTES.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 experiments/pratt-expression-cascade/NOTES.md diff --git a/experiments/pratt-expression-cascade/NOTES.md b/experiments/pratt-expression-cascade/NOTES.md new file mode 100644 index 000000000..e3de67e22 --- /dev/null +++ b/experiments/pratt-expression-cascade/NOTES.md @@ -0,0 +1,15 @@ +# Pratt parser for the expression cascade (proposal) + +**Status:** evaluated, not implemented. No code. + +**Idea:** replace the deep `expr → boolPri → predicate → bitExpr → simpleExpr → …` +recursive cascade — where most method-dispatch overhead lives — with a single +`parseExpression(min_bp)` driven by a per-token-id `(left_bp, right_bp, parse_fn)` +table. Production C compilers (GCC, Clang) use a Pratt-style inner loop inside +their hand-written recursive descent for exactly this. + +**Premise check:** the cascade is real — `expr`, `boolPri`, `predicate`, `bitExpr`, +`simpleExpr` all exist in the grammar. + +**Verdict:** estimated 5–25% on expression-heavy queries (WHERE clauses, complex +projections); medium engineering cost, low risk. 
Worth a prototype; not yet built. From eba6f84301973b59fec0721278f5c68aabd93a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 06/23] Add LL(2) selectors proposal and supporting analysis 2-token lookahead to remove residual backtracking. Measured premise: 32.7% of rules are multi-candidate and absorb ~51% of parse calls. Estimated 5-15% at high cost; not prototyped. --- experiments/ll2-selectors/NOTES.md | 23 ++++++++++++++++++ experiments/ll2-selectors/call-split.php | 24 +++++++++++++++++++ experiments/ll2-selectors/grammar-stats.php | 26 +++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 experiments/ll2-selectors/NOTES.md create mode 100644 experiments/ll2-selectors/call-split.php create mode 100644 experiments/ll2-selectors/grammar-stats.php diff --git a/experiments/ll2-selectors/NOTES.md b/experiments/ll2-selectors/NOTES.md new file mode 100644 index 000000000..30e989e3e --- /dev/null +++ b/experiments/ll2-selectors/NOTES.md @@ -0,0 +1,23 @@ +# LL(2) selectors (proposal + supporting analysis) + +**Origin:** proposal — not implemented. The scripts here measure the premise. + +**Idea:** the parser tries multi-candidate branches in order and backtracks on +failure. With 2-token lookahead instead of 1, most multi-candidate rules would +become deterministic, eliminating residual backtracking. + +**Run (supporting measurements):** +``` +php grammar-stats.php <.../packages/mysql-on-sqlite/src> # static rule split +php call-split.php # dynamic call split +``` + +**Result (premise check):** +- 1290/1916 = **67.3%** of rules always resolve to one branch per token. +- 626/1916 = **32.7%** are multi-candidate for at least one token. +- Of ~9.33M `parse_recursive` calls, multi-candidate rules absorb **~51%**. + +**Verdict:** the premise holds — a third of rules are multi-candidate yet they take +just over half of all parse calls, so that's where deeper lookahead could help. 
+Estimated 5–15% gain at high engineering cost (LL(*)/ALL(*)-style static +analysis). Not prototyped. diff --git a/experiments/ll2-selectors/call-split.php b/experiments/ll2-selectors/call-split.php new file mode 100644 index 000000000..4a540e8b2 --- /dev/null +++ b/experiments/ll2-selectors/call-split.php @@ -0,0 +1,24 @@ +remaining_tokens(); (new Counting($g,$t))->parse(); } +$single=0;$multi=0; +foreach(Counting::$counts as $rid=>$c){ if(isset($g->single_candidate_rules[$rid]))$single+=$c; else $multi+=$c; } +$tot=$single+$multi; +printf("total parse_recursive calls: %s\n", number_format($tot)); +printf("single-candidate-rule calls: %s (%.1f%%)\n",number_format($single),100*$single/$tot); +printf("multi-candidate-rule calls: %s (%.1f%%)\n",number_format($multi),100*$multi/$tot); diff --git a/experiments/ll2-selectors/grammar-stats.php b/experiments/ll2-selectors/grammar-stats.php new file mode 100644 index 000000000..f4084f91f --- /dev/null +++ b/experiments/ll2-selectors/grammar-stats.php @@ -0,0 +1,26 @@ +build_all_selectors(); +$total_rules = count( $g->rule_names ); +$frag = count( $g->fragment_ids ?? array() ); +$single = count( $g->single_candidate_rules ?? array() ); +// branches_for_token: rule_id => [token_id => branches]. Count rules that, for +// EVERY token they accept, resolve to exactly one branch. 
+$bft = $g->branches_for_token; +$rules_with_selectors = count( $bft ); +$always_single = 0; $multi_anywhere = 0; +foreach ( $bft as $rid => $by_tok ) { + $max = 0; + foreach ( $by_tok as $branches ) { $max = max( $max, count( $branches ) ); } + if ( $max <= 1 ) { $always_single++; } else { $multi_anywhere++; } +} +printf("total rule_names: %d\n", $total_rules); +printf("fragments: %d\n", $frag); +printf("rules with selectors (branches_for_token): %d\n", $rules_with_selectors); +printf("single_candidate_rules (flagged): %d (%.1f%% of all rules)\n", $single, 100*$single/$total_rules); +printf("rules always-single-per-token: %d (%.1f%% of selector rules)\n", $always_single, 100*$always_single/$rules_with_selectors); +printf("rules multi-candidate for some token: %d (%.1f%% of selector rules)\n", $multi_anywhere, 100*$multi_anywhere/$rules_with_selectors); From e050bd8ff4e1eeef69596327fda4be8037308b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 07/23] Add table-driven LALR(1) proposal kmyacc/nikic-style action-goto table interpreter. Reality check: hand-written RD (tolerant-php-parser) ~40% faster than kmyacc-LALR (nikic) on PHP source. Not prototyped. --- experiments/lalr-table-driven/NOTES.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 experiments/lalr-table-driven/NOTES.md diff --git a/experiments/lalr-table-driven/NOTES.md b/experiments/lalr-table-driven/NOTES.md new file mode 100644 index 000000000..df9172c23 --- /dev/null +++ b/experiments/lalr-table-driven/NOTES.md @@ -0,0 +1,16 @@ +# Table-driven LALR(1) in pure PHP (proposal) + +**Status:** evaluated via reality-check only, not implemented. No code. + +**Idea:** generate an action/goto table from a yacc-style grammar and interpret it +in a tight while loop — the shape nikic/PHP-Parser uses to parse PHP itself +(kmyacc-generated LALR). 
+ +**Reality check:** on parsing PHP source, microsoft/tolerant-php-parser (hand-written +recursive descent) is roughly 40% faster than nikic/PHP-Parser (kmyacc-LALR). The +intuition that "LALR is faster because there's no method dispatch" doesn't clearly +hold in PHP: table dispatch in the hot loop can cost more than method calls the JIT +can inline. + +**Verdict:** worth a focused spike if we accept the grammar-conversion cost; not a +clear win, and not prototyped. From 8a159eeae3b7487d21c05cfe4710a71022d806de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:00 +0200 Subject: [PATCH 08/23] Add packed-binary vs PHP-array lookup microbench pack/unpack loses ~4x on hot-path random lookups but wins ~5x on bulk decode. --- experiments/packed-table-lookups/NOTES.md | 22 +++++++++++++++ .../packed-table-lookups/pack-microbench.php | 27 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 experiments/packed-table-lookups/NOTES.md create mode 100644 experiments/packed-table-lookups/pack-microbench.php diff --git a/experiments/packed-table-lookups/NOTES.md b/experiments/packed-table-lookups/NOTES.md new file mode 100644 index 000000000..e4888cd85 --- /dev/null +++ b/experiments/packed-table-lookups/NOTES.md @@ -0,0 +1,22 @@ +# Packed binary vs PHP-array table lookups + +**Origin:** ephemeral microbenchmark. No PR/commit. + +**Idea:** could a parser action/selector table be stored as a packed binary string +(substr + unpack) more cheaply than nested/flat PHP arrays? + +**Run:** `php -d ...jit... pack-microbench.php` (2000×300 table, 2^20 random probes). 
+ +**Result (ns per lookup, warm JIT):** + +| operation | ns/lookup | +|----------------------------------|-----------| +| nested PHP array `$a[$s][$t]` | ~13.7 | +| flat PHP array `$a[$s*W+$t]` | ~9.7 | +| packed binary `substr`+`unpack` | ~40 (≈4× slower) | +| bulk `unpack('n*', $bytes)` | ~4.9× faster than an `ord()` loop | + +**Verdict:** pack/unpack wins for BULK decoding at boundaries (one call, many ints) +but loses ~4× on hot-path random lookups. opcache-shared PHP arrays beat any +packed action table for per-step dispatch. Useful as a serialization primitive, +not a hot-path primitive. diff --git a/experiments/packed-table-lookups/pack-microbench.php b/experiments/packed-table-lookups/pack-microbench.php new file mode 100644 index 000000000..f3183f8f1 --- /dev/null +++ b/experiments/packed-table-lookups/pack-microbench.php @@ -0,0 +1,27 @@ +>8)%$W; } +function ns($dur,$ops){ return $dur/$ops*1e9; } +$best=array(); +foreach(array('nested','flat','packed') as $mode){ + $bestd=INF; + for($r=0;$r<7;$r++){ + $acc=0; $t0=microtime(true); + if($mode==='nested'){ for($i=0;$i<$P;$i++){ $acc+=$nested[$ss[$i]][$tt[$i]]; } } + elseif($mode==='flat'){ for($i=0;$i<$P;$i++){ $acc+=$flat[$ss[$i]*$W+$tt[$i]]; } } + else { for($i=0;$i<$P;$i++){ $o=($ss[$i]*$W+$tt[$i])*2; $u=unpack('n',substr($bin,$o,2)); $acc+=$u[1]; } } + $d=microtime(true)-$t0; if($d<$bestd)$bestd=$d; } + printf("%-8s %.2f ns/lookup (acc=%d)\n",$mode,ns($bestd,$P),$acc); + $best[$mode]=ns($bestd,$P); +} +printf("packed / flat ratio: %.1fx slower\n", $best['packed']/$best['flat']); +// Bulk decode: unpack('n*') vs ord loop, over the whole table. 
+$bd=INF; for($r=0;$r<7;$r++){ $t0=microtime(true); $arr=unpack('n*',$bin); $d=microtime(true)-$t0; if($d<$bd)$bd=$d; } +$od=INF; for($r=0;$r<7;$r++){ $t0=microtime(true); $a=array(); $len=strlen($bin); for($i=0;$i<$len;$i+=2){ $a[]= (ord($bin[$i])<<8)|ord($bin[$i+1]); } $d=microtime(true)-$t0; if($d<$od)$od=$d; } +printf("bulk unpack('n*'): %.4fs ord-loop: %.4fs => unpack %.1fx faster\n",$bd,$od,$od/$bd); From c2cb3d9e6ef14e04bcf9d33aa260c61195a5a0e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 09/23] Add full-PCRE grammar recognizer experiment 76KB pattern, 1127 named subroutines, ~98K QPS, 99.85% recognized. A recognizer, not a parser. From local branch _parser_perf. --- experiments/full-pcre-recognizer/NOTES.md | 20 ++ .../full-pcre-recognizer/exp-regex-v3.php | 288 ++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100644 experiments/full-pcre-recognizer/NOTES.md create mode 100644 experiments/full-pcre-recognizer/exp-regex-v3.php diff --git a/experiments/full-pcre-recognizer/NOTES.md b/experiments/full-pcre-recognizer/NOTES.md new file mode 100644 index 000000000..225978031 --- /dev/null +++ b/experiments/full-pcre-recognizer/NOTES.md @@ -0,0 +1,20 @@ +# Full-PCRE grammar recognizer + +**Origin:** local branch `_parser_perf` (commit e0c09f8f), `exp-regex-v3.php`. No PR. + +**Idea:** fold the whole grammar into one extended PCRE pattern using +`(?(DEFINE)...)` named subroutines, encode each token id as a codepoint +(offset 0x4000), `(*THEN)` on disjoint-FIRST branches, and inline single-use +rules to a fixpoint. PCRE2's JIT then recognizes queries. + +**Run:** `php -d ...jit... exp-regex-v3.php 100000` + +**Result:** pattern = 76,488 bytes, 1127 named subroutines (789 rules inlined). +Recognition ≈ **98K QPS**, recognizing 99.85% of the corpus. 
+ +**Verdict:** It is a *recognizer*, not a parser — PCRE2 returns only last-write-wins +ovector slots plus one `(*MARK)`, so per-recursion-frame structure can't be +recovered in stock PHP. At ~98K it is faster than the AST-building parser (~57K), +but ~2.6× SLOWER than that same parser run in validate-only mode (~246K) — so as a +pure recognizer it loses. Still, it inspires the hybrids (shape fast-path, FFI +callouts). diff --git a/experiments/full-pcre-recognizer/exp-regex-v3.php b/experiments/full-pcre-recognizer/exp-regex-v3.php new file mode 100644 index 000000000..256c51e04 --- /dev/null +++ b/experiments/full-pcre-recognizer/exp-regex-v3.php @@ -0,0 +1,288 @@ +lowest_non_terminal_id; + +// Count how many times each rule is referenced. +function ref_counts( WP_Parser_Grammar $g ) { + $low_nt = $g->lowest_non_terminal_id; + $refs = array(); + foreach ( $g->rules as $rid => $branches ) { + $refs[ $rid ] = 0; + } + foreach ( $g->rules as $rid => $branches ) { + foreach ( $branches as $b ) { + foreach ( $b as $sym ) { + if ( $sym >= $low_nt ) { + $refs[ $sym ] = ( $refs[ $sym ] ?? 0 ) + 1; + } + } + } + } + return $refs; +} + +// FIRST and NULLABLE. +$rules = $grammar->rules; +$nullable = array(); +$first = array(); +foreach ( $rules as $rid => $_ ) { + $nullable[ $rid ] = false; + $first[ $rid ] = array(); +} +do { + $changed = false; + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + $bn = true; + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + if ( ! isset( $first[ $rid ][ $sym ] ) ) { + $first[ $rid ][ $sym ] = true; + $changed = true; + } + $bn = false; + break; + } + foreach ( $first[ $sym ] as $tid => $_ ) { + if ( ! isset( $first[ $rid ][ $tid ] ) ) { + $first[ $rid ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $sym ] ) { + $bn = false; + break; + } + } + if ( $bn && ! 
$nullable[ $rid ] ) { + $nullable[ $rid ] = true; + $changed = true; + } + } + } +} while ( $changed ); + +// Compile each rule into a "regex body" string. Inline single-use +// non-recursive rules into their callers transitively via memoization. +$single_candidate_rules = $grammar->single_candidate_rules ?? array(); +$select_rid = $grammar->get_rule_id( 'selectStatement' ); +$into_char = token_char( WP_MySQL_Lexer::INTO_SYMBOL ); +$compiled = array(); +$visiting = array(); +$compile_rule = function ( $rid ) use ( &$compile_rule, &$compiled, &$visiting, $rules, $first, $nullable, $low_nt, $single_candidate_rules, $select_rid, $into_char ) { + if ( isset( $compiled[ $rid ] ) ) { + return $compiled[ $rid ]; + } + $visiting[ $rid ] = true; + $alts = array(); + $safe_then = isset( $single_candidate_rules[ $rid ] ); + foreach ( $rules[ $rid ] as $branch ) { + $alt = ''; + foreach ( $branch as $i => $sym ) { + if ( $sym < $low_nt ) { + $alt .= token_char( $sym ); + } else { + $alt .= "RREF{$sym}RREF"; + } + // (*THEN) commits the alternative once the first symbol matches. + // Only safe when sibling branches of this rule have disjoint + // FIRST sets - that property is captured by + // $grammar->single_candidate_rules. Outside that set, multiple + // branches can share a first token and committing prematurely + // would yield spurious match failures. + if ( 0 === $i && $safe_then ) { + $alt .= '(*THEN)'; + } + } + $alts[] = $alt; + } + unset( $visiting[ $rid ] ); + $body = '(?:' . implode( '|', $alts ) . ')'; + if ( $rid === $select_rid ) { + // Mirror the negative lookahead the parser uses: a successful + // selectStatement match must not be followed by INTO. Otherwise + // the surrounding rule should pick a different alternative. + $body .= '(?!' . $into_char . ')'; + } + $compiled[ $rid ] = $body; + return $compiled[ $rid ]; +}; + +// First pass: compile every rule once. 
+foreach ( array_keys( $rules ) as $rid ) { + $compile_rule( $rid ); +} + +// Second pass: inline single-use non-recursive rules. A rule is +// inlinable if its body doesn't reference itself transitively. Repeat +// to fixpoint - inlining changes ref counts. +$inlined_count = 0; +do { + $changed = false; + $refs = array(); + foreach ( $compiled as $rid => $body ) { + $refs[ $rid ] = 0; + } + foreach ( $compiled as $rid => $body ) { + if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) { + foreach ( $m[1] as $r ) { + $refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1; + } + } + } + foreach ( $compiled as $rid => $body ) { + if ( ( $refs[ $rid ] ?? 0 ) !== 1 ) { + continue; + } + // Don't inline recursive rules. + if ( strpos( $body, "RREF{$rid}RREF" ) !== false ) { + continue; + } + // Replace the single reference somewhere. + foreach ( $compiled as $caller_rid => $caller_body ) { + if ( strpos( $caller_body, "RREF{$rid}RREF" ) !== false ) { + $compiled[ $caller_rid ] = str_replace( "RREF{$rid}RREF", $body, $caller_body ); + unset( $compiled[ $rid ] ); + ++$inlined_count; + $changed = true; + break 2; // restart from top so refs recount with the new state + } + } + } +} while ( $changed ); + +// Now compile remaining rules with named subroutines. +$rule_to_idx = array(); +$idx_to_rule = array(); +foreach ( $compiled as $rid => $_ ) { + $rule_to_idx[ $rid ] = count( $idx_to_rule ); + $idx_to_rule[] = $rid; +} + +$define = ''; +foreach ( $idx_to_rule as $rid ) { + $body = $compiled[ $rid ]; + // Replace RREF placeholders with named-group references. + $body = preg_replace_callback( + '/RREF(\d+)RREF/', + function ( $m ) use ( $rule_to_idx ) { + $rid = (int) $m[1]; + return '(?&r' . $rule_to_idx[ $rid ] . ')'; + }, + $body + ); + $define .= "(?{$body})"; +} + +$start_rid = $grammar->get_rule_id( 'query' ); +$pattern = '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u'; +printf( + "Inlined %d rules. Final rules: %d. 
Pattern: %s bytes\n", + $inlined_count, + count( $idx_to_rule ), + number_format( strlen( $pattern ) ) +); + +ini_set( 'pcre.backtrack_limit', '1000000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); +ini_set( 'pcre.jit', '1' ); + +$t = microtime( true ); +$ok = @preg_match( $pattern, "\xff", $m ); +printf( + "Compile: %.2fms, ok=%s, err=%s\n", + ( microtime( true ) - $t ) * 1000, + var_export( $ok, true ), + preg_last_error_msg() +); +if ( false === $ok && PREG_BAD_UTF8_ERROR !== preg_last_error() ) { + echo "Pattern doesn't compile cleanly. Bailing.\n"; + exit( 1 ); +} + +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 5000 ) ); + +$encoded = array(); +foreach ( $queries as $q ) { + $tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); + $s = ''; + foreach ( $tokens as $t ) { + $s .= token_char( $t->id ); + } + $encoded[] = $s; +} + +$t = microtime( true ); +$matched = 0; +$failed = 0; +$errors = 0; +$failed_examples = array(); +$slow = array(); +foreach ( $encoded as $i => $s ) { + $qstart = microtime( true ); + $r = @preg_match( $pattern, $s ); + $qd = microtime( true ) - $qstart; + if ( 1 === $r ) { + ++$matched; + } elseif ( 0 === $r ) { + ++$failed; + if ( count( $failed_examples ) < 10 ) { + $failed_examples[] = substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 120 ); + } + } else { + ++$errors; } + if ( $qd > 0.005 && count( $slow ) < 3 ) { + $slow[] = sprintf( '%6.0fms: %s', $qd * 1000, substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 100 ) ); + } +} +$d = microtime( true ) - $t; +printf( + "Matched=%d, Failed=%d, Errors=%d, time=%.4fs (%d QPS)\n", + $matched, + $failed, + $errors, + $d, + count( $encoded ) / $d +); +echo 
"\nFailed queries:\n"; +foreach ( $failed_examples as $e ) { + echo " $e\n"; +} +echo "\nSlow queries:\n"; +foreach ( $slow as $e ) { + echo " $e\n"; +} From 0213f11dfca17f793cdb9d00155c67ea110eeda1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 10/23] Add regex pre-validate + parser hybrid experiment Slower than the parser alone (~50K vs ~65K): nearly all input is valid, so the gate is pure overhead. From local branch _parser_perf. --- experiments/regex-prevalidate-hybrid/NOTES.md | 16 ++ .../exp-regex-hybrid.php | 231 ++++++++++++++++++ 2 files changed, 247 insertions(+) create mode 100644 experiments/regex-prevalidate-hybrid/NOTES.md create mode 100644 experiments/regex-prevalidate-hybrid/exp-regex-hybrid.php diff --git a/experiments/regex-prevalidate-hybrid/NOTES.md b/experiments/regex-prevalidate-hybrid/NOTES.md new file mode 100644 index 000000000..afca9fd46 --- /dev/null +++ b/experiments/regex-prevalidate-hybrid/NOTES.md @@ -0,0 +1,16 @@ +# Regex pre-validate + parser hybrid + +**Origin:** local branch `_parser_perf` (commit 9d36df4c), `exp-regex-hybrid.php`. No PR. + +**Idea:** run the full-PCRE recognizer first as a fast yes/no gate; only invoke the +AST-building parser when the regex confirms the query is valid. + +**Run:** `php -d ...jit... exp-regex-hybrid.php 100000` + +**Result (full corpus, warm JIT):** regex-only ≈ 95K QPS; parser-only (AST) ≈ 65K; +regex + parser ≈ **50K**. + +**Verdict:** The hybrid is slower than the parser alone, because essentially every +corpus query is valid SQL — the parser still has to run to build the AST, so the +regex is pure overhead. Pre-validation only pays when invalid input is the common +case, which it isn't in any realistic workload. 
diff --git a/experiments/regex-prevalidate-hybrid/exp-regex-hybrid.php b/experiments/regex-prevalidate-hybrid/exp-regex-hybrid.php new file mode 100644 index 000000000..e7bc59024 --- /dev/null +++ b/experiments/regex-prevalidate-hybrid/exp-regex-hybrid.php @@ -0,0 +1,231 @@ +lowest_non_terminal_id; + $rules = $grammar->rules; + $nullable = array(); + $first = array(); + foreach ( $rules as $rid => $_ ) { + $nullable[ $rid ] = false; + $first[ $rid ] = array(); + } + do { + $changed = false; + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + $bn = true; + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + if ( ! isset( $first[ $rid ][ $sym ] ) ) { + $first[ $rid ][ $sym ] = true; + $changed = true; + } + $bn = false; + break; + } + foreach ( $first[ $sym ] as $tid => $_ ) { + if ( ! isset( $first[ $rid ][ $tid ] ) ) { + $first[ $rid ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $sym ] ) { + $bn = false; + break; + } + } + if ( $bn && ! $nullable[ $rid ] ) { + $nullable[ $rid ] = true; + $changed = true; + } + } + } + } while ( $changed ); + + $single_candidate_rules = $grammar->single_candidate_rules ?? array(); + $select_rid = $grammar->get_rule_id( 'selectStatement' ); + $into_char = mb_chr( WP_MySQL_Lexer::INTO_SYMBOL + TOKEN_OFFSET, 'UTF-8' ); + + $compiled = array(); + $compile = function ( $rid ) use ( &$compile, &$compiled, $rules, $low_nt, $single_candidate_rules, $select_rid, $into_char ) { + if ( isset( $compiled[ $rid ] ) ) { + return $compiled[ $rid ]; + } + $alts = array(); + $st = isset( $single_candidate_rules[ $rid ] ); + foreach ( $rules[ $rid ] as $branch ) { + $alt = ''; + foreach ( $branch as $i => $sym ) { + if ( $sym < $low_nt ) { + $alt .= mb_chr( $sym + TOKEN_OFFSET, 'UTF-8' ); + } else { + $alt .= "RREF{$sym}RREF"; + } + if ( 0 === $i && $st ) { + $alt .= '(*THEN)'; + } + } + $alts[] = $alt; + } + $body = '(?:' . implode( '|', $alts ) . 
')'; + if ( $rid === $select_rid ) { + $body .= '(?!' . $into_char . ')'; + } + $compiled[ $rid ] = $body; + return $compiled[ $rid ]; + }; + foreach ( array_keys( $rules ) as $rid ) { + $compile( $rid ); + } + + // Inline single-use rules. + do { + $changed = false; + $refs = array(); + foreach ( $compiled as $rid => $_ ) { + $refs[ $rid ] = 0; + } + foreach ( $compiled as $rid => $body ) { + if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) { + foreach ( $m[1] as $r ) { + $refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1; + } + } + } + foreach ( $compiled as $rid => $body ) { + if ( ( $refs[ $rid ] ?? 0 ) !== 1 || strpos( $body, "RREF{$rid}RREF" ) !== false ) { + continue; + } + foreach ( $compiled as $cr => $cb ) { + if ( strpos( $cb, "RREF{$rid}RREF" ) !== false ) { + $compiled[ $cr ] = str_replace( "RREF{$rid}RREF", $body, $cb ); + unset( $compiled[ $rid ] ); + $changed = true; + break 2; + } + } + } + } while ( $changed ); + + $rule_to_idx = array(); + foreach ( $compiled as $rid => $_ ) { + $rule_to_idx[ $rid ] = count( $rule_to_idx ); + } + $define = ''; + foreach ( $compiled as $rid => $body ) { + $body = preg_replace_callback( + '/RREF(\d+)RREF/', + function ( $m ) use ( $rule_to_idx ) { + return '(?&r' . $rule_to_idx[ (int) $m[1] ] . ')'; + }, + $body + ); + $define .= "(?{$body})"; + } + $start_rid = $grammar->get_rule_id( 'query' ); + return '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u'; +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$pattern = compile_regex( $grammar ); + +ini_set( 'pcre.backtrack_limit', '1000000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); +ini_set( 'pcre.jit', '1' ); +ini_set( 'pcre.jit_stacksize', '32M' ); + +$handle = fopen( __DIR__ . 
'/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; + } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 10000 ) ); + +// Pre-tokenize and pre-encode. +$pairs = array(); +foreach ( $queries as $q ) { + $tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); + $enc = ''; + foreach ( $tokens as $t ) { + $enc .= mb_chr( $t->id + TOKEN_OFFSET, 'UTF-8' ); + } + $pairs[] = array( $tokens, $enc ); +} +printf( "Loaded %d queries\n", count( $pairs ) ); + +// 1. Just regex match. +$start = microtime( true ); +$ok = 0; +foreach ( $pairs as $p ) { + if ( @preg_match( $pattern, $p[1] ) === 1 ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( "regex only: %.4fs (%d QPS, %d/%d match)\n", $d, count( $pairs ) / $d, $ok, count( $pairs ) ); + +// 2. Just parser (build AST). +$start = microtime( true ); +$ok = 0; +foreach ( $pairs as $p ) { + if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( "parser only (AST): %.4fs (%d QPS, %d/%d match)\n", $d, count( $pairs ) / $d, $ok, count( $pairs ) ); + +// 3. Hybrid: regex first; on success run the parser to build AST. Pure +// overhead: same parser runs, plus the regex. 
+$start = microtime( true ); +$ok = 0; +$regex_failed = 0; +foreach ( $pairs as $p ) { + if ( @preg_match( $pattern, $p[1] ) !== 1 ) { + ++$regex_failed; + continue; + } + if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( + "regex + parser: %.4fs (%d QPS, %d/%d match, %d regex-rejected)\n", + $d, + count( $pairs ) / $d, + $ok, + count( $pairs ), + $regex_failed +); From 942d18e5bc41b9704bc3b630d7ec95ff25355a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 11/23] Add multi-shape regex direct-AST fast parser Per-shape PCRE2 union (*MARK) builds the tree directly for ~19% of queries; ~1.18x overall, byte-identical AST. From local branch parser-fast-path. --- experiments/multishape-fast-parser/NOTES.md | 22 + .../class-wp-mysql-fast-parser.php | 1209 +++++++++++++++++ 2 files changed, 1231 insertions(+) create mode 100644 experiments/multishape-fast-parser/NOTES.md create mode 100644 experiments/multishape-fast-parser/class-wp-mysql-fast-parser.php diff --git a/experiments/multishape-fast-parser/NOTES.md b/experiments/multishape-fast-parser/NOTES.md new file mode 100644 index 000000000..d9cbc82f1 --- /dev/null +++ b/experiments/multishape-fast-parser/NOTES.md @@ -0,0 +1,22 @@ +# Multi-shape regex → direct AST construction + +**Origin:** local branches `parser-fast-path` / `perf-with-fastpath` +(commits d3436f92, 9630eb93). No PR. `class-wp-mysql-fast-parser.php` (1209 lines). + +**Idea:** for a curated set of common query shapes (INSERT/SELECT/UPDATE/DELETE/ +DROP/SHOW/USE/TRUNCATE/SET/EXPLAIN/BEGIN/COMMIT/ROLLBACK), detect the shape with a +single PCRE2 union pattern using `(*MARK:NAME)`, then build the `WP_Parser_Node` +tree directly in PHP. On a miss, fall through to the recursive parser unchanged. + +**Run:** wired into `WP_MySQL_Parser::parse()` at token position 0; benchmark +parse-only with the fast path toggled on/off. 
+ +**Result:** on a 30K subset, overall ≈ **1.18×** (76K → 90K QPS); **19.06%** +(5,718/30K) of queries hit the fast path; on those queries the speedup is ≈3.4×. +The produced AST is byte-for-byte identical to the recursive parser's +(0 mismatches across the full corpus). + +**Verdict:** Real, modest, and orthogonal to the main parser — the one regex-based +hybrid that actually works, because it sidesteps the recursive descent entirely for +the shapes it knows. More shapes = more wins, but each shape needs a hand-written +builder. diff --git a/experiments/multishape-fast-parser/class-wp-mysql-fast-parser.php b/experiments/multishape-fast-parser/class-wp-mysql-fast-parser.php new file mode 100644 index 000000000..4b24423e8 --- /dev/null +++ b/experiments/multishape-fast-parser/class-wp-mysql-fast-parser.php @@ -0,0 +1,1209 @@ + builder method name. + * + * @var array + */ + private $builders = array( + 'insert' => 'build_insert', + 'drop' => 'build_drop', + 'show' => 'build_show', + 'select' => 'build_select', + 'update' => 'build_update', + 'delete' => 'build_delete', + 'set' => 'build_set', + 'use' => 'build_use', + 'begin' => 'build_begin', + 'commit' => 'build_commit', + 'rollback' => 'build_rollback', + 'truncate' => 'build_truncate', + 'explain' => 'build_explain', + ); + + /** + * Map of rule name => rule id, populated lazily. + * + * @var array + */ + private $rule_ids = array(); + + /** + * Reference to the grammar so rule ids can be resolved. + * + * @var WP_Parser_Grammar + */ + private $grammar; + + public function __construct( WP_Parser_Grammar $grammar ) { + $this->grammar = $grammar; + $this->union_pattern = $this->build_union_pattern(); + } + + /** + * Encode a token stream into the codepoint string used by the union + * pattern. One codepoint per token at offset TOKEN_OFFSET + token_id. + * + * @param WP_Parser_Token[] $tokens The token stream. + * @param int|null $count Optional. Number of tokens to encode + * (from index 0). 
Defaults to count($tokens). + * @return string The encoded UTF-8 string. + */ + public static function encode_tokens( array $tokens, ?int $count = null ): string { + if ( null === $count ) { + $count = count( $tokens ); + } + $out = ''; + for ( $i = 0; $i < $count; $i++ ) { + $cp = $tokens[ $i ]->id + self::TOKEN_OFFSET; + // Inline mb_chr for BMP codepoints (faster on the hot path; all + // token ids land below U+10000). + if ( $cp < 0x80 ) { + $out .= chr( $cp ); + } elseif ( $cp < 0x800 ) { + $out .= chr( 0xC0 | ( $cp >> 6 ) ) . chr( 0x80 | ( $cp & 0x3F ) ); + } else { + $out .= chr( 0xE0 | ( $cp >> 12 ) ) . chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $cp & 0x3F ) ); + } + } + return $out; + } + + /** + * Try to parse the given token stream via the regex fast-path. + * + * @param WP_Parser_Token[] $tokens The token stream. + * @param string $encoded The codepoint-encoded token string. + * @return WP_Parser_Node|null The AST on hit, null on miss/error. + */ + public function try_parse( array $tokens, string $encoded ): ?WP_Parser_Node { + if ( ! preg_match( $this->union_pattern, $encoded, $m ) ) { + return null; + } + if ( ! isset( $m['MARK'] ) ) { + return null; + } + $kind = $m['MARK']; + if ( ! isset( $this->builders[ $kind ] ) ) { + return null; + } + $method = $this->builders[ $kind ]; + // Builder errors fall back to the recursive parser. They should not + // happen in practice (the regex fully validates the shape) but we + // must not let an internal slip break parsing. + try { + return $this->$method( $tokens ); + } catch ( Throwable $e ) { + return null; + } + } + + /** + * Build the UNION pattern that detects all supported shapes in a single + * preg_match. Each branch ends with `\z(*MARK:)(*ACCEPT)` so the + * marker survives JIT short-circuit and the match anchors the end of the + * input. 
+ */ + private function build_union_pattern(): string { + $tc = static function ( int $tid ): string { + return self::cp_to_utf8( $tid + self::TOKEN_OFFSET ); + }; + + // Token-class shorthands. + $ident_re = '[' . $tc( WP_MySQL_Lexer::IDENTIFIER ) . $tc( WP_MySQL_Lexer::BACK_TICK_QUOTED_ID ) . ']'; + $ident_re_tbl = '[' . $tc( WP_MySQL_Lexer::IDENTIFIER ) . $tc( WP_MySQL_Lexer::BACK_TICK_QUOTED_ID ) . $tc( WP_MySQL_Lexer::DOUBLE_QUOTED_TEXT ) . ']'; + $lit_re = '[' . $tc( WP_MySQL_Lexer::INT_NUMBER ) . $tc( WP_MySQL_Lexer::SINGLE_QUOTED_TEXT ) . $tc( WP_MySQL_Lexer::DOUBLE_QUOTED_TEXT ) . $tc( WP_MySQL_Lexer::NULL_SYMBOL ) . $tc( WP_MySQL_Lexer::PARAM_MARKER ) . ']'; + $idlit_re = '[' . $tc( WP_MySQL_Lexer::INT_NUMBER ) . $tc( WP_MySQL_Lexer::SINGLE_QUOTED_TEXT ) . $tc( WP_MySQL_Lexer::DOUBLE_QUOTED_TEXT ) . $tc( WP_MySQL_Lexer::NULL_SYMBOL ) . $tc( WP_MySQL_Lexer::PARAM_MARKER ) . $tc( WP_MySQL_Lexer::IDENTIFIER ) . $tc( WP_MySQL_Lexer::BACK_TICK_QUOTED_ID ) . ']'; + + $tail = '(?:' . $tc( WP_MySQL_Lexer::SEMICOLON_SYMBOL ) . ')?+(?:' . $tc( WP_MySQL_Lexer::EOF ) . ')?+'; + $col_simple = $ident_re . '(?:' . $tc( WP_MySQL_Lexer::DOT_SYMBOL ) . $ident_re . ')?+'; + $tbl_re = $ident_re_tbl . '(?:' . $tc( WP_MySQL_Lexer::DOT_SYMBOL ) . $ident_re_tbl . ')?+'; + $assign_re = $ident_re . $tc( WP_MySQL_Lexer::EQUAL_OPERATOR ) . $idlit_re; + $one_eq_re = $col_simple . $tc( WP_MySQL_Lexer::EQUAL_OPERATOR ) . $idlit_re; + $where_re = '(?:' . $tc( WP_MySQL_Lexer::WHERE_SYMBOL ) . $one_eq_re . '(?:' . $tc( WP_MySQL_Lexer::AND_SYMBOL ) . $one_eq_re . ')*+)?+'; + $row_re = $tc( WP_MySQL_Lexer::OPEN_PAR_SYMBOL ) . $lit_re . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $lit_re . ')*+' . $tc( WP_MySQL_Lexer::CLOSE_PAR_SYMBOL ); + + // INSERT INTO t [(cols...)] VALUES (...) [, (...)]*. + $pat_insert = $tc( WP_MySQL_Lexer::INSERT_SYMBOL ) . $tc( WP_MySQL_Lexer::INTO_SYMBOL ) . $tbl_re + . '(?:' . $tc( WP_MySQL_Lexer::OPEN_PAR_SYMBOL ) . $ident_re . '(?:' . 
$tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $ident_re . ')*+' . $tc( WP_MySQL_Lexer::CLOSE_PAR_SYMBOL ) . ')?+' + . $tc( WP_MySQL_Lexer::VALUES_SYMBOL ) + . $row_re . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $row_re . ')*+'; + + // DROP TABLE [IF EXISTS] t1 [, t2]*. + $pat_drop = $tc( WP_MySQL_Lexer::DROP_SYMBOL ) . $tc( WP_MySQL_Lexer::TABLE_SYMBOL ) + . '(?:' . $tc( WP_MySQL_Lexer::IF_SYMBOL ) . $tc( WP_MySQL_Lexer::EXISTS_SYMBOL ) . ')?+' + . $tbl_re . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $tbl_re . ')*+'; + + // SHOW family. + $show_bare_kw = '[' . $tc( WP_MySQL_Lexer::TABLES_SYMBOL ) . $tc( WP_MySQL_Lexer::DATABASES_SYMBOL ) . $tc( WP_MySQL_Lexer::VARIABLES_SYMBOL ) + . $tc( WP_MySQL_Lexer::STATUS_SYMBOL ) . $tc( WP_MySQL_Lexer::WARNINGS_SYMBOL ) . $tc( WP_MySQL_Lexer::ERRORS_SYMBOL ) + . $tc( WP_MySQL_Lexer::EVENTS_SYMBOL ) . $tc( WP_MySQL_Lexer::TRIGGERS_SYMBOL ) . $tc( WP_MySQL_Lexer::PLUGINS_SYMBOL ) + . $tc( WP_MySQL_Lexer::GRANTS_SYMBOL ) . ']'; + $show_optype_kw = '[' . $tc( WP_MySQL_Lexer::SESSION_SYMBOL ) . $tc( WP_MySQL_Lexer::GLOBAL_SYMBOL ) . ']'; + $show_optype_var = '[' . $tc( WP_MySQL_Lexer::VARIABLES_SYMBOL ) . $tc( WP_MySQL_Lexer::STATUS_SYMBOL ) . ']'; + $show_keys_kw = '[' . $tc( WP_MySQL_Lexer::KEYS_SYMBOL ) . $tc( WP_MySQL_Lexer::INDEX_SYMBOL ) . $tc( WP_MySQL_Lexer::INDEXES_SYMBOL ) . ']'; + $show_pf_kw = '[' . $tc( WP_MySQL_Lexer::PROCEDURE_SYMBOL ) . $tc( WP_MySQL_Lexer::FUNCTION_SYMBOL ) . ']'; + + $pat_show = $tc( WP_MySQL_Lexer::SHOW_SYMBOL ) . '(?:' + . '(?:' . $tc( WP_MySQL_Lexer::CREATE_SYMBOL ) . $tc( WP_MySQL_Lexer::TABLE_SYMBOL ) . $tbl_re . ')' + . '|(?:' . $tc( WP_MySQL_Lexer::CREATE_SYMBOL ) . $tc( WP_MySQL_Lexer::DATABASE_SYMBOL ) . $ident_re . ')' + . '|(?:' . $tc( WP_MySQL_Lexer::CREATE_SYMBOL ) . $show_pf_kw . $tbl_re . ')' + . '|(?:' . $show_optype_kw . $show_optype_var . ')' + . '|(?:' . $show_pf_kw . $tc( WP_MySQL_Lexer::STATUS_SYMBOL ) . ')' + . '|(?:' . $tc( WP_MySQL_Lexer::COLUMNS_SYMBOL ) . 
$tc( WP_MySQL_Lexer::FROM_SYMBOL ) . $tbl_re . ')' + . '|(?:' . $show_keys_kw . $tc( WP_MySQL_Lexer::FROM_SYMBOL ) . $tbl_re . ')' + . '|(?:' . $show_bare_kw . ')' + . ')'; + + // SELECT (* | col [AS x] [, col [AS x]]*) FROM t [AS x] [, t [AS x]]* + // [WHERE ...] [ORDER BY ...] [LIMIT ...]. + $select_alias_re = '(?:' . $tc( WP_MySQL_Lexer::AS_SYMBOL ) . '?+' . $ident_re . ')?+'; + $one_select_item = $col_simple . $select_alias_re; + $select_items = '(?:' . $tc( WP_MySQL_Lexer::MULT_OPERATOR ) . '|' . $one_select_item + . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $one_select_item . ')*+)'; + $tbl_alias_re = '(?:' . $tc( WP_MySQL_Lexer::AS_SYMBOL ) . '?+' . $ident_re . ')?+'; + $one_tbl = $tbl_re . $tbl_alias_re; + $tbl_list = $one_tbl . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $one_tbl . ')*+'; + $one_order_item = $col_simple . '(?:[' . $tc( WP_MySQL_Lexer::ASC_SYMBOL ) . $tc( WP_MySQL_Lexer::DESC_SYMBOL ) . '])?+'; + $order_re = '(?:' . $tc( WP_MySQL_Lexer::ORDER_SYMBOL ) . $tc( WP_MySQL_Lexer::BY_SYMBOL ) . $one_order_item + . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $one_order_item . ')*+)?+'; + $limit_re = '(?:' . $tc( WP_MySQL_Lexer::LIMIT_SYMBOL ) . $tc( WP_MySQL_Lexer::INT_NUMBER ) . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $tc( WP_MySQL_Lexer::INT_NUMBER ) . ')?+)?+'; + $pat_select = $tc( WP_MySQL_Lexer::SELECT_SYMBOL ) . $select_items . $tc( WP_MySQL_Lexer::FROM_SYMBOL ) . $tbl_list + . $where_re . $order_re . $limit_re; + + // UPDATE t SET c=v [, c=v]* [WHERE ...]. + $pat_update = $tc( WP_MySQL_Lexer::UPDATE_SYMBOL ) . $tbl_re . $tc( WP_MySQL_Lexer::SET_SYMBOL ) + . $assign_re . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $assign_re . ')*+' + . $where_re; + + // DELETE FROM t [WHERE ...]. + $pat_delete = $tc( WP_MySQL_Lexer::DELETE_SYMBOL ) . $tc( WP_MySQL_Lexer::FROM_SYMBOL ) . $tbl_re . $where_re; + + // SET — three forms: optionType-prefixed, no-optionType (ident= / @x= / @@var=). + $set_var_no_opt = '(?:' . $ident_re . '|' . 
$tc( WP_MySQL_Lexer::AT_TEXT_SUFFIX ) + . '|' . $tc( WP_MySQL_Lexer::AT_AT_SIGN_SYMBOL ) + . '(?:[' . $tc( WP_MySQL_Lexer::GLOBAL_SYMBOL ) . $tc( WP_MySQL_Lexer::SESSION_SYMBOL ) . ']' . $tc( WP_MySQL_Lexer::DOT_SYMBOL ) . ')?+' + . $ident_re . ')'; + $set_assign_no_opt = $set_var_no_opt . $tc( WP_MySQL_Lexer::EQUAL_OPERATOR ) . $idlit_re; + $set_optype_kw = '[' . $tc( WP_MySQL_Lexer::GLOBAL_SYMBOL ) . $tc( WP_MySQL_Lexer::SESSION_SYMBOL ) . $tc( WP_MySQL_Lexer::PERSIST_SYMBOL ) . $tc( WP_MySQL_Lexer::PERSIST_ONLY_SYMBOL ) . ']'; + $set_assign_optype = $ident_re . $tc( WP_MySQL_Lexer::EQUAL_OPERATOR ) . $idlit_re; + $pat_set = $tc( WP_MySQL_Lexer::SET_SYMBOL ) + . '(?:' + . '(?:' . $set_optype_kw . $set_assign_optype . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $set_assign_optype . ')*+)' + . '|(?:' . $set_assign_no_opt . '(?:' . $tc( WP_MySQL_Lexer::COMMA_SYMBOL ) . $set_assign_no_opt . ')*+)' + . ')'; + + $pat_use = $tc( WP_MySQL_Lexer::USE_SYMBOL ) . $ident_re; + $pat_begin = $tc( WP_MySQL_Lexer::BEGIN_SYMBOL ) . '(?:' . $tc( WP_MySQL_Lexer::WORK_SYMBOL ) . ')?+'; + $pat_commit = $tc( WP_MySQL_Lexer::COMMIT_SYMBOL ); + $pat_rollback = $tc( WP_MySQL_Lexer::ROLLBACK_SYMBOL ); + $pat_truncate = $tc( WP_MySQL_Lexer::TRUNCATE_SYMBOL ) . '(?:' . $tc( WP_MySQL_Lexer::TABLE_SYMBOL ) . ')?+' . $tbl_re; + $pat_explain = $tc( WP_MySQL_Lexer::EXPLAIN_SYMBOL ) . $pat_select; + + // More-specific shapes first. EXPLAIN before SELECT. + $shapes = array( + 'explain' => $pat_explain, + 'insert' => $pat_insert, + 'select' => $pat_select, + 'update' => $pat_update, + 'delete' => $pat_delete, + 'drop' => $pat_drop, + 'show' => $pat_show, + 'set' => $pat_set, + 'use' => $pat_use, + 'begin' => $pat_begin, + 'commit' => $pat_commit, + 'rollback' => $pat_rollback, + 'truncate' => $pat_truncate, + ); + + $alts = array(); + foreach ( $shapes as $name => $pat ) { + // Each branch must end with \z BEFORE (*ACCEPT). Without \z, e.g. 
+ // `DELETE FROM t WHERE a=1 or a=5` would match the delete shape + // up to `a=1` and accept, ignoring the remainder. + $alts[] = '(?:' . $pat . $tail . '\z(*MARK:' . $name . ')(*ACCEPT))'; + } + return '/\A(?:' . implode( '|', $alts ) . ')/u'; + } + + /** + * Encode a single Unicode codepoint as UTF-8 bytes (BMP only). + */ + private static function cp_to_utf8( int $cp ): string { + if ( $cp < 0x80 ) { + return chr( $cp ); + } + if ( $cp < 0x800 ) { + return chr( 0xC0 | ( $cp >> 6 ) ) . chr( 0x80 | ( $cp & 0x3F ) ); + } + return chr( 0xE0 | ( $cp >> 12 ) ) . chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $cp & 0x3F ) ); + } + + // ============================================================ + // AST node construction. + // ============================================================ + + /** + * Build a WP_Parser_Node, resolving the rule id once and caching it. + */ + private function node( string $rule_name, array $children ): WP_Parser_Node { + if ( ! isset( $this->rule_ids[ $rule_name ] ) ) { + $this->rule_ids[ $rule_name ] = $this->grammar->get_rule_id( $rule_name ); + } + return new WP_Parser_Node( $this->rule_ids[ $rule_name ], $rule_name, $children ); + } + + // ============================================================ + // AST building primitives shared across shapes. + // ============================================================ + + /** + * literal subtree from a single token. + */ + private function lit( WP_Parser_Token $tok ): WP_Parser_Node { + switch ( $tok->id ) { + case WP_MySQL_Lexer::INT_NUMBER: + return $this->node( 'numLiteral', array( $tok ) ); + case WP_MySQL_Lexer::SINGLE_QUOTED_TEXT: + case WP_MySQL_Lexer::DOUBLE_QUOTED_TEXT: + return $this->node( 'textLiteral', array( $this->node( 'textStringLiteral', array( $tok ) ) ) ); + case WP_MySQL_Lexer::NULL_SYMBOL: + return $this->node( 'nullLiteral', array( $tok ) ); + } + throw new RuntimeException( 'fast-path: unsupported literal token id ' . 
$tok->id ); + } + + /** + * expr wrapping a literal — produces the identity-spine + * expr->boolPri->predicate->bitExpr->simpleExpr->simpleExprBody->literal. + */ + private function expr_wrap_lit( WP_Parser_Token $tok ): WP_Parser_Node { + $body = $this->node( 'simpleExprBody', array( $this->node( 'literal', array( $this->lit( $tok ) ) ) ) ); + return $this->expr_identity_spine( $body ); + } + + /** + * expr wrapping a column reference. + */ + private function expr_wrap_col( WP_Parser_Token $a, ?WP_Parser_Token $dot = null, ?WP_Parser_Token $b = null ): WP_Parser_Node { + $body = $this->node( 'simpleExprBody', array( $this->column_ref( $a, $dot, $b ) ) ); + return $this->expr_identity_spine( $body ); + } + + /** + * expr that may be a column ref, paramMarker, or literal — used for the + * RHS of c=v in UPDATE/SET. + */ + private function expr_for_rhs( WP_Parser_Token $tok ): WP_Parser_Node { + if ( WP_MySQL_Lexer::IDENTIFIER === $tok->id || WP_MySQL_Lexer::BACK_TICK_QUOTED_ID === $tok->id ) { + return $this->expr_wrap_col( $tok ); + } + if ( WP_MySQL_Lexer::PARAM_MARKER === $tok->id ) { + $body = $this->node( 'simpleExprBody', array( $this->node( 'paramMarker', array( $tok ) ) ) ); + return $this->expr_identity_spine( $body ); + } + return $this->expr_wrap_lit( $tok ); + } + + /** + * Wrap a simpleExprBody in the identity spine to produce an `expr` node. + */ + private function expr_identity_spine( WP_Parser_Node $body ): WP_Parser_Node { + return $this->node( + 'expr', + array( + $this->node( + 'boolPri', + array( + $this->node( + 'predicate', + array( + $this->node( + 'bitExpr', + array( + $this->node( 'simpleExpr', array( $body ) ), + ) + ), + ) + ), + ) + ), + ) + ); + } + + /** + * qualifiedIdentifier { identifier { pure } [dotIdentifier { . identifier { pure } }] }. 
+ */ + private function qualified_ident( WP_Parser_Token $a, ?WP_Parser_Token $dot = null, ?WP_Parser_Token $b = null ): WP_Parser_Node { + $kids = array( $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $a ) ) ) ) ); + if ( null !== $dot ) { + $kids[] = $this->node( + 'dotIdentifier', + array( + $dot, + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $b ) ) ) ), + ) + ); + } + return $this->node( 'qualifiedIdentifier', $kids ); + } + + /** + * columnRef wrapping a (possibly qualified) identifier. + */ + private function column_ref( WP_Parser_Token $a, ?WP_Parser_Token $dot = null, ?WP_Parser_Token $b = null ): WP_Parser_Node { + return $this->node( + 'columnRef', + array( + $this->node( 'fieldIdentifier', array( $this->qualified_ident( $a, $dot, $b ) ) ), + ) + ); + } + + /** + * tableRef wrapping a (possibly qualified) identifier. + */ + private function table_ref( WP_Parser_Token $a, ?WP_Parser_Token $dot = null, ?WP_Parser_Token $b = null ): WP_Parser_Node { + return $this->node( 'tableRef', array( $this->qualified_ident( $a, $dot, $b ) ) ); + } + + /** + * Read tokens[$i..]: returns [tableRef_node, next_i] handling optional db.t form. + * + * @return array{0:WP_Parser_Node,1:int} + */ + private function consume_table_ref( array $tokens, int $i ): array { + $a = $tokens[ $i ]; + if ( WP_MySQL_Lexer::DOT_SYMBOL === ( $tokens[ $i + 1 ]->id ?? 0 ) ) { + $dot = $tokens[ $i + 1 ]; + $b = $tokens[ $i + 2 ]; + return array( $this->table_ref( $a, $dot, $b ), $i + 3 ); + } + return array( $this->table_ref( $a ), $i + 1 ); + } + + /** + * Read a SELECT FROM-list item with optional [AS] alias. 
+ * + * @return array{0:WP_Parser_Node,1:int} + */ + private function consume_table_reference( array $tokens, int $i ): array { + list( $tref, $i ) = $this->consume_table_ref( $tokens, $i ); + $st_kids = array( $tref ); + + if ( $i < count( $tokens ) ) { + $as_tok = null; + $j = $i; + if ( WP_MySQL_Lexer::AS_SYMBOL === $tokens[ $j ]->id ) { + $as_tok = $tokens[ $j ]; + ++$j; + } + if ( $j < count( $tokens ) && ( WP_MySQL_Lexer::IDENTIFIER === $tokens[ $j ]->id || WP_MySQL_Lexer::BACK_TICK_QUOTED_ID === $tokens[ $j ]->id ) ) { + $alias_kids = array(); + if ( null !== $as_tok ) { + $alias_kids[] = $as_tok; + } + $alias_kids[] = $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $tokens[ $j ] ) ) ) ); + $st_kids[] = $this->node( 'tableAlias', $alias_kids ); + $i = $j + 1; + } + } + + $single = $this->node( 'singleTable', $st_kids ); + $tref_n = $this->node( 'tableReference', array( $this->node( 'tableFactor', array( $single ) ) ) ); + return array( $tref_n, $i ); + } + + /** + * Comparison expr `lhs rhs` for WHERE/ON clauses. + * + * Produces: + * expr → boolPri { predicate { ...lhs }, compOp { = }, predicate { ...rhs } }. 
+ */ + private function expr_eq( WP_Parser_Node $colref, WP_Parser_Token $eq, WP_Parser_Token $rhs ): WP_Parser_Node { + $lhs_pred = $this->node( + 'predicate', + array( + $this->node( + 'bitExpr', + array( + $this->node( + 'simpleExpr', + array( + $this->node( 'simpleExprBody', array( $colref ) ), + ) + ), + ) + ), + ) + ); + + if ( WP_MySQL_Lexer::IDENTIFIER === $rhs->id || WP_MySQL_Lexer::BACK_TICK_QUOTED_ID === $rhs->id ) { + $rhs_inner = $this->column_ref( $rhs ); + } elseif ( WP_MySQL_Lexer::PARAM_MARKER === $rhs->id ) { + $rhs_inner = $this->node( 'paramMarker', array( $rhs ) ); + } else { + $rhs_inner = $this->node( 'literal', array( $this->lit( $rhs ) ) ); + } + + $rhs_pred = $this->node( + 'predicate', + array( + $this->node( + 'bitExpr', + array( + $this->node( + 'simpleExpr', + array( + $this->node( 'simpleExprBody', array( $rhs_inner ) ), + ) + ), + ) + ), + ) + ); + + return $this->node( + 'expr', + array( + $this->node( 'boolPri', array( $lhs_pred, $this->node( 'compOp', array( $eq ) ), $rhs_pred ) ), + ) + ); + } + + /** + * Consume `col[.col] = rhs`. Returns [expr_node, next_i]. + * + * @return array{0:WP_Parser_Node,1:int} + */ + private function consume_one_eq( array $tokens, int $i ): array { + $col_a = $tokens[ $i ]; + $dot = null; + $col_b = null; + $j = $i + 1; + if ( WP_MySQL_Lexer::DOT_SYMBOL === ( $tokens[ $j ]->id ?? 0 ) ) { + $dot = $tokens[ $j ]; + $col_b = $tokens[ $j + 1 ]; + $j += 2; + } + $colref = $this->column_ref( $col_a, $dot, $col_b ); + $eq = $tokens[ $j ]; + $rhs = $tokens[ $j + 1 ]; + return array( $this->expr_eq( $colref, $eq, $rhs ), $j + 2 ); + } + + /** + * Optional `WHERE col[.col]=rhs [AND col[.col]=rhs]*`. + * + * AND-chains fold right-associatively: + * expr { boolPri c1, AND, expr { boolPri c2, AND, expr { boolPri c3 } } }. 
+ * + * @return array{0:WP_Parser_Node|null,1:int} + */ + private function maybe_where( array $tokens, int $i ): array { + if ( $i >= count( $tokens ) || WP_MySQL_Lexer::WHERE_SYMBOL !== $tokens[ $i ]->id ) { + return array( null, $i ); + } + $where_tok = $tokens[ $i ]; + $j = $i + 1; + + list( $first, $j ) = $this->consume_one_eq( $tokens, $j ); + $comparisons = array( $first ); + $ands = array(); + while ( $j < count( $tokens ) && WP_MySQL_Lexer::AND_SYMBOL === $tokens[ $j ]->id ) { + $ands[] = $tokens[ $j ]; + ++$j; + list( $next, $j ) = $this->consume_one_eq( $tokens, $j ); + $comparisons[] = $next; + } + + $expr = end( $comparisons ); + for ( $k = count( $comparisons ) - 2; $k >= 0; $k-- ) { + // Unwrap expr->boolPri so we can recombine with the AND on the right. + $boolpri_kids = $comparisons[ $k ]->get_children_ref(); + $boolpri = $boolpri_kids[0]; + $expr = $this->node( 'expr', array( $boolpri, $ands[ $k ], $expr ) ); + } + return array( $this->node( 'whereClause', array( $where_tok, $expr ) ), $j ); + } + + /** + * Consume an ORDER BY item: `col[.col] [ASC|DESC]`. + * + * @return array{0:WP_Parser_Node,1:int} + */ + private function consume_order_item( array $tokens, int $i ): array { + $col = $tokens[ $i ]; + $dot = null; + $col_b = null; + $j = $i + 1; + if ( WP_MySQL_Lexer::DOT_SYMBOL === ( $tokens[ $j ]->id ?? 0 ) ) { + $dot = $tokens[ $j ]; + $col_b = $tokens[ $j + 1 ]; + $j += 2; + } + $kids = array( $this->expr_wrap_col( $col, $dot, $col_b ) ); + if ( $j < count( $tokens ) + && ( WP_MySQL_Lexer::ASC_SYMBOL === $tokens[ $j ]->id || WP_MySQL_Lexer::DESC_SYMBOL === $tokens[ $j ]->id ) + ) { + $kids[] = $this->node( 'direction', array( $tokens[ $j ] ) ); + ++$j; + } + return array( $this->node( 'orderExpression', $kids ), $j ); + } + + /** + * Wrap a `simpleStatement`/`utilityStatement`/etc. node and any trailing + * `;`/EOF tokens into a top-level `query` node. 
+ */ + private function with_tail( WP_Parser_Node $simple_stmt, array $tokens, int $start_i ): WP_Parser_Node { + $kids = array( $simple_stmt ); + for ( $j = $start_i, $n = count( $tokens ); $j < $n; $j++ ) { + $kids[] = $tokens[ $j ]; + } + return $this->node( 'query', $kids ); + } + + // ============================================================ + // Per-shape AST builders. + // ============================================================ + + private function build_insert( array $tokens ): WP_Parser_Node { + $insert_tok = $tokens[0]; + $into_tok = $tokens[1]; + list( $tbl, $i ) = $this->consume_table_ref( $tokens, 2 ); + + $ifc_kids = array(); + $has_collist = WP_MySQL_Lexer::OPEN_PAR_SYMBOL === $tokens[ $i ]->id; + $i_values = $i; + if ( $has_collist ) { + $open_i = $i; + $j = $open_i + 1; + while ( WP_MySQL_Lexer::CLOSE_PAR_SYMBOL !== $tokens[ $j ]->id ) { + ++$j; + } + $close_i = $j; + $ifc_kids[] = $tokens[ $open_i ]; + $ifc_kids[] = $this->insert_fields_node( $tokens, $open_i, $close_i ); + $ifc_kids[] = $tokens[ $close_i ]; + $i_values = $close_i + 1; + } + + // Find the end of the values clause (first SEMICOLON/EOF or end-of-array). + $end = $i_values + 1; + $n = count( $tokens ); + while ( $end < $n && WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $tokens[ $end ]->id && WP_MySQL_Lexer::EOF !== $tokens[ $end ]->id ) { + ++$end; + } + $ifc_kids[] = $this->insert_values_node( $tokens, $i_values, $end ); + $ifc = $this->node( 'insertFromConstructor', $ifc_kids ); + + $insert_stmt = $this->node( 'insertStatement', array( $insert_tok, $into_tok, $tbl, $ifc ) ); + return $this->with_tail( $this->node( 'simpleStatement', array( $insert_stmt ) ), $tokens, $end ); + } + + /** + * fields { insertIdentifier { columnRef { ... } } [, insertIdentifier { ... }]* }. 
+ */ + private function insert_fields_node( array $tokens, int $i_open, int $i_close ): WP_Parser_Node { + $kids = array(); + for ( $i = $i_open + 1; $i < $i_close; $i++ ) { + if ( WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $kids[] = $tokens[ $i ]; + continue; + } + $kids[] = $this->node( 'insertIdentifier', array( $this->column_ref( $tokens[ $i ] ) ) ); + } + return $this->node( 'fields', $kids ); + } + + /** + * insertValues { VALUES, valueList { ( values ) [, ( values )]* } }. + */ + private function insert_values_node( array $tokens, int $i_values, int $i_end_excl ): WP_Parser_Node { + $values_tok = $tokens[ $i_values ]; + $vl_kids = array(); + $i = $i_values + 1; + while ( $i < $i_end_excl && WP_MySQL_Lexer::OPEN_PAR_SYMBOL === $tokens[ $i ]->id ) { + $open_i = $i; + ++$i; + while ( $i < $i_end_excl && WP_MySQL_Lexer::CLOSE_PAR_SYMBOL !== $tokens[ $i ]->id ) { + ++$i; + } + $close_i = $i; + $vl_kids[] = $tokens[ $open_i ]; + $vl_kids[] = $this->values_node( $tokens, $open_i, $close_i ); + $vl_kids[] = $tokens[ $close_i ]; + ++$i; + if ( $i < $i_end_excl && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $vl_kids[] = $tokens[ $i ]; + ++$i; + } + } + return $this->node( 'insertValues', array( $values_tok, $this->node( 'valueList', $vl_kids ) ) ); + } + + /** + * values { expr [, expr]* } — for one (lit, lit, ...) row. 
+ */ + private function values_node( array $tokens, int $i_open, int $i_close ): WP_Parser_Node { + $kids = array(); + for ( $i = $i_open + 1; $i < $i_close; $i++ ) { + if ( WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $kids[] = $tokens[ $i ]; + continue; + } + $kids[] = $this->expr_wrap_lit( $tokens[ $i ] ); + } + return $this->node( 'values', $kids ); + } + + private function build_drop( array $tokens ): WP_Parser_Node { + $drop_tok = $tokens[0]; + $table_tok = $tokens[1]; + $dt_kids = array( $table_tok ); + $i = 2; + if ( WP_MySQL_Lexer::IF_SYMBOL === $tokens[ $i ]->id ) { + $dt_kids[] = $this->node( 'ifExists', array( $tokens[ $i ], $tokens[ $i + 1 ] ) ); + $i += 2; + } + list( $tref, $i ) = $this->consume_table_ref( $tokens, $i ); + $tref_kids = array( $tref ); + while ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $tref_kids[] = $tokens[ $i ]; + ++$i; + list( $tref2, $i ) = $this->consume_table_ref( $tokens, $i ); + $tref_kids[] = $tref2; + } + $dt_kids[] = $this->node( 'tableRefList', $tref_kids ); + $drop_stmt = $this->node( 'dropStatement', array( $drop_tok, $this->node( 'dropTable', $dt_kids ) ) ); + return $this->with_tail( $this->node( 'simpleStatement', array( $drop_stmt ) ), $tokens, $i ); + } + + private function build_show( array $tokens ): WP_Parser_Node { + $show_tok = $tokens[0]; + $t1 = $tokens[1]->id; + + if ( WP_MySQL_Lexer::CREATE_SYMBOL === $t1 ) { + $t2 = $tokens[2]->id; + if ( WP_MySQL_Lexer::TABLE_SYMBOL === $t2 ) { + list( $tref, $end ) = $this->consume_table_ref( $tokens, 3 ); + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $tokens[2], $tref ) ); + } elseif ( WP_MySQL_Lexer::DATABASE_SYMBOL === $t2 ) { + $schema_ref = $this->node( + 'schemaRef', + array( + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $tokens[3] ) ) ) ), + ) + ); + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $tokens[2], $schema_ref ) ); + $end = 4; + } 
else { + // PROCEDURE | FUNCTION — produce procedureRef/functionRef wrapping + // the same qualifiedIdentifier as a tableRef would. + list( $tref, $end ) = $this->consume_table_ref( $tokens, 3 ); + $tref_kids = $tref->get_children_ref(); + $ref_name = ( WP_MySQL_Lexer::PROCEDURE_SYMBOL === $t2 ) ? 'procedureRef' : 'functionRef'; + $ref = $this->node( $ref_name, $tref_kids ); + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $tokens[2], $ref ) ); + } + } elseif ( WP_MySQL_Lexer::SESSION_SYMBOL === $t1 || WP_MySQL_Lexer::GLOBAL_SYMBOL === $t1 ) { + $opt = $this->node( 'optionType', array( $tokens[1] ) ); + $ss = $this->node( 'showStatement', array( $show_tok, $opt, $tokens[2] ) ); + $end = 3; + } elseif ( WP_MySQL_Lexer::PROCEDURE_SYMBOL === $t1 || WP_MySQL_Lexer::FUNCTION_SYMBOL === $t1 ) { + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $tokens[2] ) ); + $end = 3; + } elseif ( WP_MySQL_Lexer::COLUMNS_SYMBOL === $t1 ) { + list( $tref, $end ) = $this->consume_table_ref( $tokens, 3 ); + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $tokens[2], $tref ) ); + } elseif ( WP_MySQL_Lexer::KEYS_SYMBOL === $t1 || WP_MySQL_Lexer::INDEX_SYMBOL === $t1 || WP_MySQL_Lexer::INDEXES_SYMBOL === $t1 ) { + $from_or_in = $this->node( 'fromOrIn', array( $tokens[2] ) ); + list( $tref, $end ) = $this->consume_table_ref( $tokens, 3 ); + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1], $from_or_in, $tref ) ); + } else { + $ss = $this->node( 'showStatement', array( $show_tok, $tokens[1] ) ); + $end = 2; + } + + return $this->with_tail( $this->node( 'simpleStatement', array( $ss ) ), $tokens, $end ); + } + + private function build_use( array $tokens ): WP_Parser_Node { + $use_cmd = $this->node( + 'useCommand', + array( + $tokens[0], + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $tokens[1] ) ) ) ), + ) + ); + $util = $this->node( 'utilityStatement', array( $use_cmd ) ); + return 
$this->with_tail( $this->node( 'simpleStatement', array( $util ) ), $tokens, 2 ); + } + + private function build_begin( array $tokens ): WP_Parser_Node { + $bw_kids = array( $tokens[0] ); + $i = 1; + if ( WP_MySQL_Lexer::WORK_SYMBOL === ( $tokens[1]->id ?? 0 ) ) { + $bw_kids[] = $tokens[1]; + $i = 2; + } + // BEGIN is the second alternative of `query`: `query → ... | beginWork tail`. + $kids = array( $this->node( 'beginWork', $bw_kids ) ); + for ( $j = $i, $n = count( $tokens ); $j < $n; $j++ ) { + $kids[] = $tokens[ $j ]; + } + return $this->node( 'query', $kids ); + } + + private function build_commit( array $tokens ): WP_Parser_Node { + $tx = $this->node( 'transactionStatement', array( $tokens[0] ) ); + $txl = $this->node( 'transactionOrLockingStatement', array( $tx ) ); + return $this->with_tail( $this->node( 'simpleStatement', array( $txl ) ), $tokens, 1 ); + } + + private function build_rollback( array $tokens ): WP_Parser_Node { + $sp = $this->node( 'savepointStatement', array( $tokens[0] ) ); + $txl = $this->node( 'transactionOrLockingStatement', array( $sp ) ); + return $this->with_tail( $this->node( 'simpleStatement', array( $txl ) ), $tokens, 1 ); + } + + private function build_truncate( array $tokens ): WP_Parser_Node { + $kids = array( $tokens[0] ); + $i = 1; + if ( WP_MySQL_Lexer::TABLE_SYMBOL === $tokens[1]->id ) { + $kids[] = $tokens[1]; + $i = 2; + } + list( $tref, $i ) = $this->consume_table_ref( $tokens, $i ); + $kids[] = $tref; + $tt = $this->node( 'truncateTableStatement', $kids ); + return $this->with_tail( $this->node( 'simpleStatement', array( $tt ) ), $tokens, $i ); + } + + /** + * SET — three forms (optionType-prefixed, no-optionType, plus combinations). 
+ */ + private function build_set( array $tokens ): WP_Parser_Node { + $set_tok = $tokens[0]; + $i = 1; + $t1 = $tokens[ $i ]->id; + + if ( WP_MySQL_Lexer::GLOBAL_SYMBOL === $t1 || WP_MySQL_Lexer::SESSION_SYMBOL === $t1 + || WP_MySQL_Lexer::PERSIST_SYMBOL === $t1 || WP_MySQL_Lexer::PERSIST_ONLY_SYMBOL === $t1 + ) { + // optionType form: first assignment is optionValueFollowingOptionType, + // subsequent assignments use optionValueListContinued (with + // optionValueNoOptionType inner). + $opt_type = $this->node( 'optionType', array( $tokens[ $i ] ) ); + ++$i; + list( $first, $i ) = $this->build_set_option_following( $tokens, $i ); + $following_kids = array( $first ); + if ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $cont_kids = array(); + while ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $cont_kids[] = $tokens[ $i ]; + ++$i; + list( $opt, $i ) = $this->build_set_option_no_type( $tokens, $i ); + $cont_kids[] = $this->node( 'optionValue', array( $opt ) ); + } + $following_kids[] = $this->node( 'optionValueListContinued', $cont_kids ); + } + $following = $this->node( 'startOptionValueListFollowingOptionType', $following_kids ); + $start = $this->node( 'startOptionValueList', array( $opt_type, $following ) ); + } else { + list( $first, $i ) = $this->build_set_option_no_type( $tokens, $i ); + $start_kids = array( $first ); + if ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $cont_kids = array(); + while ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $cont_kids[] = $tokens[ $i ]; + ++$i; + list( $opt, $i ) = $this->build_set_option_no_type( $tokens, $i ); + $cont_kids[] = $this->node( 'optionValue', array( $opt ) ); + } + $start_kids[] = $this->node( 'optionValueListContinued', $cont_kids ); + } + $start = $this->node( 'startOptionValueList', $start_kids ); + } + + $set_stmt = $this->node( 'setStatement', array( 
$set_tok, $start ) ); + return $this->with_tail( $this->node( 'simpleStatement', array( $set_stmt ) ), $tokens, $i ); + } + + /** + * @return array{0:WP_Parser_Node,1:int} + */ + private function build_set_option_no_type( array $tokens, int $i ): array { + $var_tok = $tokens[ $i ]; + + if ( WP_MySQL_Lexer::AT_AT_SIGN_SYMBOL === $var_tok->id ) { + // setSystemVariable: AT_AT [GLOBAL|SESSION DOT]? internalVariableName. + $kids = array( $var_tok ); + ++$i; + $next_id = $tokens[ $i ]->id ?? 0; + if ( WP_MySQL_Lexer::GLOBAL_SYMBOL === $next_id || WP_MySQL_Lexer::SESSION_SYMBOL === $next_id ) { + $kids[] = $this->node( 'setVarIdentType', array( $tokens[ $i ], $tokens[ $i + 1 ] ) ); + $i += 2; + } + $kids[] = $this->node( + 'internalVariableName', + array( + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $tokens[ $i ] ) ) ) ), + ) + ); + ++$i; + $ssv = $this->node( 'setSystemVariable', $kids ); + $eq_tok = $tokens[ $i ]; + $rhs_tok = $tokens[ $i + 1 ]; + $opt = $this->node( + 'optionValueNoOptionType', + array( + $ssv, + $this->node( 'equal', array( $eq_tok ) ), + $this->node( 'setExprOrDefault', array( $this->expr_for_rhs( $rhs_tok ) ) ), + ) + ); + return array( $opt, $i + 2 ); + } + + $eq_tok = $tokens[ $i + 1 ]; + $rhs_tok = $tokens[ $i + 2 ]; + if ( WP_MySQL_Lexer::AT_TEXT_SUFFIX === $var_tok->id ) { + // userVariable form (does NOT use setExprOrDefault — expr directly). 
+ $var = $this->node( 'userVariable', array( $var_tok ) ); + $opt = $this->node( + 'optionValueNoOptionType', + array( + $var, + $this->node( 'equal', array( $eq_tok ) ), + $this->expr_for_rhs( $rhs_tok ), + ) + ); + } else { + $ivn = $this->node( + 'internalVariableName', + array( + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $var_tok ) ) ) ), + ) + ); + $opt = $this->node( + 'optionValueNoOptionType', + array( + $ivn, + $this->node( 'equal', array( $eq_tok ) ), + $this->node( 'setExprOrDefault', array( $this->expr_for_rhs( $rhs_tok ) ) ), + ) + ); + } + return array( $opt, $i + 3 ); + } + + /** + * optionValueFollowingOptionType — only `ident = expr`. + * + * @return array{0:WP_Parser_Node,1:int} + */ + private function build_set_option_following( array $tokens, int $i ): array { + $var_tok = $tokens[ $i ]; + $eq_tok = $tokens[ $i + 1 ]; + $rhs_tok = $tokens[ $i + 2 ]; + $ivn = $this->node( + 'internalVariableName', + array( + $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $var_tok ) ) ) ), + ) + ); + $opt = $this->node( + 'optionValueFollowingOptionType', + array( + $ivn, + $this->node( 'equal', array( $eq_tok ) ), + $this->node( 'setExprOrDefault', array( $this->expr_for_rhs( $rhs_tok ) ) ), + ) + ); + return array( $opt, $i + 3 ); + } + + /** + * SELECT entrypoint. + */ + private function build_select( array $tokens ): WP_Parser_Node { + $ss = $this->build_select_inner( $tokens, 0 ); + + $i = 0; + for ( $j = count( $tokens ) - 1; $j >= 0; $j-- ) { + if ( WP_MySQL_Lexer::EOF !== $tokens[ $j ]->id && WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $tokens[ $j ]->id ) { + $i = $j + 1; + break; + } + } + return $this->with_tail( $ss, $tokens, $i ); + } + + /** + * Inner SELECT body — returns a `simpleStatement{ selectStatement{...} }` + * node without the trailing `;`/EOF tokens. Reused by EXPLAIN. 
+ */ + private function build_select_inner( array $tokens, int $start ): WP_Parser_Node { + $select_tok = $tokens[ $start ]; + + // Find FROM. + $i_from = $start + 1; + while ( WP_MySQL_Lexer::FROM_SYMBOL !== $tokens[ $i_from ]->id ) { + ++$i_from; + } + $select_items = $this->select_item_list( $tokens, $start + 1, $i_from ); + + // FROM list. + $i = $i_from + 1; + list( $tr_first, $i ) = $this->consume_table_reference( $tokens, $i ); + $trl_kids = array( $tr_first ); + while ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $trl_kids[] = $tokens[ $i ]; + ++$i; + list( $tr, $i ) = $this->consume_table_reference( $tokens, $i ); + $trl_kids[] = $tr; + } + $from_clause = $this->node( + 'fromClause', + array( + $tokens[ $i_from ], + $this->node( 'tableReferenceList', $trl_kids ), + ) + ); + + $qs_kids = array( $select_tok, $select_items, $from_clause ); + list( $where, $i ) = $this->maybe_where_for_select( $tokens, $i ); + if ( null !== $where ) { + $qs_kids[] = $where; + } + $qs = $this->node( 'querySpecification', $qs_kids ); + $qp = $this->node( 'queryPrimary', array( $qs ) ); + $qt = $this->node( 'queryTerm', array( $qp ) ); + $qeb = $this->node( 'queryExpressionBody', array( $qt ) ); + + $qe_kids = array( $qeb ); + + // ORDER BY (multi-column, lives at queryExpression level). + if ( $i < count( $tokens ) && WP_MySQL_Lexer::ORDER_SYMBOL === $tokens[ $i ]->id ) { + $order_tok = $tokens[ $i ]; + $by_tok = $tokens[ $i + 1 ]; + $i += 2; + $ol_kids = array(); + while ( true ) { + list( $oe, $i ) = $this->consume_order_item( $tokens, $i ); + $ol_kids[] = $oe; + if ( $i >= count( $tokens ) || WP_MySQL_Lexer::COMMA_SYMBOL !== $tokens[ $i ]->id ) { + break; + } + $ol_kids[] = $tokens[ $i ]; + ++$i; + } + $qe_kids[] = $this->node( 'orderClause', array( $order_tok, $by_tok, $this->node( 'orderList', $ol_kids ) ) ); + } + + // LIMIT n [, n]. 
+ if ( $i < count( $tokens ) && WP_MySQL_Lexer::LIMIT_SYMBOL === $tokens[ $i ]->id ) { + $lim_tok = $tokens[ $i ]; + $n_tok = $tokens[ $i + 1 ]; + $lo_kids = array( $this->node( 'limitOption', array( $n_tok ) ) ); + $i += 2; + if ( $i < count( $tokens ) && WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $lo_kids[] = $tokens[ $i ]; + $lo_kids[] = $this->node( 'limitOption', array( $tokens[ $i + 1 ] ) ); + $i += 2; + } + $qe_kids[] = $this->node( 'limitClause', array( $lim_tok, $this->node( 'limitOptions', $lo_kids ) ) ); + } + + $qe = $this->node( 'queryExpression', $qe_kids ); + $sel_stmt = $this->node( 'selectStatement', array( $qe ) ); + return $this->node( 'simpleStatement', array( $sel_stmt ) ); + } + + /** + * Variant of maybe_where for SELECT, where the parser handles `false` + * outcomes by stopping before ORDER/LIMIT. + * + * @return array{0:WP_Parser_Node|null,1:int} + */ + private function maybe_where_for_select( array $tokens, int $i ): array { + return $this->maybe_where( $tokens, $i ); + } + + /** + * `selectItemList` covering both `*` and identifier lists with optional + * aliases. + */ + private function select_item_list( array $tokens, int $i_start, int $i_end_excl ): WP_Parser_Node { + $kids = array(); + if ( 1 === $i_end_excl - $i_start && WP_MySQL_Lexer::MULT_OPERATOR === $tokens[ $i_start ]->id ) { + $kids[] = $tokens[ $i_start ]; + return $this->node( 'selectItemList', $kids ); + } + + $i = $i_start; + while ( $i < $i_end_excl ) { + if ( WP_MySQL_Lexer::COMMA_SYMBOL === $tokens[ $i ]->id ) { + $kids[] = $tokens[ $i ]; + ++$i; + continue; + } + $col = $tokens[ $i ]; + $dot = null; + $col_b = null; + if ( ( $i + 1 < $i_end_excl ) && WP_MySQL_Lexer::DOT_SYMBOL === $tokens[ $i + 1 ]->id ) { + $dot = $tokens[ $i + 1 ]; + $col_b = $tokens[ $i + 2 ]; + $i += 3; + } else { + ++$i; + } + $si_kids = array( $this->expr_wrap_col( $col, $dot, $col_b ) ); + // Optional alias: [AS] ident. 
+ if ( $i < $i_end_excl ) { + $as_tok = null; + if ( WP_MySQL_Lexer::AS_SYMBOL === $tokens[ $i ]->id ) { + $as_tok = $tokens[ $i ]; + ++$i; + } + if ( $i < $i_end_excl && ( WP_MySQL_Lexer::IDENTIFIER === $tokens[ $i ]->id || WP_MySQL_Lexer::BACK_TICK_QUOTED_ID === $tokens[ $i ]->id ) ) { + $alias_kids = array(); + if ( null !== $as_tok ) { + $alias_kids[] = $as_tok; + } + $alias_kids[] = $this->node( 'identifier', array( $this->node( 'pureIdentifier', array( $tokens[ $i ] ) ) ) ); + $si_kids[] = $this->node( 'selectAlias', $alias_kids ); + ++$i; + } elseif ( null !== $as_tok ) { + // AS without identifier — give back the AS_SYMBOL. NOTE(review): the AS token is then read as the next item's column on the following loop iteration; confirm this is intended. + --$i; + } + } + $kids[] = $this->node( 'selectItem', $si_kids ); + } + return $this->node( 'selectItemList', $kids ); + } + + private function build_explain( array $tokens ): WP_Parser_Node { /* Wraps the SELECT that starts at token 1 as utilityStatement{ explainStatement{ token 0, explainableStatement{ selectStatement } } }. */ + $expl_tok = $tokens[0]; + $inner_simple = $this->build_select_inner( $tokens, 1 ); + // Inner returns simpleStatement{ selectStatement{...} }; unwrap selectStatement. 
+ $inner_kids = $inner_simple->get_children_ref(); + $inner_select_stmt = $inner_kids[0]; + $exp_able = $this->node( 'explainableStatement', array( $inner_select_stmt ) ); + $expl = $this->node( 'explainStatement', array( $expl_tok, $exp_able ) ); + $util = $this->node( 'utilityStatement', array( $expl ) ); + + $i = 0; /* Tail index: one past the last token that is neither EOF nor a semicolon; stays 0 when there is no such token. */ + for ( $j = count( $tokens ) - 1; $j >= 0; $j-- ) { + if ( WP_MySQL_Lexer::EOF !== $tokens[ $j ]->id && WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $tokens[ $j ]->id ) { + $i = $j + 1; + break; + } + } + return $this->with_tail( $this->node( 'simpleStatement', array( $util ) ), $tokens, $i ); + } + + private function build_update( array $tokens ): WP_Parser_Node { /* Shape handled: token 0, one table ref, one token taken as SET, then `col = value` elements up to WHERE, a semicolon or EOF, then an optional WHERE. */ + $upd_tok = $tokens[0]; + list( $tref, $i ) = $this->consume_table_ref( $tokens, 1 ); + $tbl = $this->node( + 'tableReference', + array( + $this->node( + 'tableFactor', + array( + $this->node( 'singleTable', array( $tref ) ), + ) + ), + ) + ); + $trl = $this->node( 'tableReferenceList', array( $tbl ) ); + + $set_tok = $tokens[ $i ]; /* Taken unchecked as the SET keyword. */ + ++$i; + $ul_kids = array(); + $first = true; + while ( $i < count( $tokens ) && WP_MySQL_Lexer::WHERE_SYMBOL !== $tokens[ $i ]->id + && WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $tokens[ $i ]->id + && WP_MySQL_Lexer::EOF !== $tokens[ $i ]->id + ) { + if ( ! $first ) { + $ul_kids[] = $tokens[ $i ]; // COMMA. 
+ ++$i; + } + $col_tok = $tokens[ $i ]; + $eq_tok = $tokens[ $i + 1 ]; + $rhs_tok = $tokens[ $i + 2 ]; /* Each element is read as exactly three tokens (column, equals sign, value); none of them is bounds- or type-checked here. */ + $ul_kids[] = $this->node( + 'updateElement', + array( + $this->column_ref( $col_tok ), + $this->node( 'equal', array( $eq_tok ) ), + $this->expr_for_rhs( $rhs_tok ), + ) + ); + $i += 3; + $first = false; + } + $ul = $this->node( 'updateList', $ul_kids ); + + $up_kids = array( $upd_tok, $trl, $set_tok, $ul ); + list( $where, $i ) = $this->maybe_where( $tokens, $i ); + if ( null !== $where ) { + $up_kids[] = $where; + } + $up_stmt = $this->node( 'updateStatement', $up_kids ); + return $this->with_tail( $this->node( 'simpleStatement', array( $up_stmt ) ), $tokens, $i ); + } + + private function build_delete( array $tokens ): WP_Parser_Node { /* Shape handled: tokens 0 and 1 are taken unchecked as DELETE and FROM, followed by one table ref and an optional WHERE. */ + $del_tok = $tokens[0]; + $from_tok = $tokens[1]; + list( $tref, $i ) = $this->consume_table_ref( $tokens, 2 ); + $kids = array( $del_tok, $from_tok, $tref ); + + list( $where, $i ) = $this->maybe_where( $tokens, $i ); + if ( null !== $where ) { + $kids[] = $where; + } + $del_stmt = $this->node( 'deleteStatement', $kids ); + return $this->with_tail( $this->node( 'simpleStatement', array( $del_stmt ) ), $tokens, $i ); + } +} From 4ad1df527c45c3ea61f96fa8b36f2b48885994f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 12/23] Add PCRE2 capture/trace AST-extraction experiment Three walls: compile complexity limit, ~4.6x JIT collapse on captures around recursion, ~26us export with ~1400 named groups. Infeasible in stock PHP. From local branch parser-fast-path. 
--- experiments/pcre2-capture-trace/NOTES.md | 30 +++ .../exp-pcre2-trace-findings.php | 200 ++++++++++++++++++ .../wall-a-capture-limit.php | 81 +++++++ .../pcre2-capture-trace/wall-a-fine.php | 81 +++++++ .../pcre2-capture-trace/wall-a-narrow.php | 25 +++ .../pcre2-capture-trace/wall-b-hot.php | 58 +++++ .../wall-b-jit-captures.php | 57 +++++ .../pcre2-capture-trace/wall-b-jitcheck.php | 18 ++ .../pcre2-capture-trace/wall-b-jitsize.php | 36 ++++ .../pcre2-capture-trace/wall-c-export.php | 35 +++ .../pcre2-capture-trace/wall-c-export2.php | 25 +++ 11 files changed, 646 insertions(+) create mode 100644 experiments/pcre2-capture-trace/NOTES.md create mode 100644 experiments/pcre2-capture-trace/exp-pcre2-trace-findings.php create mode 100644 experiments/pcre2-capture-trace/wall-a-capture-limit.php create mode 100644 experiments/pcre2-capture-trace/wall-a-fine.php create mode 100644 experiments/pcre2-capture-trace/wall-a-narrow.php create mode 100644 experiments/pcre2-capture-trace/wall-b-hot.php create mode 100644 experiments/pcre2-capture-trace/wall-b-jit-captures.php create mode 100644 experiments/pcre2-capture-trace/wall-b-jitcheck.php create mode 100644 experiments/pcre2-capture-trace/wall-b-jitsize.php create mode 100644 experiments/pcre2-capture-trace/wall-c-export.php create mode 100644 experiments/pcre2-capture-trace/wall-c-export2.php diff --git a/experiments/pcre2-capture-trace/NOTES.md b/experiments/pcre2-capture-trace/NOTES.md new file mode 100644 index 000000000..bd39e3aae --- /dev/null +++ b/experiments/pcre2-capture-trace/NOTES.md @@ -0,0 +1,30 @@ +# Extracting a parse tree from PCRE2 captures / MARK trace + +**Origin:** local branch `parser-fast-path`, `exp-pcre2-trace-findings.php` (a +documented negative result) + the `wall-*.php` probes (rebuilt here). No PR. + +**Idea:** compile the grammar to one pattern with a numbered capture (or `(*MARK)` +/ `(?&rule())`) per rule occurrence, match once, and walk the captures to +assemble the tree. 
+ +**Run:** `php -d ...jit... wall-a-*.php / wall-b-*.php / wall-c-*.php` + +**Result — three independent walls:** +- **Capture/compile limit:** PCRE2 compilation is bounded by total complexity, not a + fixed capture count. The 76 KB grammar pattern compiles at 0 captures and + tolerates ~1,175 added captures before failing ("pattern too large"); tiny + patterns tolerate thousands. Capturing all ~1500 rules at once won't compile. +- **JIT collapse on captures around recursion:** adding ~6 numbered captures around + `(?&rule)` call sites collapses a JIT-able pattern's throughput ~4.6×. (The 76 KB + grammar pattern doesn't JIT at all, so captures cost little THERE — the collapse + is the reason a capture-heavy recursive pattern can't be made fast.) +- **$matches export cost:** a pattern with ~1,400 named subroutines costs ~26 µs per + `preg_match` with `PREG_OFFSET_CAPTURE | PREG_UNMATCHED_AS_NULL` — already over the + parser's per-query budget before walking anything. + +Separately, PCRE2 only ever exposes last-write-wins ovector slots + one MARK per +match (verified: marking the start rule's branches yields exactly 2 distinct MARKs +for the whole corpus). No per-recursion-frame stack is recoverable. + +**Verdict:** Single-pass PCRE2 → AST is infeasible at this grammar's scale in stock +PHP. The structural information simply isn't exposed. diff --git a/experiments/pcre2-capture-trace/exp-pcre2-trace-findings.php b/experiments/pcre2-capture-trace/exp-pcre2-trace-findings.php new file mode 100644 index 000000000..4bf46dc6f --- /dev/null +++ b/experiments/pcre2-capture-trace/exp-pcre2-trace-findings.php @@ -0,0 +1,200 @@ +)...) / (*scs:()...) [PCRE2 10.46+] + * Compiles and runs. Inner named captures inside (*scs:...) DO + * persist to the outer match's $matches. The scs operates on the + * captured substring, so this is effectively a free second-pass + * parse with composable capture exposure. 
Bounded depth — useful + * for, e.g., capturing a SELECT clause then its column-list, but + * not for arbitrary tree depth. + * + * (?&rule(<tag>)) capture-retaining subroutine call [PCRE2 10.46+] + * Compiles. <tag> is retained after the subroutine returns. BUT: + * repeated (?&rule(<tag>))+ overwrites the same slot — the post- + * match value is the LAST iteration's value. Confirmed + * empirically with input "abc" → tag="c". The new feature does + * not give a stack; it only changes the default of "throw + * captures away" to "keep them, last-wins". + * + * MARK from inside (?&rule), lookarounds, atomic groups + * Surfaces to outer match (matching-path rule applies). + * + * Verified dead-ends: + * ------------------- + * ${*MARK} / $MARK / ${MARK} substitution syntax + * PHP's preg_replace does not parse these — output contains the + * literal text. PHP does not call pcre2_substitute(). + * + * Failed-match MARK + * pcre2_get_mark() returns the deepest MARK reached even on + * failure, but PHP's preg_match populates $matches['MARK'] only + * when the match succeeds. + * + * Recursive backref \k<name> with re-entered captures + * PCRE2 returns "Internal error" in PHP — recursion does not + * stack capture values for backref purposes. + * + * PCRE2_AUTO_CALLOUT, (?C) callouts without a registered callback + * Compile fine, silently no-op. + * + * pcre2_dfa_match (multi-result match) + * Not exposed by PHP; also doesn't populate captures even if it + * were. + * + * pcre2_callout_enumerate, pcre2_substring_nametable_scan + * Not exposed by PHP. + * + * FFI binding of a PHP closure to pcre2_set_callout's function + * pointer + * PHP's libffi closure support is not enabled. Documented in + * exp-pcre-ffi.php. + * + * (?J) duplicate names with PREG_OFFSET_CAPTURE + * One slot per static occurrence (NOT per recursion). Last-wins. 
+ * + * PCRE2 substitute callouts (pcre2_set_substitute_callout) + * Would expose subscount per replacement, but PHP does not call + * pcre2_substitute(); preg_replace[ _callback]() iterate + * pcre2_match() manually. + * + * WHY THE BAR CAN'T BE MET IN PURE PHP + * ==================================== + * The corpus AST has, on average, dozens of nodes per query. Even + * with a perfect O(N)-token MARK trace at validator speed + * (105K QPS), reconstructing the same AST that the parser produces + * requires creating those PHP nodes — and PHP object construction + * alone (~200-500ns per node) consumes most of the 16µs/query + * budget that separates 105K QPS from the parser's 41K QPS. A + * coarse top-level MARK gives nothing the parser doesn't already + * derive from the FIRST-set table at zero cost. + * + * The parser is the floor in pure PHP. Beating it requires either + * (a) a smaller AST (different consumer), or (b) a callout-driven + * trace from a single PCRE2 match. + * + * MINIMAL PHP EXTENSION PROPOSAL + * ============================== + * The cleanest path forward is a tiny PHP extension that exposes + * pcre2_set_callout(). It needs ~150 lines of C and three exposed + * functions: + * + * resource pcre_callout_compile(string $pattern, int $flags = 0) + * Wraps pcre2_compile_8() + pcre2_jit_compile_8(). + * + * array pcre_callout_match(resource $code, string $subject, + * int $options = 0) + * Calls pcre2_match_8() with a C trampoline registered via + * pcre2_set_callout_8(). The trampoline appends a fixed-size + * record (callout_number, current_position, capture_top) to a + * pre-allocated buffer for each fired callout. After the match, + * the buffer is materialised into a PHP array of [num, pos, cap] + * tuples and returned alongside the standard ovector. + * + * void pcre_callout_set_buffer_size(int $bytes) + * Tunes the per-thread callout buffer (default 64K records, + * ~1MB). 
+ * + * Why this is enough: with auto-callouts enabled (PCRE2_AUTO_CALLOUT) + * or explicit (?C) at every grammar rule entry / branch boundary, + * a single pcre2_match_8 call against the existing 76KB grammar + * pattern yields a complete trace of which alternative entered and + * succeeded at each rule, in chronological order. Reconstructing the + * AST is then a linear walk of the trace. + * + * Cost estimate: the existing validator at 126K QPS (warm JIT) does + * ~8µs of PCRE2 work per query. Adding a callout trampoline that + * appends 24 bytes per fire is a few-ns overhead per callout; for an + * average MySQL query with ~30-50 firing callouts we'd add <1µs of + * trampoline overhead, leaving budget for AST construction. + * + * Build complexity: a single .c file linked against the same + * libpcre2-8 PHP itself uses, packaged as a loadable Zend extension. + * No third-party deps. Could ship as a vendored optional extension + * (parallel to opcache) without affecting the pure-PHP fallback path. + * + * Risks: + * - JIT-mode callouts: per pcre2callout(3), JIT supports user + * callouts but sets callout_flags=0. Field availability on + * pcre2_callout_block needs to be sanity-checked when + * pcre2_jit_compile is enabled. Worst case: fall back to + * interpreted matching for traced runs. + * - PHP version skew: pcre2_set_callout signature is stable since + * PCRE2 10.00, but PHP bundles its own libpcre2 (`pcrelib`) on + * Windows and some BSDs. The extension should dlsym against the + * same library PHP uses, not a system one. + * + * If the extension is built and the trampoline emits ~24 bytes per + * callout into a flat buffer, the realistic ceiling is in the + * 80K-100K QPS range with full AST reconstruction — a 2× win over + * the current parser. That is the payoff that makes the C work + * worth it; nothing in pure PHP comes close. 
+ */ diff --git a/experiments/pcre2-capture-trace/wall-a-capture-limit.php b/experiments/pcre2-capture-trace/wall-a-capture-limit.php new file mode 100644 index 000000000..0b29140b6 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-a-capture-limit.php @@ -0,0 +1,81 @@ +lowest_non_terminal_id; $rules=$grammar->rules; +$single=$grammar->single_candidate_rules ?? array(); +$select_rid=$grammar->get_rule_id('selectStatement'); +$into=tch(WP_MySQL_Lexer::INTO_SYMBOL); +$compiled=array(); +$compile=function($rid)use(&$compiled,$rules,$low_nt,$single,$select_rid,$into){ + if(isset($compiled[$rid]))return $compiled[$rid]; + $alts=array(); $safe=isset($single[$rid]); + foreach($rules[$rid] as $branch){ $alt=''; + foreach($branch as $i=>$sym){ $alt.= $sym<$low_nt ? tch($sym) : "RREF{$sym}RREF"; if($i===0&&$safe)$alt.='(*THEN)'; } + $alts[]=$alt; } + $body='(?:'.implode('|',$alts).')'; if($rid===$select_rid)$body.='(?!'.$into.')'; + return $compiled[$rid]=$body; +}; +foreach(array_keys($rules) as $rid)$compile($rid); +// inline single-use non-recursive rules to fixpoint +do{ $changed=false; $refs=array(); + foreach($compiled as $rid=>$b)$refs[$rid]=0; + foreach($compiled as $b){ if(preg_match_all('/RREF(\d+)RREF/',$b,$m))foreach($m[1] as $r)$refs[(int)$r]=($refs[(int)$r]??0)+1; } + foreach($compiled as $rid=>$b){ if(($refs[$rid]??0)!==1)continue; if(strpos($b,"RREF{$rid}RREF")!==false)continue; + foreach($compiled as $crid=>$cb){ if(strpos($cb,"RREF{$rid}RREF")!==false){ $compiled[$crid]=str_replace("RREF{$rid}RREF",$b,$cb); unset($compiled[$rid]); $changed=true; break 2; } } } +}while($changed); +$rule_to_idx=array(); $idx_to_rule=array(); +foreach($compiled as $rid=>$_){ $rule_to_idx[$rid]=count($idx_to_rule); $idx_to_rule[]=$rid; } +// Build define with a callback that can OPTIONALLY wrap the (?&rN) call sites in captures. 
+$build_define=function($num_captures)use($compiled,$idx_to_rule,$rule_to_idx){ + $wrapped=0; $define=''; + foreach($idx_to_rule as $rid){ + $body=$compiled[$rid]; + $body=preg_replace_callback('/RREF(\d+)RREF/',function($m)use($rule_to_idx,&$wrapped,$num_captures){ + $ref='(?&r'.$rule_to_idx[(int)$m[1]].')'; + if($wrapped<$num_captures){ $wrapped++; return '('.$ref.')'; } // numbered capture wrapping (?&rule) + return $ref; + },$body); + $define.="(?{$body})"; + } + return array($define,$wrapped); +}; +$start_rid=$grammar->get_rule_id('query'); +$total_callsites=0; +foreach($compiled as $b){ $total_callsites+=preg_match_all('/RREF\d+RREF/',$b,$x); } +printf("Final DEFINE rules=%d, total (?&rule) call sites=%d\n",count($idx_to_rule),$total_callsites); + +// Sweep number of captures. +$ns=array(0,50,100,150,200,250,300,400,600,800,1000,1500,2000,$total_callsites); +foreach($ns as $N){ + list($define,$wrapped)=$build_define($N); + $pattern='/(?(DEFINE)'.$define.')\A(?&r'.$rule_to_idx[$start_rid].')\z/u'; + $ok=@preg_match($pattern,"\xff"); + $err=preg_last_error(); + $errmsg=preg_last_error_msg(); + $compile_ok = !($ok===false && $err!==PREG_BAD_UTF8_ERROR); + printf("captures=%-5d wrapped=%-5d pattern=%s bytes -> %s%s\n", + $N,$wrapped,number_format(strlen($pattern)), + $compile_ok?'COMPILES':'FAIL', + $compile_ok?'':" ($errmsg)"); + if(!$compile_ok && $err!==PREG_BAD_UTF8_ERROR) break; +} diff --git a/experiments/pcre2-capture-trace/wall-a-fine.php b/experiments/pcre2-capture-trace/wall-a-fine.php new file mode 100644 index 000000000..7b8869e02 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-a-fine.php @@ -0,0 +1,81 @@ +lowest_non_terminal_id; $rules=$grammar->rules; +$single=$grammar->single_candidate_rules ?? 
array(); +$select_rid=$grammar->get_rule_id('selectStatement'); +$into=tch(WP_MySQL_Lexer::INTO_SYMBOL); +$compiled=array(); +$compile=function($rid)use(&$compiled,$rules,$low_nt,$single,$select_rid,$into){ + if(isset($compiled[$rid]))return $compiled[$rid]; + $alts=array(); $safe=isset($single[$rid]); + foreach($rules[$rid] as $branch){ $alt=''; + foreach($branch as $i=>$sym){ $alt.= $sym<$low_nt ? tch($sym) : "RREF{$sym}RREF"; if($i===0&&$safe)$alt.='(*THEN)'; } + $alts[]=$alt; } + $body='(?:'.implode('|',$alts).')'; if($rid===$select_rid)$body.='(?!'.$into.')'; + return $compiled[$rid]=$body; +}; +foreach(array_keys($rules) as $rid)$compile($rid); +// inline single-use non-recursive rules to fixpoint +do{ $changed=false; $refs=array(); + foreach($compiled as $rid=>$b)$refs[$rid]=0; + foreach($compiled as $b){ if(preg_match_all('/RREF(\d+)RREF/',$b,$m))foreach($m[1] as $r)$refs[(int)$r]=($refs[(int)$r]??0)+1; } + foreach($compiled as $rid=>$b){ if(($refs[$rid]??0)!==1)continue; if(strpos($b,"RREF{$rid}RREF")!==false)continue; + foreach($compiled as $crid=>$cb){ if(strpos($cb,"RREF{$rid}RREF")!==false){ $compiled[$crid]=str_replace("RREF{$rid}RREF",$b,$cb); unset($compiled[$rid]); $changed=true; break 2; } } } +}while($changed); +$rule_to_idx=array(); $idx_to_rule=array(); +foreach($compiled as $rid=>$_){ $rule_to_idx[$rid]=count($idx_to_rule); $idx_to_rule[]=$rid; } +// Build define with a callback that can OPTIONALLY wrap the (?&rN) call sites in captures. 
+$build_define=function($num_captures)use($compiled,$idx_to_rule,$rule_to_idx){ + $wrapped=0; $define=''; + foreach($idx_to_rule as $rid){ + $body=$compiled[$rid]; + $body=preg_replace_callback('/RREF(\d+)RREF/',function($m)use($rule_to_idx,&$wrapped,$num_captures){ + $ref='(?&r'.$rule_to_idx[(int)$m[1]].')'; + if($wrapped<$num_captures){ $wrapped++; return '('.$ref.')'; } // numbered capture wrapping (?&rule) + return $ref; + },$body); + $define.="(?{$body})"; + } + return array($define,$wrapped); +}; +$start_rid=$grammar->get_rule_id('query'); +$total_callsites=0; +foreach($compiled as $b){ $total_callsites+=preg_match_all('/RREF\d+RREF/',$b,$x); } +printf("Final DEFINE rules=%d, total (?&rule) call sites=%d\n",count($idx_to_rule),$total_callsites); + +// Sweep number of captures. +$ns=range(1000,1500,25); +foreach($ns as $N){ + list($define,$wrapped)=$build_define($N); + $pattern='/(?(DEFINE)'.$define.')\A(?&r'.$rule_to_idx[$start_rid].')\z/u'; + $ok=@preg_match($pattern,"\xff"); + $err=preg_last_error(); + $errmsg=preg_last_error_msg(); + $compile_ok = !($ok===false && $err!==PREG_BAD_UTF8_ERROR); + printf("captures=%-5d wrapped=%-5d pattern=%s bytes -> %s%s\n", + $N,$wrapped,number_format(strlen($pattern)), + $compile_ok?'COMPILES':'FAIL', + $compile_ok?'':" ($errmsg)"); + if(!$compile_ok && $err!==PREG_BAD_UTF8_ERROR) break; +} diff --git a/experiments/pcre2-capture-trace/wall-a-narrow.php b/experiments/pcre2-capture-trace/wall-a-narrow.php new file mode 100644 index 000000000..78f6843b4 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-a-narrow.php @@ -0,0 +1,25 @@ +a(?&p)?b))\A'.$wrapped.'\z/'; + $ok=@preg_match($pat,'',$m); + if($ok===false){ printf("FAIL at N=%d captures, pattern=%s bytes: %s\n",$N,number_format(strlen($pat)),preg_last_error_msg()); break; } +} + +// (2) JIT compile of the minimal: does jit add a tighter limit? 
+echo "\n== Same minimal but force a real match (JIT) ==\n"; +for($N=1000;$N<=20000;$N+=1000){ + $wrapped=str_repeat('((?&p))',$N); + $pat='/(?(DEFINE)(?<p>a(?&p)?b))\A'.$wrapped.'\z/'; + $ok=@preg_match($pat,str_repeat('ab',$N),$m); + if($ok===false){ printf("FAIL at N=%d, pattern=%s bytes: %s\n",$N,number_format(strlen($pat)),preg_last_error_msg()); break; } +} +echo "done\n"; diff --git a/experiments/pcre2-capture-trace/wall-b-hot.php b/experiments/pcre2-capture-trace/wall-b-hot.php new file mode 100644 index 000000000..f72055532 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-b-hot.php @@ -0,0 +1,58 @@ +lowest_non_terminal_id; $rules=$grammar->rules; +$single=$grammar->single_candidate_rules ?? array(); +$select_rid=$grammar->get_rule_id('selectStatement'); $into=tch(WP_MySQL_Lexer::INTO_SYMBOL); +$compiled=array(); +$compile=function($rid)use(&$compiled,$rules,$low_nt,$single,$select_rid,$into){ + if(isset($compiled[$rid]))return $compiled[$rid]; $alts=array(); $safe=isset($single[$rid]); + foreach($rules[$rid] as $branch){ $alt=''; foreach($branch as $i=>$sym){ $alt.= $sym<$low_nt?tch($sym):"RREF{$sym}RREF"; if($i===0&&$safe)$alt.='(*THEN)'; } $alts[]=$alt; } + $body='(?:'.implode('|',$alts).')'; if($rid===$select_rid)$body.='(?!'.$into.')'; return $compiled[$rid]=$body; }; +foreach(array_keys($rules) as $rid)$compile($rid); +do{ $changed=false; $refs=array(); foreach($compiled as $rid=>$b)$refs[$rid]=0; + foreach($compiled as $b){ if(preg_match_all('/RREF(\d+)RREF/',$b,$m))foreach($m[1] as $r)$refs[(int)$r]=($refs[(int)$r]??0)+1; } + foreach($compiled as $rid=>$b){ if(($refs[$rid]??0)!==1)continue; if(strpos($b,"RREF{$rid}RREF")!==false)continue; + foreach($compiled as $crid=>$cb){ if(strpos($cb,"RREF{$rid}RREF")!==false){ $compiled[$crid]=str_replace("RREF{$rid}RREF",$b,$cb); unset($compiled[$rid]); $changed=true; break 2; } } } +}while($changed); +$rule_to_idx=array(); $idx_to_rule=array(); +foreach($compiled as $rid=>$_){ $rule_to_idx[$rid]=count($idx_to_rule); $idx_to_rule[]=$rid; } +$start_rid=$grammar->get_rule_id('query'); +// Hot rules: expression cascade. 
Find their rule ids if present after inlining. +$hot_names=array('expr','boolPri','predicate','bitExpr','simpleExpr','exprList','primaryExpr'); +$hot_idx=array(); +foreach($hot_names as $nm){ $rid=@$grammar->get_rule_id($nm); if($rid!==null&&isset($rule_to_idx[$rid]))$hot_idx[$rule_to_idx[$rid]]=$nm; } +echo "Hot rules surviving inlining: ".implode(',',array_values($hot_idx))." (".count($hot_idx)." rules)\n"; + +$build=function($ncap,$hot_only)use($compiled,$idx_to_rule,$rule_to_idx,$start_rid,$hot_idx){ + $wrapped=0; $define=''; + foreach($idx_to_rule as $rid){ $body=$compiled[$rid]; + $body=preg_replace_callback('/RREF(\d+)RREF/',function($m)use($rule_to_idx,&$wrapped,$ncap,$hot_only,$hot_idx){ + $tgt=$rule_to_idx[(int)$m[1]]; $ref='(?&r'.$tgt.')'; + $eligible = $hot_only ? isset($hot_idx[$tgt]) : true; + if($eligible && $wrapped<$ncap){$wrapped++;return '('.$ref.')';} return $ref; },$body); + $define.="(?{$body})"; } + return array('/(?(DEFINE)'.$define.')\A(?&r'.$rule_to_idx[$start_rid].')\z/u',$wrapped); }; + +$h=fopen("$base/tests/mysql/data/mysql-server-tests-queries.csv",'r'); $queries=array(); $hdr=true; +while(($r=fgetcsv($h,null,',','"','\\'))!==false){ if($hdr){$hdr=false;continue;} if($r[0]!==null)$queries[]=$r[0]; if(count($queries)>=30000)break; } +fclose($h); +$enc=array(); foreach($queries as $q){ $s=''; foreach((new WP_MySQL_Lexer($q))->remaining_tokens() as $t)$s.=tch($t->id); $enc[]=$s; } +$n=count($enc); +$bench=function($pat)use($enc,$n){ $run=function()use($pat,$enc){ $s=microtime(true); foreach($enc as $e)@preg_match($pat,$e); return microtime(true)-$s; }; + for($i=0;$i<2;$i++)$run(); $qs=array(); for($r=0;$r<7;$r++)$qs[]=$n/$run(); sort($qs); return $qs[count($qs)-1]; }; + +foreach(array(array(0,false),array(6,true),array(50,true),array(200,true)) as $cfg){ + list($ncap,$hot)=$cfg; list($pat,$w)=$build($ncap,$hot); + @preg_match($pat,"\xff"); $qps=$bench($pat); + printf("captures=%-3d (hot=%s, wrapped=%d) validate=%d 
QPS\n",$ncap,$hot?'y':'n',$w,$qps); +} diff --git a/experiments/pcre2-capture-trace/wall-b-jit-captures.php b/experiments/pcre2-capture-trace/wall-b-jit-captures.php new file mode 100644 index 000000000..758144ab4 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-b-jit-captures.php @@ -0,0 +1,57 @@ +lowest_non_terminal_id; $rules=$grammar->rules; +$single=$grammar->single_candidate_rules ?? array(); +$select_rid=$grammar->get_rule_id('selectStatement'); $into=tch(WP_MySQL_Lexer::INTO_SYMBOL); +$compiled=array(); +$compile=function($rid)use(&$compiled,$rules,$low_nt,$single,$select_rid,$into){ + if(isset($compiled[$rid]))return $compiled[$rid]; $alts=array(); $safe=isset($single[$rid]); + foreach($rules[$rid] as $branch){ $alt=''; foreach($branch as $i=>$sym){ $alt.= $sym<$low_nt?tch($sym):"RREF{$sym}RREF"; if($i===0&&$safe)$alt.='(*THEN)'; } $alts[]=$alt; } + $body='(?:'.implode('|',$alts).')'; if($rid===$select_rid)$body.='(?!'.$into.')'; return $compiled[$rid]=$body; }; +foreach(array_keys($rules) as $rid)$compile($rid); +do{ $changed=false; $refs=array(); foreach($compiled as $rid=>$b)$refs[$rid]=0; + foreach($compiled as $b){ if(preg_match_all('/RREF(\d+)RREF/',$b,$m))foreach($m[1] as $r)$refs[(int)$r]=($refs[(int)$r]??0)+1; } + foreach($compiled as $rid=>$b){ if(($refs[$rid]??0)!==1)continue; if(strpos($b,"RREF{$rid}RREF")!==false)continue; + foreach($compiled as $crid=>$cb){ if(strpos($cb,"RREF{$rid}RREF")!==false){ $compiled[$crid]=str_replace("RREF{$rid}RREF",$b,$cb); unset($compiled[$rid]); $changed=true; break 2; } } } +}while($changed); +$rule_to_idx=array(); $idx_to_rule=array(); +foreach($compiled as $rid=>$_){ $rule_to_idx[$rid]=count($idx_to_rule); $idx_to_rule[]=$rid; } +$start_rid=$grammar->get_rule_id('query'); +$build=function($ncap)use($compiled,$idx_to_rule,$rule_to_idx,$start_rid){ + $wrapped=0; $define=''; + foreach($idx_to_rule as $rid){ $body=$compiled[$rid]; + 
$body=preg_replace_callback('/RREF(\d+)RREF/',function($m)use($rule_to_idx,&$wrapped,$ncap){ + $ref='(?&r'.$rule_to_idx[(int)$m[1]].')'; if($wrapped<$ncap){$wrapped++;return '('.$ref.')';} return $ref; },$body); + $define.="(?{$body})"; } + return '/(?(DEFINE)'.$define.')\A(?&r'.$rule_to_idx[$start_rid].')\z/u'; }; + +// Load + encode 30K queries. +$h=fopen("$base/tests/mysql/data/mysql-server-tests-queries.csv",'r'); $queries=array(); $hdr=true; +while(($r=fgetcsv($h,null,',','"','\\'))!==false){ if($hdr){$hdr=false;continue;} if($r[0]!==null)$queries[]=$r[0]; if(count($queries)>=30000)break; } +fclose($h); +$enc=array(); foreach($queries as $q){ $s=''; foreach((new WP_MySQL_Lexer($q))->remaining_tokens() as $t)$s.=tch($t->id); $enc[]=$s; } +$n=count($enc); + +$bench=function($pat)use($enc,$n){ + $run=function()use($pat,$enc){ $s=microtime(true); foreach($enc as $e)@preg_match($pat,$e); return microtime(true)-$s; }; + for($i=0;$i<2;$i++)$run(); $qs=array(); for($r=0;$r<7;$r++)$qs[]=$n/$run(); sort($qs); return $qs[count($qs)-1]; }; + +foreach(array(0,6) as $ncap){ + $pat=$build($ncap); + $ok=@preg_match($pat,"\xff"); // warm compile + $qps=$bench($pat); + printf("captures=%d pattern=%s bytes validate=%d QPS\n",$ncap,number_format(strlen($pat)),$qps); +} diff --git a/experiments/pcre2-capture-trace/wall-b-jitcheck.php b/experiments/pcre2-capture-trace/wall-b-jitcheck.php new file mode 100644 index 000000000..75080b3b7 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-b-jitcheck.php @@ -0,0 +1,18 @@ +<?php +// Wall B probe: a three-rule recursive arithmetic grammar (e, t, f), benchmarked +// bare ($pat0) and with its six (?&rule) call sites wrapped in captures ($pat6). +// The warm-up calls below abort if either pattern fails to compile or match, +// because a failed compile would otherwise be timed silently behind `@`. +$def0='(?<e>(?&t)(?:[+\-](?&t))*)(?<t>(?&f)(?:[*\/](?&f))*)(?<f>[0-9]+|\((?&e)\))'; +$pat0='/(?(DEFINE)'.$def0.')\A(?&e)\z/'; +// Same but wrap the 6 (?&...) call sites in captures. +$def6='(?<e>((?&t))(?:[+\-]((?&t)))*)(?<t>((?&f))(?:[*\/]((?&f)))*)(?<f>[0-9]+|\(((?&e))\))'; +$pat6='/(?(DEFINE)'.$def6.')\A((?&e))\z/'; +// Build inputs: nested expressions. 
+function gen($d){ if($d<=0)return (string)mt_rand(0,9); return '('.gen($d-1).'+'.gen($d-1).'*'.gen($d-1).')'; } +mt_srand(1); $inp=array(); for($i=0;$i<20000;$i++)$inp[]=gen(3); +$bench=function($pat)use($inp){ $run=function()use($pat,$inp){$s=microtime(true);foreach($inp as $x)@preg_match($pat,$x);return microtime(true)-$s;}; + for($i=0;$i<3;$i++)$run(); $qs=array(); for($r=0;$r<7;$r++)$qs[]=count($inp)/$run(); sort($qs); return $qs[count($qs)-1]; }; +if(1!==@preg_match($pat0,'1')){fwrite(STDERR,"pat0 did not compile/match\n");exit(1);} $q0=$bench($pat0); +if(1!==@preg_match($pat6,'1')){fwrite(STDERR,"pat6 did not compile/match\n");exit(1);} $q6=$bench($pat6); +printf("small recursive: 0-cap=%d QPS 6-cap=%d QPS ratio=%.2fx\n",$q0,$q6,$q0/$q6); diff --git a/experiments/pcre2-capture-trace/wall-b-jitsize.php b/experiments/pcre2-capture-trace/wall-b-jitsize.php new file mode 100644 index 000000000..d1969d6a1 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-b-jitsize.php @@ -0,0 +1,36 @@ + huge +// speedup; on the grammar pattern, toggling pcre.jit should change nothing if it +// can't JIT. +$base='/Users/janjakes/.superset/worktrees/SQLite/parser-perf/packages/mysql-on-sqlite'; +require_once "$base/src/parser/class-wp-parser-grammar.php"; +require_once "$base/src/parser/class-wp-parser-token.php"; +require_once "$base/src/mysql/class-wp-mysql-token.php"; +require_once "$base/src/mysql/class-wp-mysql-lexer.php"; +ini_set('pcre.backtrack_limit','1000000000'); ini_set('pcre.recursion_limit','10000000'); +const TOKEN_OFFSET=0x4000; function tch($t){return mb_chr($t+TOKEN_OFFSET,'UTF-8');} +$grammar=new WP_Parser_Grammar(require "$base/src/mysql/mysql-grammar.php"); +$low_nt=$grammar->lowest_non_terminal_id; $rules=$grammar->rules; +$single=$grammar->single_candidate_rules ?? 
array(); +$select_rid=$grammar->get_rule_id('selectStatement'); $into=tch(WP_MySQL_Lexer::INTO_SYMBOL); +$compiled=array(); +$compile=function($rid)use(&$compiled,$rules,$low_nt,$single,$select_rid,$into){ + if(isset($compiled[$rid]))return $compiled[$rid]; $alts=array(); $safe=isset($single[$rid]); + foreach($rules[$rid] as $branch){ $alt=''; foreach($branch as $i=>$sym){ $alt.= $sym<$low_nt?tch($sym):"RREF{$sym}RREF"; if($i===0&&$safe)$alt.='(*THEN)'; } $alts[]=$alt; } + $body='(?:'.implode('|',$alts).')'; if($rid===$select_rid)$body.='(?!'.$into.')'; return $compiled[$rid]=$body; }; +foreach(array_keys($rules) as $rid)$compile($rid); +do{ $changed=false; $refs=array(); foreach($compiled as $rid=>$b)$refs[$rid]=0; + foreach($compiled as $b){ if(preg_match_all('/RREF(\d+)RREF/',$b,$m))foreach($m[1] as $r)$refs[(int)$r]=($refs[(int)$r]??0)+1; } + foreach($compiled as $rid=>$b){ if(($refs[$rid]??0)!==1)continue; if(strpos($b,"RREF{$rid}RREF")!==false)continue; + foreach($compiled as $crid=>$cb){ if(strpos($cb,"RREF{$rid}RREF")!==false){ $compiled[$crid]=str_replace("RREF{$rid}RREF",$b,$cb); unset($compiled[$rid]); $changed=true; break 2; } } } +}while($changed); +$rule_to_idx=array(); $i2r=array(); foreach($compiled as $rid=>$_){ $rule_to_idx[$rid]=count($i2r); $i2r[]=$rid; } +$define=''; foreach($i2r as $rid){ $b=preg_replace_callback('/RREF(\d+)RREF/',function($m)use($rule_to_idx){return '(?&r'.$rule_to_idx[(int)$m[1]].')';},$compiled[$rid]); $define.="(?{$b})"; } +$pat='/(?(DEFINE)'.$define.')\A(?&r'.$rule_to_idx[$grammar->get_rule_id('query')].')\z/u'; +$h=fopen("$base/tests/mysql/data/mysql-server-tests-queries.csv",'r'); $q=array(); $hdr=true; +while(($r=fgetcsv($h,null,',','"','\\'))!==false){ if($hdr){$hdr=false;continue;} if($r[0]!==null)$q[]=$r[0]; if(count($q)>=10000)break; } fclose($h); +$enc=array(); foreach($q as $x){ $s=''; foreach((new WP_MySQL_Lexer($x))->remaining_tokens() as $t)$s.=tch($t->id); $enc[]=$s; } +$bench=function($pat)use($enc){ 
$run=function()use($pat,$enc){$s=microtime(true);foreach($enc as $e)@preg_match($pat,$e);return microtime(true)-$s;}; + for($i=0;$i<2;$i++)$run(); $qs=array();for($r=0;$r<5;$r++)$qs[]=count($enc)/$run(); sort($qs); return $qs[count($qs)-1]; }; +foreach(array('1','0') as $jit){ ini_set('pcre.jit',$jit); @preg_match($pat,"\xff"); printf("pcre.jit=%s grammar validate=%d QPS\n",$jit,$bench($pat)); } diff --git a/experiments/pcre2-capture-trace/wall-c-export.php b/experiments/pcre2-capture-trace/wall-c-export.php new file mode 100644 index 000000000..5e47c463d --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-c-export.php @@ -0,0 +1,35 @@ exercised by UNMATCHED_AS_NULL). + // Define N named groups g0..g(N-1); the (?&gK) subroutine calls in $start resolve to these names. + $def=''; for($i=0;$i<$N;$i++){ $def.="(?<g$i>x$i?)"; } // each is a named group that can match + // Start: match a fixed short string, then optionally call a few subroutines. + $start='\A(?&g0)(?&g1)(?&g2)'; + return '/(?(DEFINE)'.$def.')'.$start.'/'; +}; +$subject=''; // g0,g1,g2 are all 'xK?' so empty subject matches (zero-length) +// NOTE(review): as written, 'x$i?' makes only the digit optional, so the empty subject appears to yield no match (preg_match returns 0, which the ===false check below lets through) - confirm this is the intended measurement. +foreach(array(100,400,800,1200,1400,1600) as $N){ + $pat=$mk($N); + $ok=@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); + if($ok===false){ printf("N=%d FAIL: %s\n",$N,preg_last_error_msg()); continue; } + $named=count($m); + // Bench: with export vs without. 
+ $iters=100000; + // warm + for($i=0;$i<5000;$i++)@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); + $best_exp=INF; for($r=0;$r<5;$r++){ $s=microtime(true); for($i=0;$i<$iters;$i++)@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); $d=microtime(true)-$s; if($d<$best_exp)$best_exp=$d; } + for($i=0;$i<5000;$i++)@preg_match($pat,$subject); + $best_no=INF; for($r=0;$r<5;$r++){ $s=microtime(true); for($i=0;$i<$iters;$i++)@preg_match($pat,$subject); $d=microtime(true)-$s; if($d<$best_no)$best_no=$d; } + printf("N=%-4d matches-exported=%-4d with-export=%.2f us/call no-export=%.2f us/call export-cost=%.2f us\n", + $N,$named,$best_exp/$iters*1e6,$best_no/$iters*1e6,($best_exp-$best_no)/$iters*1e6); +} diff --git a/experiments/pcre2-capture-trace/wall-c-export2.php b/experiments/pcre2-capture-trace/wall-c-export2.php new file mode 100644 index 000000000..6930ea612 --- /dev/null +++ b/experiments/pcre2-capture-trace/wall-c-export2.php @@ -0,0 +1,25 @@ +x)? . Subject 'x' matches only g0 via + // the first; rest stay unmatched -> exported as null (or [null,-1] w/ offsets). 
+ // Each iteration appends one optional NAMED group g<i>; with subject 'x' only g0 participates, the rest are exported as unmatched. + $body=''; for($i=0;$i<$N;$i++){ $body.="(?<g$i>x)?"; } + return '/\A'.$body.'\z/'; +}; +$subject='x'; // matches g0; g1..g(N-1) unmatched +foreach(array(100,400,800,1200,1400,1600,2000) as $N){ + $pat=$mk($N); + $ok=@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); + if($ok===false){ printf("N=%d FAIL: %s\n",$N,preg_last_error_msg()); continue; } + $named=count($m); + $iters=50000; + for($i=0;$i<3000;$i++)@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); + $be=INF; for($r=0;$r<5;$r++){ $s=microtime(true); for($i=0;$i<$iters;$i++)@preg_match($pat,$subject,$m,PREG_OFFSET_CAPTURE|PREG_UNMATCHED_AS_NULL); $d=microtime(true)-$s; if($d<$be)$be=$d; } + for($i=0;$i<3000;$i++)@preg_match($pat,$subject); + $bn=INF; for($r=0;$r<5;$r++){ $s=microtime(true); for($i=0;$i<$iters;$i++)@preg_match($pat,$subject); $d=microtime(true)-$s; if($d<$bn)$bn=$d; } + printf("N=%-4d exported=%-5d with-export=%.2f us no-export=%.2f us export-only=%.2f us\n", + $N,$named,$be/$iters*1e6,$bn/$iters*1e6,($be-$bn)/$iters*1e6); +} From 03257badce50a2639e0edaced6495db235978033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 13/23] Add PCRE2-callouts-via-FFI experiment and correction Binding a PHP closure to pcre2_set_callout_8 works and yields a structural trace (~314K/63K/29K QPS by callout density). Corrects an earlier probe (dea9df7) that wrongly concluded callouts were blocked. Needs PHP 7.4+ FFI. 
--- experiments/pcre2-callouts-ffi/NOTES.md | 36 ++++ .../pcre2-callouts-ffi/bench-callout.php | 176 ++++++++++++++++++ .../pcre2-callouts-ffi/exp-pcre-ffi-stale.php | 164 ++++++++++++++++ .../pcre2-callouts-ffi/probe-callout.php | 149 +++++++++++++++ 4 files changed, 525 insertions(+) create mode 100644 experiments/pcre2-callouts-ffi/NOTES.md create mode 100644 experiments/pcre2-callouts-ffi/bench-callout.php create mode 100644 experiments/pcre2-callouts-ffi/exp-pcre-ffi-stale.php create mode 100644 experiments/pcre2-callouts-ffi/probe-callout.php diff --git a/experiments/pcre2-callouts-ffi/NOTES.md b/experiments/pcre2-callouts-ffi/NOTES.md new file mode 100644 index 000000000..bf723348a --- /dev/null +++ b/experiments/pcre2-callouts-ffi/NOTES.md @@ -0,0 +1,36 @@ +# PCRE2 callouts via FFI + +**Origin:** the working probe (`probe-callout.php`, `bench-callout.php`) was rebuilt +fresh. `exp-pcre-ffi-stale.php` is the earlier `_parser_perf` probe (commit dea9df7) +whose conclusion is WRONG — kept here only to record the correction. No PR. + +**Idea:** PCRE2 user callouts ((?C) markers) fire a C callback during matching. With +a callout at every rule entry, one `pcre2_match_8` yields a full trace of which +alternative entered at each rule; a linear walk reconstructs the AST. Stock PHP +doesn't expose `pcre2_set_callout`, so the bridge is PHP FFI. + +**Correction:** the stale probe concluded "PHP FFI cannot bind a closure to a C +function pointer → callouts blocked." That is FALSE. The correct idiom is to pass +the PHP closure DIRECTLY as the function-pointer argument to `pcre2_set_callout_8` +(PHP FFI builds a libffi trampoline) — NOT `FFI::cast('cb_t', $closure)`. Verified +on PHP 8.5.5 + libpcre2-8 10.47: matching `1+2*3` against a recursive arithmetic +grammar produced a correct (rule, position) trace from a single match. + +**Run:** `php -d ...jit... probe-callout.php` (shows the trace); `bench-callout.php` +(throughput by callout density). 
+ +**Result (trace-building closure, best-of-7):** + +| input | callouts/match | QPS | +|----------|----------------|-------| +| 10 tok | ~35 | ~314K | +| 50 tok | ~175 | ~63K | +| 100 tok | ~350 | ~29K | + +Per-callout overhead ~50 ns (libffi trampoline + Zend re-entry); register the +closure once per request and reuse the match context (per-registration leak). + +**Verdict:** Real and powerful WHERE AVAILABLE — a callout-emitting grammar regex + +trace-driven AST builder is a genuine architecture. But FFI was introduced in PHP +7.4 (so PHP 7.2/7.3 have none) and `ffi.enable` is routinely disabled on shared/ +managed WP hosting. The deployment story rules it out as a default. diff --git a/experiments/pcre2-callouts-ffi/bench-callout.php b/experiments/pcre2-callouts-ffi/bench-callout.php new file mode 100644 index 000000000..7d85ba7c2 --- /dev/null +++ b/experiments/pcre2-callouts-ffi/bench-callout.php @@ -0,0 +1,176 @@ +// Recursive arithmetic grammar: each rule is a named group inside (?(DEFINE)...) and carries a numbered callout (?C1)..(?C4) at its entry. +$pattern = '(?(DEFINE)' + . '(?<sum>(?C1)(?&product)(?:\+(?&sum))?)' + . '(?<product>(?C2)(?&atom)(?:\*(?&product))?)' + . '(?<atom>(?C3)\d+|(?C4)\((?&sum)\))' + . ')^(?&sum)$'; + +/** Compiles $pattern through the FFI-bound pcre2_compile_8 and returns the compiled code handle; prints the PCRE2 error message to STDERR and exits with status 1 when compilation fails. */ +function compile_pattern( FFI $ffi, string $pattern ) { + $err_code = $ffi->new( 'int' ); + $err_off = $ffi->new( 'size_t' ); + $pat_arr = $ffi->new( 'char[' . strlen( $pattern ) . ']', false ); + FFI::memcpy( $pat_arr, $pattern, strlen( $pattern ) ); + $code = $ffi->pcre2_compile_8( + $ffi->cast( 'PCRE2_SPTR8', FFI::addr( $pat_arr ) ), + strlen( $pattern ), + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null + ); + FFI::free( FFI::addr( $pat_arr ) ); + if ( null === $code ) { + $buf = $ffi->new( 'char[256]' ); + $ffi->pcre2_get_error_message_8( $err_code->cdata, $ffi->cast( 'PCRE2_UCHAR8 *', FFI::addr( $buf ) ), 256 ); + fwrite( STDERR, 'compile failed: ' . FFI::string( FFI::addr( $buf ) ) . "\n" ); + exit( 1 ); + } + return $code; +} + +// Build a subject of roughly N tokens: "1+1+...+1*1" style alternating ops. 
+function make_subject( int $tokens ) { + $parts = array(); + for ( $i = 0; $i < $tokens; $i++ ) { + $parts[] = (string) ( ( $i % 9 ) + 1 ); + } + // Join with alternating + and * so the grammar exercises both rules. + $out = $parts[0]; + for ( $i = 1; $i < count( $parts ); $i++ ) { + $out .= ( $i % 2 ) ? '+' : '*'; + $out .= $parts[ $i ]; + } + return $out; +} + +$jit = getenv( 'USE_JIT_COMPILE' ) === '1'; + +$code = compile_pattern( $ffi, $pattern ); +if ( $jit ) { + $rc = $ffi->pcre2_jit_compile_8( $code, 1 ); // PCRE2_JIT_COMPLETE + fwrite( STDERR, "pcre2_jit_compile rc=$rc\n" ); +} + +// Reused across ALL iterations. +$mctx = $ffi->pcre2_match_context_create_8( null ); +$mdata = $ffi->pcre2_match_data_create_from_pattern_8( $code, null ); + +$trace = array(); +$callout = function ( $blockptr, $data ) use ( &$trace ) { + $blk = $blockptr[0]; + $trace[] = array( $blk->callout_number, $blk->current_position ); + return 0; +}; +$ffi->pcre2_set_callout_8( $mctx, $callout, null ); + +$sizes = array( 10, 50, 100 ); +echo str_pad( 'tokens', 8 ) . str_pad( 'callouts', 10 ) . str_pad( 'QPS', 12 ) . "\n"; + +foreach ( $sizes as $tokens ) { + $subject = make_subject( $tokens ); + $slen = strlen( $subject ); + $subj = $ffi->new( 'char[' . $slen . ']', false ); + FFI::memcpy( $subj, $subject, $slen ); + $subj_ptr = $ffi->cast( 'PCRE2_SPTR8', FFI::addr( $subj ) ); + + // Warm + capture callout count. 
+ $trace = array(); + $rc = $ffi->pcre2_match_8( $code, $subj_ptr, $slen, 0, 0, $mdata, $mctx ); + if ( $rc < 0 ) { + fwrite( STDERR, "no match for tokens=$tokens (rc=$rc), subj=$subject\n" ); + FFI::free( FFI::addr( $subj ) ); + continue; + } + $callout_count = count( $trace ); + + $best = 0.0; + for ( $run = 0; $run < 7; $run++ ) { + $iters = 2000; + $t0 = hrtime( true ); + for ( $i = 0; $i < $iters; $i++ ) { + $trace = array(); + $ffi->pcre2_match_8( $code, $subj_ptr, $slen, 0, 0, $mdata, $mctx ); + } + $dt = ( hrtime( true ) - $t0 ) / 1e9; + $qps = $iters / $dt; + if ( $qps > $best ) { + $best = $qps; + } + } + + printf( "%-8d%-10d%-12s\n", $tokens, $callout_count, number_format( $best, 0 ) ); + FFI::free( FFI::addr( $subj ) ); +} diff --git a/experiments/pcre2-callouts-ffi/exp-pcre-ffi-stale.php b/experiments/pcre2-callouts-ffi/exp-pcre-ffi-stale.php new file mode 100644 index 000000000..df18c8590 --- /dev/null +++ b/experiments/pcre2-callouts-ffi/exp-pcre-ffi-stale.php @@ -0,0 +1,164 @@ +pcre2_compile_8( + FFI::cast( 'PCRE2_SPTR8', FFI::addr( FFI::new( 'char[' . strlen( $pat_buf ) . ']' ) ) ), + 0, // We'll set length below in real code. + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null +); + +// The above is wrong because we didn't actually copy the pattern bytes +// into the buffer. Let's do it properly. +$pat_arr = $ffi->new( 'char[' . strlen( $pat_buf ) . 
']' ); +FFI::memcpy( $pat_arr, $pat_buf, strlen( $pat_buf ) ); +$code = $ffi->pcre2_compile_8( + FFI::cast( 'PCRE2_SPTR8', FFI::addr( $pat_arr ) ), + strlen( $pat_buf ), + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null +); +if ( null === $code ) { + $buf = $ffi->new( 'char[256]' ); + $ffi->pcre2_get_error_message_8( $err_code->cdata, FFI::cast( 'PCRE2_UCHAR8 *', FFI::addr( $buf ) ), 256 ); + echo 'compile failed: code=', $err_code->cdata, ' offset=', $err_off->cdata, ' msg=', FFI::string( FFI::addr( $buf ) ), "\n"; + exit( 1 ); +} +echo "Pattern compiled OK\n"; + +// Try setting up a callout via FFI. +$callout_log = array(); +$mctx = $ffi->pcre2_match_context_create_8( null ); +$callout_cb = function ( $blockptr, $data ) use ( &$callout_log ) { + // $blockptr is FFI\CData type pcre2_callout_block_8*. + $blk = $blockptr; + $callout_log[] = array( + 'num' => $blk->callout_number, + 'pos' => $blk->current_position, + 'mat' => $blk->start_match, + ); + return 0; // continue matching +}; +// Cast our PHP closure to a C function pointer. PHP FFI supports this +// for callbacks via `FFI::cast` on a closure. +$cb_type = 'int (*)(pcre2_callout_block_8 *, void *)'; +echo "Trying to bind callout callback...\n"; +try { + $cb_ffi = $ffi->new( $cb_type ); + echo "Callback type created.\n"; + // PHP FFI does not directly support binding a closure to a function + // pointer in arbitrary C signatures - this typically needs a Zend + // FFI extension feature or libffi closures. +} catch ( \Throwable $e ) { + echo 'Could not bind: ', $e->getMessage(), "\n"; +} + +// Even attempting to call pcre2_set_callout_8 with a closure tends to +// fail. Document and stop. 
+echo "\nConclusion: PHP FFI cannot bind a PHP callback to a C function pointer in stock PHP, so it cannot supply a PCRE2 callout function.\n"; diff --git a/experiments/pcre2-callouts-ffi/probe-callout.php b/experiments/pcre2-callouts-ffi/probe-callout.php new file mode 100644 index 000000000..51d4d225b --- /dev/null +++ b/experiments/pcre2-callouts-ffi/probe-callout.php @@ -0,0 +1,149 @@ +// Recursive arithmetic grammar: each rule is a named group inside (?(DEFINE)...) and carries a numbered callout (?C1)..(?C4) at its entry. +$pattern = '(?(DEFINE)' + . '(?<sum>(?C1)(?&product)(?:\+(?&sum))?)' + . '(?<product>(?C2)(?&atom)(?:\*(?&product))?)' + . '(?<atom>(?C3)\d+|(?C4)\((?&sum)\))' + . ')^(?&sum)$'; + +$err_code = $ffi->new( 'int' ); +$err_off = $ffi->new( 'size_t' ); +$pat_arr = $ffi->new( 'char[' . strlen( $pattern ) . ']', false ); +FFI::memcpy( $pat_arr, $pattern, strlen( $pattern ) ); + +$code = $ffi->pcre2_compile_8( + $ffi->cast( 'PCRE2_SPTR8', FFI::addr( $pat_arr ) ), + strlen( $pattern ), + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null +); +if ( null === $code ) { + $buf = $ffi->new( 'char[256]' ); + $ffi->pcre2_get_error_message_8( $err_code->cdata, $ffi->cast( 'PCRE2_UCHAR8 *', FFI::addr( $buf ) ), 256 ); + fwrite( STDERR, 'compile failed: code=' . $err_code->cdata . ' offset=' . $err_off->cdata . ' msg=' . FFI::string( FFI::addr( $buf ) ) . "\n" ); + exit( 1 ); +} +echo "Pattern compiled OK\n"; + +$mctx = $ffi->pcre2_match_context_create_8( null ); + +// Callout handler: appends one record per callout to $trace and returns 0 so matching continues. +$trace = array(); +$callout = function ( $blockptr, $data ) use ( &$trace ) { + $blk = $blockptr[0]; // deref pcre2_callout_block_8* + $trace[] = array( + 'num' => $blk->callout_number, + 'pos' => $blk->pattern_position, + 'cur' => $blk->current_position, + 'cap' => $blk->capture_last, + ); + return 0; // continue matching +}; + +// THE CORRECT IDIOM: pass the closure DIRECTLY as the function pointer. +$rc = $ffi->pcre2_set_callout_8( $mctx, $callout, null ); +echo "pcre2_set_callout_8 rc=$rc\n"; + +$subject = '1+2*3'; +$mdata = $ffi->pcre2_match_data_create_from_pattern_8( $code, null ); +$subj = $ffi->new( 'char[' . strlen( $subject ) . 
']', false ); +FFI::memcpy( $subj, $subject, strlen( $subject ) ); + +$rc = $ffi->pcre2_match_8( + $code, + $ffi->cast( 'PCRE2_SPTR8', FFI::addr( $subj ) ), + strlen( $subject ), + 0, + 0, + $mdata, + $mctx +); + +echo "match rc=$rc (>=0 means matched)\n"; +echo 'callout fired ' . count( $trace ) . " times\n"; +$rulemap = array( 1 => 'sum', 2 => 'product', 3 => 'atom(num)', 4 => 'atom(paren)' ); +foreach ( $trace as $i => $t ) { + printf( + " [%2d] C%d %-12s subject_pos=%d\n", + $i, + $t['num'], + $rulemap[ $t['num'] ] ?? '?', + $t['cur'] + ); +} From 810a499442a1c42589e53805be1b340bfb1b5c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:01 +0200 Subject: [PATCH 14/23] Add preg_replace_callback shift-reduce experiment Mega-pattern of 4223 RHS alternatives. One no-op pass already ~2.5x slower than the parser; epsilon branches block bottom-up reduction. --- .../NOTES.md | 20 +++ .../bench-13.php | 136 ++++++++++++++++++ .../build-mega.php | 122 ++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 experiments/preg-replace-callback-shiftreduce/NOTES.md create mode 100644 experiments/preg-replace-callback-shiftreduce/bench-13.php create mode 100644 experiments/preg-replace-callback-shiftreduce/build-mega.php diff --git a/experiments/preg-replace-callback-shiftreduce/NOTES.md b/experiments/preg-replace-callback-shiftreduce/NOTES.md new file mode 100644 index 000000000..22439a507 --- /dev/null +++ b/experiments/preg-replace-callback-shiftreduce/NOTES.md @@ -0,0 +1,20 @@ +# Iterative preg_replace_callback shift-reduce + +**Origin:** ephemeral exploration (rebuilt fresh). No PR/commit. + +**Idea:** build a mega-pattern = alternation of every rule right-hand side; each +`preg_replace_callback` pass reduces matches into non-terminal placeholders; +iterate until the input collapses to the start symbol. `build-mega.php` is the +shared builder (also used by `../binary-bottomup-reduction/`). + +**Run:** `php -d ...jit... 
bench-13.php` + +**Result:** mega-pattern = 4223 alternatives, ~30 KB, JIT-compiles. Per-call costs: +preg_match (first) ~278K QPS; preg_match_all ~22K; preg_replace_callback no-op +~21K; parser baseline ~56K. + +**Verdict:** even one no-op `preg_replace_callback` pass is ~2.5× slower than the +parser; a real reducer needs 6+ passes plus AST building (~4K QPS, ~15× slower). +The "find all non-overlapping matches" cost dominates per call — a native C +function is not free. Structural blocker: ~25% of rules have an empty (ε) branch, +which bottom-up reduction can't synthesize. diff --git a/experiments/preg-replace-callback-shiftreduce/bench-13.php b/experiments/preg-replace-callback-shiftreduce/bench-13.php new file mode 100644 index 000000000..86ec50f46 --- /dev/null +++ b/experiments/preg-replace-callback-shiftreduce/bench-13.php @@ -0,0 +1,136 @@ +lowest_non_terminal_id; +$rules = $grammar->rules; + +$alts = array(); +foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + if ( count( $branch ) === 0 ) { + continue; + } + $s = ''; + foreach ( $branch as $sym ) { + $s .= ( $sym < $low_nt ) ? tok_char( $sym ) : nt_char( $sym ); + } + $alts[] = preg_quote( $s, '/' ); + } +} +usort( + $alts, + function ( $a, $b ) { + return strlen( $b ) - strlen( $a ); + } +); +$pattern = '/(?:' . implode( '|', $alts ) . ')/u'; + +ini_set( 'pcre.jit', '1' ); +ini_set( 'pcre.backtrack_limit', '100000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); + +// Load + encode corpus (token-only, no non-terminals: this is the raw input +// a bottom-up reducer would see on pass 1). +$limit = (int) ( $argv[1] ?? 5000 ); +$handle = fopen( __DIR__ . '/../../corpus/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + $q = $r[0] ?? 
null; + if ( null === $q || '' === $q ) { + continue; + } + $queries[] = $q; + if ( count( $queries ) >= $limit ) { + break; + } +} +fclose( $handle ); + +$encoded = array(); +foreach ( $queries as $q ) { + $tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); + $s = ''; + foreach ( $tokens as $t ) { + $s .= tok_char( $t->id ); + } + $encoded[] = $s; +} +$n = count( $encoded ); + +$identity = function ( $m ) { + return $m[0]; +}; + +$ops = array( + 'preg_match (first)' => function () use ( $pattern, $encoded ) { + foreach ( $encoded as $s ) { + preg_match( $pattern, $s ); + } + }, + 'preg_match_all (no off)' => function () use ( $pattern, $encoded ) { + foreach ( $encoded as $s ) { + preg_match_all( $pattern, $s, $m ); + } + }, + 'preg_match_all (offsets)' => function () use ( $pattern, $encoded ) { + foreach ( $encoded as $s ) { + preg_match_all( $pattern, $s, $m, PREG_OFFSET_CAPTURE ); + } + }, + 'preg_replace_callback noop' => function () use ( $pattern, $encoded, $identity ) { + foreach ( $encoded as $s ) { + preg_replace_callback( $pattern, $identity, $s ); + } + }, +); + +$warmup = 2; +$runs = 7; +printf( "corpus encoded: %d queries\n", $n ); +foreach ( $ops as $name => $fn ) { + for ( $w = 0; $w < $warmup; $w++ ) { + $fn(); + } + $best = INF; + for ( $r = 0; $r < $runs; $r++ ) { + $t = microtime( true ); + $fn(); + $d = microtime( true ) - $t; + $best = min( $best, $d ); + } + printf( "%-28s %8.0f QPS\n", $name, $n / $best ); +} diff --git a/experiments/preg-replace-callback-shiftreduce/build-mega.php b/experiments/preg-replace-callback-shiftreduce/build-mega.php new file mode 100644 index 000000000..55460748b --- /dev/null +++ b/experiments/preg-replace-callback-shiftreduce/build-mega.php @@ -0,0 +1,122 @@ + mb_chr(id + 0x4000) +const NT_OFFSET = 0x40000; // non-terminals: rule id -> mb_chr(rid + 0x40000) (separate plane) + +function tok_char( $tid ) { + return mb_chr( $tid + TOKEN_OFFSET, 'UTF-8' ); +} +function nt_char( $rid ) { + return mb_chr( 
$rid + NT_OFFSET, 'UTF-8' ); +} + +$grammar = new WP_Parser_Grammar( require "$src/mysql/mysql-grammar.php" ); +$low_nt = $grammar->lowest_non_terminal_id; +$rules = $grammar->rules; + +/** + * Build the mega-pattern alternation over UTF-8 codepoint encoding. + * Returns [pattern, alt_count, empty_branch_count]. + */ +function build_mega_utf8( $rules, $low_nt ) { + $alts = array(); + $empty = 0; + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + if ( count( $branch ) === 0 ) { + ++$empty; + continue; // epsilon branch: cannot be a bottom-up RHS pattern. + } + $s = ''; + foreach ( $branch as $sym ) { + $s .= ( $sym < $low_nt ) ? tok_char( $sym ) : nt_char( $sym ); + } + $alts[] = preg_quote( $s, '/' ); + } + } + // Sort longest-first so the alternation prefers maximal RHS (greedy reduce). + usort( + $alts, + function ( $a, $b ) { + return strlen( $b ) - strlen( $a ); + } + ); + $pattern = '/(?:' . implode( '|', $alts ) . ')/u'; + return array( $pattern, count( $alts ), $empty ); +} + +if ( ( $argv[1] ?? '' ) === 'info' ) { + list( $pattern, $altc, $empty ) = build_mega_utf8( $rules, $low_nt ); + printf( "rules=%d\n", count( $rules ) ); + printf( "alt_count (non-empty branches)=%d\n", $altc ); + printf( "empty/epsilon branches=%d\n", $empty ); + printf( "pattern bytes=%s\n", number_format( strlen( $pattern ) ) ); + + ini_set( 'pcre.jit', '1' ); + $t = microtime( true ); + $ok = @preg_match( $pattern, "\xff", $m ); + printf( + "compile: %.2fms ok=%s err=%s\n", + ( microtime( true ) - $t ) * 1000, + var_export( $ok, true ), + preg_last_error_msg() + ); + $study = @preg_match( $pattern . 'S', "\xff" ); // not jit probe; do explicit below + // JIT probe: run a tiny match many times; PCRE JIT is on if jit ini set & supported. + printf( "pcre.jit ini=%s\n", ini_get( 'pcre.jit' ) ); + echo "PCRE version: " . ( defined( 'PCRE_VERSION' ) ? PCRE_VERSION : 'n/a' ) . "\n"; +} + +if ( ( $argv[1] ?? 
'' ) === 'compile' ) { + list( $pattern, $altc, $empty ) = build_mega_utf8( $rules, $low_nt ); + ini_set( 'pcre.jit', '1' ); + // Valid UTF-8 probe (a known token codepoint). + $probe = tok_char( 1 ); + $t = microtime( true ); + $ok = preg_match( $pattern, $probe, $m ); + printf( + "compile+match valid probe: %.2fms ok=%s err=%s match=%s\n", + ( microtime( true ) - $t ) * 1000, + var_export( $ok, true ), + preg_last_error_msg(), + $ok ? '"' . bin2hex( $m[0] ) . '"' : '-' + ); + // Force a large run to confirm JIT engages without error. + $reps = 200000; + $t = microtime( true ); + for ( $i = 0; $i < $reps; $i++ ) { + preg_match( $pattern, $probe ); + } + $d = microtime( true ) - $t; + printf( "warm preg_match probe: %.0f QPS (err=%s)\n", $reps / $d, preg_last_error_msg() ); +} From 7b1311f641bbc0538b69ba8807e66681a158f578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 15/23] Add binary-encoded bottom-up reduction experiment Fixed-width binary encodings hit the same encoding-independent ~20-30K per-call floor; the 4-byte variant won't compile. Same wall, different direction. --- .../binary-bottomup-reduction/NOTES.md | 19 ++ .../binary-bottomup-reduction/bench-14.php | 187 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 experiments/binary-bottomup-reduction/NOTES.md create mode 100644 experiments/binary-bottomup-reduction/bench-14.php diff --git a/experiments/binary-bottomup-reduction/NOTES.md b/experiments/binary-bottomup-reduction/NOTES.md new file mode 100644 index 000000000..94e61c015 --- /dev/null +++ b/experiments/binary-bottomup-reduction/NOTES.md @@ -0,0 +1,19 @@ +# Bottom-up reduction with custom binary encoding + +**Origin:** ephemeral exploration (rebuilt fresh). No PR/commit. Reuses +`../preg-replace-callback-shiftreduce/build-mega.php`. 
+ +**Idea:** encode tokens and reduced non-terminals as fixed-width binary records +(4-byte type+slot, 3-byte UTF-8 codepoints, single-byte), match right-hand-side +sequences via PCRE2 in `/s` mode, and reduce iteratively — hoping a tighter +encoding beats the codepoint shift-reduce. + +**Run:** `php -d ...jit... bench-14.php` + +**Result:** across encodings the per-call floor is a ~20–30K QPS band (±~1.4× by +byte width), all below the ~56K parser. The 4-byte binary variant doesn't compile +(~72 KB pattern → PCRE2 "too large"). + +**Verdict:** same wall as the preg_replace_callback approach, from a different +direction — the per-call match-finding cost is encoding-independent, and the same +epsilon-reduction problem applies. diff --git a/experiments/binary-bottomup-reduction/bench-14.php b/experiments/binary-bottomup-reduction/bench-14.php new file mode 100644 index 000000000..c29d8360d --- /dev/null +++ b/experiments/binary-bottomup-reduction/bench-14.php @@ -0,0 +1,187 @@ + pack('n', tag) . pack('n', id): + * tag 0 = terminal, tag 1 = non-terminal. + * + * For byte4 we anchor each symbol on its 4-byte boundary implicitly: because + * every record is exactly 4 bytes and the alternation only contains whole + * records, matches always land on boundaries. /s makes '.' (unused here) span + * newlines and disables UTF-8 validation; the bytes are arbitrary binary. + * + * The claim under test: regardless of encoding, the dominant cost is PCRE2's + * "scan the subject, find all non-overlapping matches of a 4223-alt pattern" — + * a per-byte automaton walk whose cost tracks subject length, not encoding + * width. So all encodings hit the same floor. + */ + +set_error_handler( + function ( $s, $m, $f, $l ) { + // Respect the @ silence operator so probe-compile failures are catchable. 
+ if ( 0 === ( error_reporting() & $s ) ) { + return false; + } + throw new ErrorException( $m, 0, $s, $f, $l ); + } +); + +const TOKEN_OFFSET = 0x4000; +const NT_OFFSET = 0x40000; + +$src = '/Users/janjakes/.superset/worktrees/SQLite/performance/packages/mysql-on-sqlite/src'; +require_once "$src/parser/class-wp-parser-grammar.php"; +require_once "$src/parser/class-wp-parser-token.php"; +require_once "$src/mysql/class-wp-mysql-token.php"; +require_once "$src/mysql/class-wp-mysql-lexer.php"; + +$grammar = new WP_Parser_Grammar( require "$src/mysql/mysql-grammar.php" ); +$low_nt = $grammar->lowest_non_terminal_id; +$rules = $grammar->rules; + +// ---- UTF-8 codepoint encoding ---- +function u_tok( $t ) { + return mb_chr( $t + TOKEN_OFFSET, 'UTF-8' ); +} +function u_nt( $r ) { + return mb_chr( $r + NT_OFFSET, 'UTF-8' ); +} +// ---- 4-byte binary record encoding (2-byte tag + 2-byte id) ---- +function b_tok( $t ) { + return pack( 'nn', 0, $t ); +} +function b_nt( $r ) { + return pack( 'nn', 1, $r ); +} +// ---- 2-byte binary record encoding ---- +// 16-bit value, high bit = non-terminal flag, low 15 bits = id. +// Token ids and (rule_id - low_nt) both fit in 15 bits (< 32768). +function b2_tok( $t ) { + return pack( 'n', $t & 0x7fff ); +} +function b2_nt( $r ) { + return pack( 'n', 0x8000 | ( $r & 0x7fff ) ); +} +// ---- 3-byte binary record encoding (raw, not UTF-8) ---- +function b3_tok( $t ) { + return chr( 0 ) . pack( 'n', $t ); +} +function b3_nt( $r ) { + return chr( 1 ) . pack( 'n', $r ); +} + +function build_pattern( $rules, $low_nt, $tokfn, $ntfn, $flags ) { + $alts = array(); + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + if ( count( $branch ) === 0 ) { + continue; + } + $s = ''; + foreach ( $branch as $sym ) { + $s .= ( $sym < $low_nt ) ? $tokfn( $sym ) : $ntfn( $sym ); + } + $alts[] = preg_quote( $s, '/' ); + } + } + usort( + $alts, + function ( $a, $b ) { + return strlen( $b ) - strlen( $a ); + } + ); + return '/(?:' . 
implode( '|', $alts ) . ')/' . $flags; +} + +// Each encoding: [label, tok-fn, nt-fn, flags]. +$encodings = array( + 'utf8 (3B codepoint, /u)' => array( 'u_tok', 'u_nt', 'u' ), + 'byte4 (2B tag+2B id, /s)' => array( 'b_tok', 'b_nt', 's' ), + 'byte3 (1B tag+2B id, /s)' => array( 'b3_tok', 'b3_nt', 's' ), + 'byte2 (16b tag+id, /s)' => array( 'b2_tok', 'b2_nt', 's' ), +); + +ini_set( 'pcre.jit', '1' ); +ini_set( 'pcre.backtrack_limit', '100000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); + +// ---- corpus ---- +$limit = (int) ( $argv[1] ?? 5000 ); +$handle = fopen( __DIR__ . '/../../corpus/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + $q = $r[0] ?? null; + if ( null === $q || '' === $q ) { + continue; + } + $queries[] = $q; + if ( count( $queries ) >= $limit ) { + break; + } +} +fclose( $handle ); + +// Pre-lex once; encode the token stream under each encoding's terminal fn. +$token_ids = array(); +foreach ( $queries as $q ) { + $ids = array(); + foreach ( ( new WP_MySQL_Lexer( $q ) )->remaining_tokens() as $t ) { + $ids[] = $t->id; + } + $token_ids[] = $ids; +} +$n = count( $queries ); + +$identity = function ( $m ) { + return $m[0]; +}; + +function bench_noop_callback( $pattern, $encoded, $identity, $warmup = 2, $runs = 7 ) { + $fn = function () use ( $pattern, $encoded, $identity ) { + foreach ( $encoded as $s ) { + preg_replace_callback( $pattern, $identity, $s ); + } + }; + for ( $w = 0; $w < $warmup; $w++ ) { + $fn(); + } + $best = INF; + for ( $r = 0; $r < $runs; $r++ ) { + $t = microtime( true ); + $fn(); + $best = min( $best, microtime( true ) - $t ); + } + return count( $encoded ) / $best; +} + +printf( "corpus: %d queries\n\n", $n ); +printf( "%-28s %10s %8s %10s\n", 'encoding', 'pat_bytes', 'compiles', 'noop_QPS' ); +foreach ( $encodings as $label => $cfg ) { + list( $tokfn, $ntfn, $flags ) = $cfg; + $pattern = build_pattern( $rules, $low_nt, $tokfn, $ntfn, 
$flags ); + // Compile test with a valid single-symbol probe. + $probe = $tokfn( 1 ); + $ok = @preg_match( $pattern, $probe ); + if ( false === $ok ) { + printf( "%-28s %10s %8s %10s (%s)\n", $label, number_format( strlen( $pattern ) ), 'NO', '-', preg_last_error_msg() ); + continue; + } + // Encode corpus for this encoding. + $encoded = array(); + foreach ( $token_ids as $ids ) { + $s = ''; + foreach ( $ids as $id ) { + $s .= $tokfn( $id ); + } + $encoded[] = $s; + } + $qps = bench_noop_callback( $pattern, $encoded, $identity ); + printf( "%-28s %10s %8s %10.0f\n", $label, number_format( strlen( $pattern ) ), 'yes', $qps ); +} From 38868e091d9741359ce537b3d06fb35a02d672c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 16/23] Add Oniguruma capture-trees finding ONIG_MAX_CAPTURE_HISTORY_GROUP=31 (far too small) and PHP mbstring exposes no capture-tree accessor. Source finding, not runnable in PHP. --- experiments/oniguruma-capture-trees/NOTES.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 experiments/oniguruma-capture-trees/NOTES.md diff --git a/experiments/oniguruma-capture-trees/NOTES.md b/experiments/oniguruma-capture-trees/NOTES.md new file mode 100644 index 000000000..5b5655939 --- /dev/null +++ b/experiments/oniguruma-capture-trees/NOTES.md @@ -0,0 +1,19 @@ +# Oniguruma (?@...) capture trees (source finding) + +**Status:** not runnable in stock PHP — a source/headers finding, not an executed +experiment. No code. + +**Idea:** Oniguruma's `mb_ereg` engine has a feature PCRE2 lacks — +`onig_get_capture_tree` returns a structured tree of captures, including those +inside recursion (`(?@...)` capture history) — which could yield a parse tree. + +**Findings:** +- The capacity cap is real: `ONIG_MAX_CAPTURE_HISTORY_GROUP = 31` (confirmed in the + Oniguruma headers PHP links against; error + `ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY = -222`). 
31 groups is nowhere near + a grammar of this size. +- More fundamentally, PHP's mbstring exposes neither the `(?@...)` syntax option nor + any capture-tree accessor to userland — `mb_ereg` can't even enable it, and the + tree can't be read without FFI/C. + +**Verdict:** not enough capacity, and not reachable from stock PHP regardless. Dead end. From 2af208bbd07f6527c2d457c35dbd57527c2a6f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 17/23] Add strtr blind-reduction experiment strtr iterate-to-stable is ~2650x slower than hand RD: it scans the whole table per call. Dead end. --- experiments/strtr-blind-reduction/NOTES.md | 20 ++ .../strtr-blind-reduction/strtr-bench.php | 225 ++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 experiments/strtr-blind-reduction/NOTES.md create mode 100644 experiments/strtr-blind-reduction/strtr-bench.php diff --git a/experiments/strtr-blind-reduction/NOTES.md b/experiments/strtr-blind-reduction/NOTES.md new file mode 100644 index 000000000..2f01c764e --- /dev/null +++ b/experiments/strtr-blind-reduction/NOTES.md @@ -0,0 +1,20 @@ +# strtr blind reduction + +**Origin:** ephemeral exploration (rebuilt fresh, toy grammar). No PR/commit. + +**Idea:** build a `strtr` translation table whose keys are reducible right-hand-side +sequences and values are non-terminal placeholders, then iterate `strtr` to a fixed +point. `strtr` with an array does parallel substitution in C, which is fast in +principle. + +**Run:** `php -d ...jit... strtr-bench.php` (toy expression grammar, ~79K-entry table). + +**Result (ops/sec, warm JIT):** hand-written recursive descent ~7.0M; +preg_match validate-only ~23.6M; preg_replace_callback shift-reduce ~1.8M; +`strtr` iterate-to-stable ~2,600 — roughly 2,650× slower than hand RD. +Throughput is ∝ 1/table-size and independent of input length. + +**Verdict:** Dead end. 
`strtr` scans its entire translation table on every call +regardless of input, so a grammar-sized table dominates per call. The native +function is fast; the per-call whole-table scan is not. (Blind parallel +substitution also can't model ordered shift-reduce — it needs a confluent rule set.) diff --git a/experiments/strtr-blind-reduction/strtr-bench.php b/experiments/strtr-blind-reduction/strtr-bench.php new file mode 100644 index 000000000..e18e88414 --- /dev/null +++ b/experiments/strtr-blind-reduction/strtr-bench.php @@ -0,0 +1,225 @@ + E ('+'|'*') T | T + * T -> num | '(' E ')' + * + * Four recognizers over a stream of TOY TOKENS (single-char symbols): + * n = num literal, + * ( ) as themselves. + * + * (a) hand-written recursive descent (validating) + * (b) preg_match validate-only (regex for the toy language via recursion (?R)) + * (c) preg_replace_callback shift-reduce (iterate reducing an RHS -> placeholder) + * (d) strtr iterate-to-stable with a LARGE (~79K-entry) translation table, + * padded with synthetic non-matching keys to reproduce the table-scan cost. + * + * We measure QPS = recognitions per second, warm JIT, best-of-N. + */ + +// --------------------------------------------------------------------------- +// Token alphabet used by all recognizers: +// 'n' = number, '+','*','(',')' +// Reducers (c) and (d) work on these single-char symbols; non-terminals are +// encoded as single bytes too so RHS sequences are fixed strings. +// 'E' and 'T' are the non-terminals. +// --------------------------------------------------------------------------- + +/** (a) Hand-written recursive descent. Returns true if the token string is a valid E. 
*/ +function rd_parse(string $s): bool { + $pos = 0; + $len = strlen($s); + $ok = rd_E($s, $pos, $len); + return $ok && $pos === $len; +} +function rd_E(string $s, int &$pos, int $len): bool { + if (!rd_T($s, $pos, $len)) return false; + while ($pos < $len && ($s[$pos] === '+' || $s[$pos] === '*')) { + $pos++; + if (!rd_T($s, $pos, $len)) return false; + } + return true; +} +function rd_T(string $s, int &$pos, int $len): bool { + if ($pos >= $len) return false; + $c = $s[$pos]; + if ($c === 'n') { $pos++; return true; } + if ($c === '(') { + $pos++; + if (!rd_E($s, $pos, $len)) return false; + if ($pos >= $len || $s[$pos] !== ')') return false; + $pos++; + return true; + } + return false; +} + +/** (b) preg_match validate-only. PCRE recursive subpattern for the toy language. */ +$GLOBALS['toy_re'] = '/^(?(?n|\((?&E)\))(?:[+*](?&T))*)$/'; +function pcre_validate(string $s): bool { + return (bool) preg_match($GLOBALS['toy_re'], $s); +} + +/** (c) preg_replace_callback shift-reduce. + * Repeatedly reduce reducible RHS sequences to a single non-terminal until + * the string is exactly "E" (accept) or no further reduction applies (reject). + * RHS handled: T->n , T->(E) , E->T , E->E[+*]T + */ +function prc_reduce(string $s): bool { + // Same confluent rule set as the strtr reducer, but driven by a regex that + // matches any reducible RHS; iterate to a fixed point. Each call rewrites all + // non-overlapping matches found in the current string (callback re-derives the + // replacement), and we loop until the string stops changing. + // n -> E ; (E) -> E ; E+E -> E ; E*E -> E + $re = '/\(E\)|E[+*]E|n/'; + $guard = 0; + while (true) { + $s = preg_replace_callback($re, static function () { return 'E'; }, $s, -1, $count); + if ($count === 0) break; + if (++$guard > 100000) break; + } + return $s === 'E'; +} + +/** (d) strtr iterate-to-stable with a LARGE padded table. 
+ * The reduction rules are the same RHS->placeholder rewrites, but applied via + * strtr() against a table padded to ~PAD_TARGET entries with synthetic + * non-matching keys, to reproduce the "whole-table scan per call" cost. + */ +$GLOBALS['strtr_table'] = null; +function build_strtr_table(int $pad_target): array { + // Real reduction rules (longest-key-first is handled by strtr automatically: + // strtr prefers the longest matching key). + // strtr does a SINGLE left-to-right non-overlapping pass per call, preferring + // the longest matching key, and applies all rules SIMULTANEOUSLY (no re-scan of + // already-substituted output within the same call). So the rule set must be + // CONFLUENT under that semantics. We encode every value as the single non-terminal + // 'E' and collapse binary forms, which converges to 'E' for any valid expression: + // n -> E (atom) + // (E) -> E (parenthesised) + // E+E -> E (binary) + // E*E -> E (binary) + $table = [ + '(E)' => 'E', + 'E+E' => 'E', + 'E*E' => 'E', + 'n' => 'E', + ]; + // Pad with synthetic non-matching keys. Use a byte range that never appears + // in our token strings (uppercase hex of a counter prefixed with '#'). + $i = 0; + while (count($table) < $pad_target) { + $key = '#' . dechex($i); // '#0','#1',... never present in token input + $table[$key] = '~'; // arbitrary non-terminal-ish value, never used + $i++; + } + return $table; +} +function strtr_reduce(string $s): bool { + $table =& $GLOBALS['strtr_table']; + $guard = 0; + while (true) { + $next = strtr($s, $table); + if ($next === $s) break; // fixed point + $s = $next; + if (++$guard > 100000) break; + } + return $s === 'E'; +} + +// --------------------------------------------------------------------------- +// Representative toy input set (token strings). All are VALID expressions. 
// ---------------------------------------------------------------------------

/**
 * Token strings fed to every recognizer; each one is a valid expression.
 *
 * @return string[]
 */
function make_inputs(): array {
    return [
        'n',
        'n+n',
        'n*n',
        'n+n*n',
        '(n)',
        '(n+n)',
        '(n+n)*n',
        'n+(n*n)+n',
        '((n))',
        '(n+n)*(n+n)',
        'n+n+n+n+n',
        'n*n*n*n*n',
        '(n+n*n)+(n*n+n)',
        '((n+n)*(n+n))+n',
        'n+n*(n+n)*n+n',
    ];
}

// ---------------------------------------------------------------------------
// Sanity check: all four agree on the input set (and on a few invalids).
// ---------------------------------------------------------------------------

/**
 * Abort the script (exit code 1) if any recognizer rejects a valid input or
 * accepts an invalid one; otherwise report success on STDERR.
 */
function sanity(): void {
    $valids = make_inputs();
    $invalids = ['', 'n+', '+n', '(n', 'n)', 'n++n', '()', 'nn', '(n+)'];
    $recognizers = ['rd_parse', 'pcre_validate', 'prc_reduce', 'strtr_reduce'];
    foreach ($recognizers as $fn) {
        foreach ($valids as $v) {
            if ($fn($v) === true) {
                continue;
            }
            fwrite(STDERR, "SANITY FAIL: $fn rejected valid '$v'\n");
            exit(1);
        }
        foreach ($invalids as $iv) {
            if ($fn($iv) === false) {
                continue;
            }
            fwrite(STDERR, "SANITY FAIL: $fn accepted invalid '$iv'\n");
            exit(1);
        }
    }
    fwrite(STDERR, "sanity OK (all 4 agree on " . count($valids) . " valid + " . count($invalids) . " invalid)\n");
}

// ---------------------------------------------------------------------------
// Benchmark harness: QPS = total recognitions / elapsed, best-of-N runs.
// ---------------------------------------------------------------------------

/**
 * Time one run of $iters passes over $inputs.
 *
 * @param callable $fn     Recognizer taking a token string.
 * @param string[] $inputs Token strings to recognize on every pass.
 * @param int      $iters  Number of passes over $inputs.
 * @return float Recognitions per second for this run.
 */
function bench(callable $fn, array $inputs, int $iters): float {
    $started = hrtime(true);
    $accepted = 0;
    for ($pass = $iters; $pass > 0; $pass--) {
        foreach ($inputs as $input) {
            if ($fn($input)) {
                $accepted++;
            }
        }
    }
    $seconds = (hrtime(true) - $started) / 1e9;
    // Never true; it only keeps the accumulator (and thus the calls) observable.
    if ($accepted < 0) echo $accepted;
    return ($iters * count($inputs)) / $seconds;
}

// CLI argument 1: size of the padded strtr table (default 79000 entries).
$pad = (int) ($argv[1] ?? 79000);
$GLOBALS['strtr_table'] = build_strtr_table($pad);
fwrite(STDERR, "strtr table entries: " . count($GLOBALS['strtr_table']) .
"\n"); + +sanity(); + +$inputs = make_inputs(); + +$cfgs = [ + 'hand RD ' => ['fn' => 'rd_parse', 'iters' => 200000], + 'preg_match validate ' => ['fn' => 'pcre_validate', 'iters' => 200000], + 'preg_replace_callback ' => ['fn' => 'prc_reduce', 'iters' => 20000], + 'strtr iterate-to-stable' => ['fn' => 'strtr_reduce', 'iters' => 2000], +]; + +$N = (int) ($argv[2] ?? 7); +$warmup = 2; + +$results = []; +foreach ($cfgs as $label => $c) { + $fn = $c['fn']; $iters = $c['iters']; + for ($w = 0; $w < $warmup; $w++) bench($fn, $inputs, max(1, (int)($iters/4))); + $best = 0.0; + for ($r = 0; $r < $N; $r++) { + $qps = bench($fn, $inputs, $iters); + if ($qps > $best) $best = $qps; + } + $results[$label] = $best; +} + +echo "\n=== QPS (best-of-$N, warm) — strtr pad=$pad ===\n"; +$rd = $results['hand RD ']; +foreach ($results as $label => $qps) { + printf("%-25s %12s QPS (%.1fx vs hand RD)\n", + $label, number_format($qps, 0), $qps / $rd); +} +printf("\nstrtr is %.0fx slower than hand RD\n", $rd / $results['strtr iterate-to-stable']); From 90aa5dc27966b3813716a4b25c8d3df545303494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 18/23] Add native tree-builders reasoning json_decode/unserialize/DOMDocument: any SQL->JSON/XML transform that encodes nesting is itself the parse. Structurally circular. --- experiments/native-tree-builders/NOTES.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 experiments/native-tree-builders/NOTES.md diff --git a/experiments/native-tree-builders/NOTES.md b/experiments/native-tree-builders/NOTES.md new file mode 100644 index 000000000..cc3067c48 --- /dev/null +++ b/experiments/native-tree-builders/NOTES.md @@ -0,0 +1,15 @@ +# Native tree builders: json_decode / unserialize / DOMDocument (reasoning) + +**Status:** reasoning, not benchmarked. No code. + +**Idea:** PHP ships several very fast C-implemented tree parsers. 
Transform the SQL +token stream into a format one of them understands (JSON, PHP-serialize, XML), then +let the native function build the tree for free. + +**Why it fails:** any meaningful transform from SQL into JSON/serialize/XML must +encode the nesting structure — and computing that structure *is* parsing. SQL is +not mechanically pre-formattable into a nested JSON/XML shape without already +having parsed it. (The one shallow exception is a flat, non-recursive subset — e.g. +INSERT value lists — the same narrow-shape limit as the multi-shape fast parser.) + +**Verdict:** conceptually appealing, structurally circular. From 85476a75d6998e11f77214116f620f92e864ba87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 19/23] Add parle PECL extension proposal Native C++ LALR(1) (lexertl/parsertl), PHP 7.4+, PECL-only, non-serializable tables (per-cold-worker rebuild). Est. 3-10x where installable; not benchmarked. --- experiments/parle-extension/NOTES.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 experiments/parle-extension/NOTES.md diff --git a/experiments/parle-extension/NOTES.md b/experiments/parle-extension/NOTES.md new file mode 100644 index 000000000..e5026ae36 --- /dev/null +++ b/experiments/parle-extension/NOTES.md @@ -0,0 +1,16 @@ +# parle PECL extension (proposal) + +**Status:** evaluated, not installed/benchmarked. No code. The "3–10×" below is an +estimate, not a measurement. + +**Idea:** `parle` is a PECL extension wrapping Ben Hanson's C++ `lexertl`/`parsertl` +template libraries (LALR(1)). Push grammar rules at runtime, build the tables at +startup, then parse with semantic actions in native code. + +**Constraints (confirmed):** PHP 7.4+; requires a PECL install (absent on most +shared/managed hosting); the parser tables can't be serialized, so the +table-build cost is paid on every cold worker — significant for a grammar of this +complexity. 
+ +**Verdict:** a realistic native fast path (est. 3–10×) WHERE it can be installed, +but shared-hosting reality rules it out as a default. Not prototyped here. From ead2eb239e03fac9d1246e1459b009106965aa19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:02 +0200 Subject: [PATCH 20/23] Add survey of other PHP parser libraries PHP-PEG (packrat memo overhead), Hoa\Compiler (grammar interpreter), Phlexy (lexer-only). None likely to beat the optimized parser. --- experiments/other-php-parser-libs/NOTES.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 experiments/other-php-parser-libs/NOTES.md diff --git a/experiments/other-php-parser-libs/NOTES.md b/experiments/other-php-parser-libs/NOTES.md new file mode 100644 index 000000000..dfcfb9aa9 --- /dev/null +++ b/experiments/other-php-parser-libs/NOTES.md @@ -0,0 +1,13 @@ +# Other PHP parser libraries: PHP-PEG, Hoa\Compiler, Phlexy (literature) + +**Status:** literature/reasoning, not benchmarked. No code. + +**Assessment:** +- PEG/packrat parsers in PHP (e.g. PHP-PEG) carry memo-store overhead that makes + them slower than a tuned hand-written recursive descent. +- `Hoa\Compiler` is itself a `.pp`-grammar interpreter — the same architecture as + this parser — and slower in practice (it's LL(k), not packrat). +- `Phlexy` (nikic) is lexer-only, and the lexer isn't the bottleneck. + +**Verdict:** none of these is likely to beat the current optimized parser; not worth +adopting. From 071a8ca2b8bdefd65269194a4ef1670662497c83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:03 +0200 Subject: [PATCH 21/23] Add SQLite-as-parser proposal SQLite exposes no parse tree; EXPLAIN QUERY PLAN is an execution plan, not an AST. At most a syntactic accept/reject classifier. 
--- experiments/sqlite-as-parser/NOTES.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 experiments/sqlite-as-parser/NOTES.md diff --git a/experiments/sqlite-as-parser/NOTES.md b/experiments/sqlite-as-parser/NOTES.md new file mode 100644 index 000000000..fea2cef01 --- /dev/null +++ b/experiments/sqlite-as-parser/NOTES.md @@ -0,0 +1,17 @@ +# SQLite as the parser (proposal) + +**Status:** proposal, not implemented. No code. + +**Idea:** SQLite is already a dependency; for DML that overlaps SQLite syntax, +lean on SQLite's own (fast, native) parser to classify or pre-parse queries, only +falling back to the full MySQL parser for the rest. + +**Caveat (important):** SQLite does NOT expose a parse tree/AST. `EXPLAIN QUERY PLAN` +returns an *execution plan* (table scans, index usage, sort order — post-optimization), +not a syntactic structure, and its output is documented as unstable. The most +SQLite can give cheaply is syntactic accept/reject via `prepare()` — a coarse +classifier with no structure to translate, and syntactic acceptance ≠ MySQL +semantics anyway. + +**Verdict:** at best a yes/no gate for trivial queries; can't supply an AST. Maybe a +feasibility spike for the proxy. Not prototyped. From 60d6035165aea33ebbf0cdfe9eec559a6f8e257c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Sat, 6 Jun 2026 17:49:03 +0200 Subject: [PATCH 22/23] Add AST cache keyed by parameterized template Cache the AST on a parameterized token-stream signature. ~2-2.4x on repeat-heavy workloads, net loss on unique queries. Reference artifacts from local branch ast-cache. 
--- experiments/ast-cache/NOTES.md | 30 ++ .../class-wp-mysql-parser-ast-cache.php | 366 +++++++++++++ .../ast-cache/profile-ast-cache-signature.php | 140 +++++ .../ast-cache/profile-cache-breakdown.php | 235 ++++++++ .../ast-cache/run-ast-cache-benchmark.php | 507 ++++++++++++++++++ 5 files changed, 1278 insertions(+) create mode 100644 experiments/ast-cache/NOTES.md create mode 100644 experiments/ast-cache/class-wp-mysql-parser-ast-cache.php create mode 100644 experiments/ast-cache/profile-ast-cache-signature.php create mode 100644 experiments/ast-cache/profile-cache-breakdown.php create mode 100644 experiments/ast-cache/run-ast-cache-benchmark.php diff --git a/experiments/ast-cache/NOTES.md b/experiments/ast-cache/NOTES.md new file mode 100644 index 000000000..f00bf5eca --- /dev/null +++ b/experiments/ast-cache/NOTES.md @@ -0,0 +1,30 @@ +# AST cache keyed by parameterized template + +**Origin:** local branch `ast-cache` (fully implemented: cache + driver wiring + +unit/equivalence tests + benchmark). No PR. The files here are copied from that +branch as reference artifacts — they run on the `ast-cache` branch, where the cache +is wired into `WP_MySQL_Parser`. + +**Idea:** cache the AST on a parameterized token-stream signature so e.g. +`WHERE id = 5` and `WHERE id = 8` share one entry; serve the cached AST on a hit +instead of re-parsing. Not a parser change — a layer above it. + +**Run (on the `ast-cache` branch):** +``` +php -d ...jit... run-ast-cache-benchmark.php --scenarios=hit,corpus,wp --rounds=2 --iters=5 --sleep=0 +``` + +**Result (ABAB cache-off vs cache-on, warm JIT):** + +| scenario | off | on | hit rate | speedup | +|------------------------|--------|-------|----------|---------| +| hit (100% repeats) | ~190K | ~374K | 1.00 | 1.96× | +| WordPress-like repeats | ~128K | ~303K | 1.00 | 2.36× | +| mostly-unique corpus | ~57K | ~47K | 0.27 | 0.84× | + +Memory: ~1.87 MB at a 200-entry cap. + +**Verdict:** ~2–2.4× on repeat-heavy workloads (e.g. 
WordPress's parameterized +queries), orthogonal to and stackable with the parser optimizations — but a net +LOSS (~0.84×) on unique-query streams, because computing the signature costs more +than the parse it avoids. Worth shipping only gated on observed query repetition. diff --git a/experiments/ast-cache/class-wp-mysql-parser-ast-cache.php b/experiments/ast-cache/class-wp-mysql-parser-ast-cache.php new file mode 100644 index 000000000..80f2ea04d --- /dev/null +++ b/experiments/ast-cache/class-wp-mysql-parser-ast-cache.php @@ -0,0 +1,366 @@ + + */ + private $entries = array(); + + /** + * Number of cache hits. + * + * @var int + */ + private $hits = 0; + + /** + * Number of cache misses. + * + * @var int + */ + private $misses = 0; + + /** + * Number of LRU evictions performed. + * + * @var int + */ + private $evictions = 0; + + /** + * Constructor. + * + * @param string $grammar_version An opaque marker that distinguishes the + * grammar version. Including a different + * value invalidates all entries. + * @param int $capacity Maximum number of entries (default 200). + */ + public function __construct( string $grammar_version, int $capacity = self::DEFAULT_CAPACITY ) { + $this->grammar_version = $grammar_version; + $this->capacity = $capacity > 0 ? $capacity : self::DEFAULT_CAPACITY; + } + + /** + * Get the cache capacity. + * + * @return int + */ + public function get_capacity(): int { + return $this->capacity; + } + + /** + * Update the cache capacity. If the new cap is smaller than the current + * size, oldest entries are evicted until size matches. + * + * @param int $capacity New capacity. Must be greater than 0. 
+ */ + public function set_capacity( int $capacity ): void { + if ( $capacity <= 0 ) { + $capacity = self::DEFAULT_CAPACITY; + } + $this->capacity = $capacity; + while ( count( $this->entries ) > $this->capacity ) { + // PHP 7.2-compatible "drop the oldest entry": foreach yields the + // least-recently-used key first because PHP arrays preserve + // insertion order. + foreach ( $this->entries as $first_key => $_unused ) { + unset( $this->entries[ $first_key ] ); + break; + } + ++$this->evictions; + } + } + + /** + * Drop all entries and reset counters. + */ + public function clear(): void { + $this->entries = array(); + $this->hits = 0; + $this->misses = 0; + $this->evictions = 0; + } + + /** + * Get current cache statistics. + * + * @return array{entries:int,capacity:int,hits:int,misses:int,evictions:int} + */ + public function get_stats(): array { + return array( + 'entries' => count( $this->entries ), + 'capacity' => $this->capacity, + 'hits' => $this->hits, + 'misses' => $this->misses, + 'evictions' => $this->evictions, + ); + } + + /** + * Compute a cache key for a slice of the token stream. + * + * The signature emits the token id for every token, plus the raw bytes + * for identifier-like tokens. Literal token bytes are intentionally + * elided so that two queries that differ only in literal values produce + * the same key. + * + * @param WP_Parser_Token[] $tokens The full token stream. + * @param int $start Inclusive start index. + * @param int $end Exclusive end index. + * @return string A binary cache key (grammar version + 20-byte sha1). + */ + public function compute_signature( array $tokens, int $start, int $end ): string { + // PHP arrays use binary-safe string keys natively, so we use the + // raw buffer directly as the cache key instead of paying a hash + // pass like sha1. The grammar version is prefixed verbatim. 
+ $buffer = $this->grammar_version; + for ( $i = $start; $i < $end; ++$i ) { + $token = $tokens[ $i ]; + $id = $token->id; + $buffer .= pack( 'N', $id ); + // Fast path: most tokens are keywords/operators outside the + // identifier/literal range, so we just emit the id. + if ( $id < self::PARAMETERIC_RANGE_START || $id > self::PARAMETERIC_RANGE_END ) { + continue; + } + // Literal: id only, no bytes (so 5 vs 8 etc. share a key). + if ( $id >= self::LITERAL_RANGE_START && $id <= self::LITERAL_RANGE_END ) { + continue; + } + // Identifier-like: id + bytes (so `FROM a` and `FROM b` differ). + $bytes = $token->get_bytes(); + $buffer .= pack( 'N', strlen( $bytes ) ) . $bytes; + } + return $buffer; + } + + /** + * Look up a cached AST by token slice. + * + * Returns `null` on miss. On hit, returns a freshly-cloned AST whose + * token leaves point to the entries of `$tokens` at the corresponding + * positions (so callers reading `$token->start` see the current query), + * along with the number of tokens consumed. + * + * @param WP_Parser_Token[] $tokens The full token stream. + * @param int $start Inclusive start index. + * @param int $end Exclusive end index. + * @return array{0:WP_Parser_Node,1:int}|null [$ast, $consumed] or null. + */ + public function lookup( array $tokens, int $start, int $end ): ?array { + $key = $this->compute_signature( $tokens, $start, $end ); + return $this->lookup_by_key( $key, $tokens, $start ); + } + + /** + * Look up a cached AST by precomputed key. + * + * Lets callers compute the signature once and reuse it for the + * subsequent {@see store_by_key()} call on a miss. This halves + * signature work on the parser's hot path. + * + * @param string $key Cache key from compute_signature(). + * @param WP_Parser_Token[] $tokens The full token stream. + * @param int $start Inclusive start index. + * @return array{0:WP_Parser_Node,1:int}|null [$ast, $consumed] or null. 
+ */ + public function lookup_by_key( string $key, array $tokens, int $start ): ?array { + if ( ! isset( $this->entries[ $key ] ) ) { + ++$this->misses; + return null; + } + + ++$this->hits; + $entry = $this->entries[ $key ]; + // LRU: re-insert at the end so this becomes the most recently used. + unset( $this->entries[ $key ] ); + $this->entries[ $key ] = $entry; + + $ast = $this->rebuild_ast_from_template( $entry[0], $entry[1], $entry[2], $tokens, $start ); + return array( $ast, $entry[3] ); + } + + /** + * Store a successful parse in the cache. + * + * The signature is computed over `[$start, $start + $consumed)` (the + * exact tokens that produced the AST). The AST is stored by reference; + * callers must not mutate it after handing it to the cache. + * + * @param WP_Parser_Token[] $tokens The full token stream. + * @param int $start Inclusive start index. + * @param int $consumed Number of tokens consumed by the parse. + * @param WP_Parser_Node $ast The parsed AST. + */ + public function store( array $tokens, int $start, int $consumed, WP_Parser_Node $ast ): void { + $key = $this->compute_signature( $tokens, $start, $start + $consumed ); + $this->store_by_key( $key, $consumed, $ast ); + } + + /** + * Store a successful parse using a precomputed key. + * + * The AST is flattened into a post-order op stream so the hit path can + * rebuild it with a single linear loop and an explicit stack -- no + * recursion, no per-node method calls, no instanceof checks. + * + * @param string $key Cache key from compute_signature(). + * @param int $consumed Number of tokens consumed by the parse. + * @param WP_Parser_Node $ast The parsed AST. + */ + public function store_by_key( string $key, int $consumed, WP_Parser_Node $ast ): void { + // If the key already exists, dropping it first ensures the + // re-insertion below puts it back at the most-recently-used end. 
+ if ( isset( $this->entries[ $key ] ) ) { + unset( $this->entries[ $key ] ); + } + + $rule_ids = array(); + $rule_names = array(); + $child_counts = array(); + $this->flatten_ast_post_order( $ast, $rule_ids, $rule_names, $child_counts ); + $this->entries[ $key ] = array( $rule_ids, $rule_names, $child_counts, $consumed ); + + if ( count( $this->entries ) > $this->capacity ) { + // PHP 7.2-compatible "drop the oldest entry": foreach yields the + // least-recently-used key first because PHP arrays preserve + // insertion order. + foreach ( $this->entries as $first_key => $_unused ) { + unset( $this->entries[ $first_key ] ); + break; + } + ++$this->evictions; + } + } + + /** + * Walk the AST in post-order, emitting one op per node and one op + * per token leaf into parallel flat arrays. + * + * Tokens are encoded as a sentinel rule id of 0 (which never matches + * a real grammar rule). Nodes carry their rule id, rule name, and the + * number of immediate children, so the replay loop can pop the right + * number of items from its stack. + * + * @param WP_Parser_Node|WP_Parser_Token $node + * @param int[] $rule_ids + * @param string[] $rule_names + * @param int[] $child_counts + */ + private function flatten_ast_post_order( $node, array &$rule_ids, array &$rule_names, array &$child_counts ): void { + if ( $node instanceof WP_Parser_Token ) { + $rule_ids[] = 0; + $rule_names[] = ''; + $child_counts[] = 0; + return; + } + $children = $node->get_children(); + $count = count( $children ); + for ( $i = 0; $i < $count; ++$i ) { + $this->flatten_ast_post_order( $children[ $i ], $rule_ids, $rule_names, $child_counts ); + } + $rule_ids[] = $node->rule_id; + $rule_names[] = $node->rule_name; + $child_counts[] = $count; + } + + /** + * Replay a flattened cache entry, binding token leaves to entries from + * the current token stream. + * + * Single linear loop, one explicit stack. No recursion, no + * `get_children()` method calls, no `instanceof` per child. 
+ * + * @param array $rule_ids + * @param array $rule_names + * @param array $child_counts + * @param WP_Parser_Token[] $tokens Current query's full token stream. + * @param int $start Inclusive start index in $tokens. + * @return WP_Parser_Node + */ + private function rebuild_ast_from_template( array $rule_ids, array $rule_names, array $child_counts, array $tokens, int $start ): WP_Parser_Node { + $op_count = count( $rule_ids ); + $stack = array(); + $top = -1; + $token_idx = 0; + for ( $i = 0; $i < $op_count; ++$i ) { + $rule_id = $rule_ids[ $i ]; + if ( 0 === $rule_id ) { + $stack[ ++$top ] = $tokens[ $start + $token_idx ]; + ++$token_idx; + continue; + } + $count = $child_counts[ $i ]; + if ( 0 === $count ) { + // Leaf node with no children. Should not happen because the + // parser returns `true` for empty children, never a Node. + $stack[ ++$top ] = new WP_Parser_Node( $rule_id, $rule_names[ $i ], array() ); + continue; + } + // One C-level call to gather the last $count children into a + // fresh array, then drop them from the stack. + $children = array_slice( $stack, $top - $count + 1, $count ); + $top -= $count; + $stack[ ++$top ] = new WP_Parser_Node( $rule_id, $rule_names[ $i ], $children ); + } + return $stack[0]; + } +} diff --git a/experiments/ast-cache/profile-ast-cache-signature.php b/experiments/ast-cache/profile-ast-cache-signature.php new file mode 100644 index 000000000..71278e74b --- /dev/null +++ b/experiments/ast-cache/profile-ast-cache-signature.php @@ -0,0 +1,140 @@ += $query_count ) { + break; + } +} +fclose( $handle ); + +// Pre-tokenize. +$tokenized = array(); +foreach ( $queries as $q ) { + $lexer = new WP_MySQL_Lexer( $q ); + $tokenized[] = $lexer->remaining_tokens(); +} + +$cache = new WP_MySQL_Parser_Ast_Cache( $grammar_version, max( 1, count( $queries ) ) ); + +// Pre-fill cache so all queries are hits. 
+foreach ( $tokenized as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache ); + $parser->next_query(); +} + +function bench( callable $work, int $iters, int $count ): array { + $samples = array(); + for ( $i = 0; $i < $iters; ++$i ) { + $start = microtime( true ); + $work(); + $elapsed = microtime( true ) - $start; + $samples[] = $count / $elapsed; + } + sort( $samples ); + return $samples; +} + +// Bench: parse only (no cache). +$parse_qps = bench( + function () use ( $tokenized, $grammar ) { + foreach ( $tokenized as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + $parser->next_query(); + } + }, + $iters, + count( $tokenized ) +); + +// Bench: signature only. +$sig_qps = bench( + function () use ( $tokenized, $cache ) { + $count = count( $tokenized ); + for ( $i = 0; $i < $count; ++$i ) { + $tokens = $tokenized[ $i ]; + $cache->compute_signature( $tokens, 0, count( $tokens ) ); + } + }, + $iters, + count( $tokenized ) +); + +// Bench: full cache hit path (signature + lookup_by_key + clone). +$hit_qps = bench( + function () use ( $tokenized, $grammar, $cache ) { + foreach ( $tokenized as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache ); + $parser->next_query(); + } + }, + $iters, + count( $tokenized ) +); + +function report( string $label, array $qps, int $count ): void { + $best = max( $qps ); + $median = $qps[ (int) ( count( $qps ) / 2 ) ]; + $us = $best > 0 ? 1e6 / $best : 0; + printf( + " %-22s best %8.0f QPS median %8.0f QPS (%.2f us/query)\n", + $label, + $best, + $median, + $us + ); +} + +$jit = function_exists( 'opcache_get_status' ) && ( opcache_get_status( false )['jit']['on'] ?? false ); +echo 'PHP ' . PHP_VERSION . ' JIT=' . ( $jit ? 'on' : 'off' ) . ' queries=' . count( $tokenized ) . 
" iters=$iters\n\n"; + +report( 'parse only', $parse_qps, count( $tokenized ) ); +report( 'signature only', $sig_qps, count( $tokenized ) ); +report( 'cache hit path', $hit_qps, count( $tokenized ) ); + +$parse_us = 1e6 / max( $parse_qps ); +$sig_us = 1e6 / max( $sig_qps ); +$hit_us = 1e6 / max( $hit_qps ); +$savings = $parse_us - $hit_us; +printf( + "\n signature is %.1f%% of parse cost\n hit path saves %.2f us/query (%.1f%% of parse)\n", + 100 * $sig_us / $parse_us, + $savings, + 100 * $savings / $parse_us +); diff --git a/experiments/ast-cache/profile-cache-breakdown.php b/experiments/ast-cache/profile-cache-breakdown.php new file mode 100644 index 000000000..4af7b1e6a --- /dev/null +++ b/experiments/ast-cache/profile-cache-breakdown.php @@ -0,0 +1,235 @@ += $query_count ) { + break; + } +} +fclose( $handle ); + +$tokenized = array(); +foreach ( $queries as $q ) { + $lexer = new WP_MySQL_Lexer( $q ); + $tokenized[] = $lexer->remaining_tokens(); +} + +$cache = new WP_MySQL_Parser_Ast_Cache( $grammar_version, max( 1, count( $queries ) ) ); + +// Pre-fill cache and pre-parse to obtain ASTs for the clone bench. +$asts = array(); +$signatures = array(); +foreach ( $tokenized as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache ); + $parser->next_query(); + $ast = $parser->get_query_ast(); + $asts[] = $ast; + $signatures[] = $cache->compute_signature( $tokens, 0, count( $tokens ) ); +} + +// Helper: time a closure for $count operations, return min/median/best QPS samples. +function bench( callable $work, int $iters, int $count ): array { + $samples = array(); + for ( $i = 0; $i < $iters; ++$i ) { + $start = microtime( true ); + $work(); + $elapsed = microtime( true ) - $start; + $samples[] = $count / $elapsed; + } + sort( $samples ); + return $samples; +} + +function us_per( array $qps ): float { + $best = max( $qps ); + return $best > 0 ? 1e6 / $best : 0; +} + +// 1. Parse only. 
+$parse_qps = bench( + function () use ( $tokenized, $grammar ) { + foreach ( $tokenized as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + $parser->next_query(); + } + }, + $iters, + count( $tokenized ) +); + +// 2. Signature only. +$sig_qps = bench( + function () use ( $tokenized, $cache ) { + $count = count( $tokenized ); + for ( $i = 0; $i < $count; ++$i ) { + $tokens = $tokenized[ $i ]; + $cache->compute_signature( $tokens, 0, count( $tokens ) ); + } + }, + $iters, + count( $tokenized ) +); + +// 3. sha1 of typical buffer alone. +$buf = str_repeat( "\x00\x01\x02\x03", 32 ); // 128 bytes representative +$sha1_qps = bench( + function () use ( $buf ) { + for ( $i = 0; $i < 5000; ++$i ) { + sha1( $buf, true ); + } + }, + $iters, + 5000 +); + +// 4. Lookup only (precomputed key, full hit path WITHOUT clone). We mock by +// using a closure that does the isset+LRU bookkeeping but skips clone. +class BenchCacheNoClone { + public $entries; + public $hits = 0; + + public function __construct( array $entries ) { + $this->entries = $entries; + } + + public function lookup_no_clone( string $key ): bool { + if ( ! isset( $this->entries[ $key ] ) ) { + return false; + } + ++$this->hits; + $entry = $this->entries[ $key ]; + unset( $this->entries[ $key ] ); + $this->entries[ $key ] = $entry; + return true; + } +} + +$bench_entries = array(); +foreach ( $signatures as $i => $key ) { + $bench_entries[ $key ] = array( $asts[ $i ], 0 ); +} +$bench_no_clone = new BenchCacheNoClone( $bench_entries ); + +$lookup_qps = bench( + function () use ( $signatures, $bench_no_clone ) { + foreach ( $signatures as $key ) { + $bench_no_clone->lookup_no_clone( $key ); + } + }, + $iters, + count( $signatures ) +); + +// 5. Clone only (using cache lookup_by_key which clones). 
+// Keys were computed during the pre-fill, so no signature work is timed here.
+$clone_qps = bench(
+    function () use ( $signatures, $tokenized, $cache ) {
+        $n = count( $signatures );
+        for ( $i = 0; $i < $n; ++$i ) {
+            $cache->lookup_by_key( $signatures[ $i ], $tokenized[ $i ], 0 );
+        }
+    },
+    $iters,
+    count( $signatures )
+);
+
+// 6. Full hit path via parser.
+// Same loop as the parse-only bench, but the parser is handed the pre-filled cache.
+$hit_qps = bench(
+    function () use ( $tokenized, $grammar, $cache ) {
+        foreach ( $tokenized as $tokens ) {
+            $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache );
+            $parser->next_query();
+        }
+    },
+    $iters,
+    count( $tokenized )
+);
+
+// 7. Per-query "build a fresh AST that's just the cached one with current tokens"
+// by walking and mutating in-place (UNSAFE - measures upper bound for an
+// in-place implementation that skips cloning).
+//
+// As written the walk only counts: it bumps $index for every WP_Parser_Token
+// child and recurses into every other child. No node is modified, and $tokens,
+// $start and the loop key $i are accepted but never read.
+// NOTE(review): $node is dereferenced unconditionally; if get_query_ast() can
+// return null for a query that failed to parse, this would error -- confirm
+// the corpus only contains parseable queries.
+function walk_tokens_in_place( $node, array $tokens, int $start, int &$index ): void {
+    foreach ( $node->get_children() as $i => $child ) {
+        if ( $child instanceof WP_Parser_Token ) {
+            ++$index;
+        } else {
+            walk_tokens_in_place( $child, $tokens, $start, $index );
+        }
+    }
+}
+
+$walk_qps = bench(
+    function () use ( $asts, $tokenized ) {
+        $n = count( $asts );
+        for ( $i = 0; $i < $n; ++$i ) {
+            $idx = 0;
+            walk_tokens_in_place( $asts[ $i ], $tokenized[ $i ], 0, $idx );
+        }
+    },
+    $iters,
+    count( $asts )
+);
+
+// JIT is reported as on only when opcache is loaded and its status says so.
+$jit = function_exists( 'opcache_get_status' ) && ( opcache_get_status( false )['jit']['on'] ?? false );
+echo 'PHP ' . PHP_VERSION . ' JIT=' . ( $jit ? 'on' : 'off' ) . ' queries=' . count( $tokenized ) . " iters=$iters\n\n";
+
+// One row per phase: best (maximum) QPS sample and the matching microseconds per operation.
+// Note: the label 'clone only (lookup_by_key)' is 26 characters, wider than the
+// %-22s column, so that row is not aligned with the others.
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'parse only', max( $parse_qps ), us_per( $parse_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'signature only', max( $sig_qps ), us_per( $sig_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/buf)\n", 'sha1 128 bytes', max( $sha1_qps ), us_per( $sha1_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'lookup-only (no clone)', max( $lookup_qps ), us_per( $lookup_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'clone only (lookup_by_key)', max( $clone_qps ), us_per( $clone_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'full hit path', max( $hit_qps ), us_per( $hit_qps ) );
+printf( " %-22s best %8.0f QPS (%5.2f us/query)\n", 'walk-only (no alloc)', max( $walk_qps ), us_per( $walk_qps ) );
+
+// Sub-phase costs (rough decomposition).
+$parse_us  = us_per( $parse_qps );
+$sig_us    = us_per( $sig_qps );
+$lookup_us = us_per( $lookup_qps );
+$clone_us  = us_per( $clone_qps ); // includes signature recompute?
no, clone_only used precomputed key
+$walk_us   = us_per( $walk_qps );
+$hit_us    = us_per( $hit_qps );
+
+printf( "\n signature cost: %.2f us/query\n", $sig_us );
+printf( " lookup-bookkeeping: %.2f us/query\n", $lookup_us );
+printf( " clone (lookup_by_key) total: %.2f us/query\n", $clone_us );
+printf( " walk-only (no allocations): %.2f us/query (lower bound for clone walk)\n", $walk_us );
+printf( " full hit path: %.2f us/query\n", $hit_us );
+printf( " parse cost: %.2f us/query\n", $parse_us );
+// us_per() yields 0 when it has no positive sample; dividing by that would
+// throw DivisionByZeroError, so print 0.00x in that case.
+printf( " speedup ceiling at 100%% hit: %.2fx\n", $hit_us > 0 ? $parse_us / $hit_us : 0.0 );
diff --git a/experiments/ast-cache/run-ast-cache-benchmark.php b/experiments/ast-cache/run-ast-cache-benchmark.php
new file mode 100644
index 000000000..82d7ec4fa
--- /dev/null
+++ b/experiments/ast-cache/run-ast-cache-benchmark.php
@@ -0,0 +1,507 @@
+= $limit ) {
+            break;
+        }
+    }
+    fclose( $handle );
+    return $out;
+}
+
+/**
+ * Build $count SELECT statements that each name a different table and column.
+ *
+ * @param int $count Number of queries to generate.
+ * @return string[] Generated SQL strings.
+ */
+function build_unique_miss_queries( int $count ): array {
+    // Each query has a different identifier so signatures differ; cache
+    // will only ever miss. Tests pure overhead of the cache check on the
+    // hot path.
+    $out = array();
+    for ( $i = 0; $i < $count; ++$i ) {
+        $out[] = 'SELECT * FROM table_' . $i . ' WHERE col_' . $i . ' = 1';
+    }
+    return $out;
+}
+
+/**
+ * Return the single query used by the always-hit scenario.
+ *
+ * @return string[] One-element list; the caller replicates it.
+ */
+function build_single_hit_query(): array {
+    // One query repeated -- cache always hits after the first miss.
+    return array( 'SELECT * FROM users WHERE id = 1' );
+}
+
+/**
+ * Build a shuffled, WordPress-shaped query stream.
+ *
+ * 160 templates (40 option reads, 25 post reads, 50 postmeta reads, 40
+ * user/term/taxonomy/comment reads, 5 writes and aggregates) are repeated
+ * $repeats times and shuffled once.
+ *
+ * @param int $repeats How many copies of every template to emit.
+ * @return string[] SQL strings in shuffled order.
+ */
+function build_wp_workload( int $repeats ): array {
+    // Mirror the shape of a WordPress page load: heavy wp_options reads
+    // with varying option_name, post lookups by ID, postmeta by post_id +
+    // meta_key, plus a sprinkle of writes.
+    $option_names = array(
+        'siteurl',
+        'home',
+        'blogname',
+        'blogdescription',
+        'admin_email',
+        'start_of_week',
+        'use_balanceTags',
+        'use_smilies',
+        'require_name_email',
+        'comments_notify',
+        'posts_per_rss',
+        'rss_use_excerpt',
+        'mailserver_url',
+        'mailserver_login',
+        'default_category',
+        'default_comment_status',
+        'default_ping_status',
+        'default_pingback_flag',
+        'permalink_structure',
+        'gmt_offset',
+        'default_email_category',
+        'recently_edited',
+        'template',
+        'stylesheet',
+        'comment_registration',
+        'html_type',
+        'use_trackback',
+        'default_role',
+        'db_version',
+        'uploads_use_yearmonth_folders',
+        'upload_path',
+        'blog_public',
+        'default_link_category',
+        'show_on_front',
+        'tag_base',
+        'show_avatars',
+        'avatar_rating',
+        'upload_url_path',
+        'thumbnail_size_w',
+        'thumbnail_size_h',
+    );
+    $meta_keys    = array(
+        '_edit_last',
+        '_edit_lock',
+        '_thumbnail_id',
+        '_wp_page_template',
+        '_wp_attached_file',
+        '_wp_attachment_metadata',
+        '_menu_item_type',
+        '_menu_item_object',
+        '_menu_item_object_id',
+        '_wp_old_slug',
+    );
+
+    $templates = array();
+    foreach ( $option_names as $name ) {
+        $templates[] = "SELECT option_value FROM wp_options WHERE option_name = '$name' LIMIT 1";
+    }
+    for ( $id = 1; $id <= 25; ++$id ) {
+        $templates[] = "SELECT * FROM wp_posts WHERE ID = $id";
+    }
+    foreach ( $meta_keys as $mk ) {
+        for ( $pid = 1; $pid <= 5; ++$pid ) {
+            $templates[] = "SELECT meta_value FROM wp_postmeta WHERE post_id = $pid AND meta_key = '$mk' LIMIT 1";
+        }
+    }
+    for ( $id = 1; $id <= 10; ++$id ) {
+        $templates[] = "SELECT * FROM wp_users WHERE ID = $id";
+        $templates[] = "SELECT * FROM wp_terms WHERE term_id = $id";
+        $templates[] = "SELECT * FROM wp_term_taxonomy WHERE term_taxonomy_id = $id";
+        $templates[] = "SELECT * FROM wp_comments WHERE comment_post_ID = $id AND comment_approved = '1'";
+    }
+    $templates[] = "INSERT INTO wp_options (option_name, option_value, autoload) VALUES ('cron', 'a:0:{}', 'yes')";
+    $templates[] = "UPDATE wp_options SET option_value = 'a:1:{}' WHERE option_name = 'cron'";
+    $templates[] = "DELETE FROM wp_options WHERE option_name = 'cron'";
+    $templates[] = "SELECT COUNT(*) FROM wp_posts WHERE post_status = 'publish' AND post_type = 'post'";
+    $templates[] = "SELECT * FROM wp_posts WHERE post_status = 'publish' ORDER BY post_date DESC LIMIT 10";
+
+    // Repeat the templates several times to amortise warmup; shuffle once
+    // so cached/miss patterns interleave.
+    $out = array();
+    for ( $r = 0; $r < $repeats; ++$r ) {
+        foreach ( $templates as $t ) {
+            $out[] = $t;
+        }
+    }
+    // Fixed seed so the shuffled order is meant to be the same on every run.
+    mt_srand( 42 );
+    shuffle( $out );
+    return $out;
+}
+
+// --- Bench primitive ---
+/**
+ * Lex every query once so timed passes measure parsing only.
+ *
+ * @param string[] $queries SQL strings.
+ * @return array[] One token list per query, in input order.
+ */
+function pretokenize( array $queries ): array {
+    $tokenized = array();
+    foreach ( $queries as $q ) {
+        $lexer       = new WP_MySQL_Lexer( $q );
+        $tokens      = $lexer->remaining_tokens();
+        $tokenized[] = $tokens;
+    }
+    return $tokenized;
+}
+
+/**
+ * Run one timed pass over the pre-tokenized workload.
+ *
+ * The cache statistics are snapshotted outside the timed region so that the
+ * cache-on pass is not charged for bookkeeping the cache-off pass never does.
+ *
+ * @param array[]                        $tokenized Token lists from pretokenize().
+ * @param WP_Parser_Grammar              $grammar   Grammar handed to every parser.
+ * @param WP_MySQL_Parser_Ast_Cache|null $cache     Cache to use, or null for cache-off.
+ *
+ * Returns [qps, elapsed_seconds, success_count, hit_rate].
+ */
+function bench_pass( array $tokenized, WP_Parser_Grammar $grammar, ?WP_MySQL_Parser_Ast_Cache $cache ): array {
+    $success      = 0;
+    $stats_before = null !== $cache ? $cache->get_stats() : null;
+
+    $start = microtime( true );
+    foreach ( $tokenized as $tokens ) {
+        $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache );
+        $parser->next_query();
+        if ( null !== $parser->get_query_ast() ) {
+            ++$success;
+        }
+    }
+    $elapsed = microtime( true ) - $start;
+    $qps     = $elapsed > 0 ? count( $tokenized ) / $elapsed : 0.0;
+
+    // Hit rate covers only this pass: difference between the two snapshots.
+    $hit_rate = 0.0;
+    if ( null !== $cache ) {
+        $stats_after  = $cache->get_stats();
+        $delta_hits   = $stats_after['hits'] - $stats_before['hits'];
+        $delta_misses = $stats_after['misses'] - $stats_before['misses'];
+        $total        = $delta_hits + $delta_misses;
+        $hit_rate     = $total > 0 ? $delta_hits / $total : 0.0;
+    }
+    return array( $qps, $elapsed, $success, $hit_rate );
+}
+
+/**
+ * Run one scenario in ABAB order.
+ *
+ * Every round runs a cache-off pass group (A) and then a cache-on pass group
+ * (B) with a fresh cache; each group is $warmup untimed passes followed by
+ * $iters timed passes. When $csv_path is not null, one CSV row is appended
+ * per timed pass.
+ *
+ * Returns array with keys 'off' and 'on', each holding raw QPS samples
+ * collected across all rounds, plus 'hit_rate' holding the hit rate of the
+ * last timed cache-on pass.
+ */
+function run_abab(
+    array $tokenized,
+    WP_Parser_Grammar $grammar,
+    int $cap,
+    int $rounds,
+    int $iters,
+    int $warmup,
+    int $sleep_between,
+    string $scenario,
+    string $php_version,
+    $csv_path,
+    $grammar_version
+): array {
+    $samples_off      = array();
+    $samples_on       = array();
+    $last_hit_rate_on = 0.0;
+
+    for ( $r = 1; $r <= $rounds; ++$r ) {
+        // --- A pass: cache OFF ---
+        fwrite( STDERR, "[$scenario] round $r A (cache=off)" );
+        for ( $w = 0; $w < $warmup; ++$w ) {
+            bench_pass( $tokenized, $grammar, null );
+        }
+        for ( $i = 0; $i < $iters; ++$i ) {
+            // bench_pass() reports elapsed time in seconds; the CSV column is milliseconds.
+            list( $qps, $elapsed ) = bench_pass( $tokenized, $grammar, null );
+            $samples_off[]         = $qps;
+            fwrite( STDERR, sprintf( ' %.0f', $qps ) );
+            if ( null !== $csv_path ) {
+                file_put_contents(
+                    $csv_path,
+                    "$scenario,$php_version,off,0,$r," . ( $i + 1 ) . ",$qps," . ( $elapsed * 1000 ) . ",0\n",
+                    FILE_APPEND
+                );
+            }
+        }
+        fwrite( STDERR, "\n" );
+
+        if ( $sleep_between > 0 ) {
+            fwrite( STDERR, " sleeping $sleep_between s ...\n" );
+            sleep( $sleep_between );
+        }
+
+        // --- B pass: cache ON ---
+        // Fresh cache per round so memory and warm-up state are
+        // comparable across rounds.
+        $cache = new WP_MySQL_Parser_Ast_Cache( $grammar_version, $cap );
+        fwrite( STDERR, "[$scenario] round $r B (cache=on,cap=$cap)" );
+        for ( $w = 0; $w < $warmup; ++$w ) {
+            bench_pass( $tokenized, $grammar, $cache );
+        }
+        for ( $i = 0; $i < $iters; ++$i ) {
+            // The success count (third element) is not used here.
+            list( $qps, $elapsed, , $hit_rate ) = bench_pass( $tokenized, $grammar, $cache );
+            $samples_on[]                       = $qps;
+            $last_hit_rate_on                   = $hit_rate;
+            fwrite( STDERR, sprintf( ' %.0f(hr=%.2f)', $qps, $hit_rate ) );
+            if ( null !== $csv_path ) {
+                file_put_contents(
+                    $csv_path,
+                    "$scenario,$php_version,on,$cap,$r," . ( $i + 1 ) . ",$qps," . ( $elapsed * 1000 ) . ",$hit_rate\n",
+                    FILE_APPEND
+                );
+            }
+        }
+        fwrite( STDERR, "\n" );
+
+        // No trailing sleep after the final round.
+        if ( $r < $rounds && $sleep_between > 0 ) {
+            fwrite( STDERR, " sleeping $sleep_between s ...\n" );
+            sleep( $sleep_between );
+        }
+    }
+
+    return array(
+        'off'      => $samples_off,
+        'on'       => $samples_on,
+        'hit_rate' => $last_hit_rate_on,
+    );
+}
+
+/**
+ * Reduce raw samples to best (maximum), median and mean.
+ *
+ * @param float[] $samples Raw QPS samples; may be empty.
+ * @return array Keys 'best', 'median', 'mean'; all 0.0 for empty input.
+ */
+function summarize( array $samples ): array {
+    if ( ! $samples ) {
+        return array(
+            'best'   => 0.0,
+            'median' => 0.0,
+            'mean'   => 0.0,
+        );
+    }
+    sort( $samples );
+    $n    = count( $samples );
+    $best = $samples[ $n - 1 ];
+    // Odd count: middle element. Even count: mean of the two middle elements.
+    $median = $n % 2
+        ? $samples[ (int) ( $n / 2 ) ]
+        : ( $samples[ $n / 2 - 1 ] + $samples[ $n / 2 ] ) / 2.0;
+    $mean   = array_sum( $samples ) / $n;
+    return compact( 'best', 'median', 'mean' );
+}
+
+/**
+ * Format one result-table row; speedup is median-on over median-off.
+ *
+ * @param string $label       Scenario name.
+ * @param array  $off_summary summarize() output for cache-off samples.
+ * @param array  $on_summary  summarize() output for cache-on samples.
+ * @param float  $hit_rate    Hit rate to print.
+ * @return string Row text ending in a newline.
+ */
+function format_row( string $label, array $off_summary, array $on_summary, float $hit_rate ): string {
+    $speedup = $off_summary['median'] > 0 ? $on_summary['median'] / $off_summary['median'] : 0.0;
+    return sprintf(
+        "%-22s | off med %7.0f QPS best %7.0f | on med %7.0f QPS best %7.0f | hr %5.2f | speedup %5.2fx\n",
+        $label,
+        $off_summary['median'],
+        $off_summary['best'],
+        $on_summary['median'],
+        $on_summary['best'],
+        $hit_rate,
+        $speedup
+    );
+}
+
+// --- Memory measurement helper ---
+/**
+ * Fill a fresh cache and report how much memory it holds.
+ *
+ * @param WP_Parser_Grammar $grammar         Grammar handed to every parser.
+ * @param string            $grammar_version Passed through to the cache constructor.
+ * @param array[]           $tokenized       Token lists to drive through the cache.
+ * @param int               $cap             Cache cap; filling stops once reached.
+ * @return array Keys 'entries', 'driven_queries', 'cold_bytes', 'full_bytes',
+ *               'delta_bytes', 'peak_bytes'.
+ */
+function measure_cache_memory( WP_Parser_Grammar $grammar, string $grammar_version, array $tokenized, int $cap ): array {
+    gc_collect_cycles();
+    // The peak counter is process-wide and would otherwise still hold the peak
+    // of whatever ran before this call. Reset it where the function exists
+    // (PHP 8.2+) so 'peak_bytes' describes this fill.
+    if ( function_exists( 'memory_reset_peak_usage' ) ) {
+        memory_reset_peak_usage();
+    }
+    $baseline = memory_get_usage();
+    $cache    = new WP_MySQL_Parser_Ast_Cache( $grammar_version, $cap );
+    // Drive enough distinct queries through the cache to fill it.
+    $filled = 0;
+    foreach ( $tokenized as $tokens ) {
+        $parser = new WP_MySQL_Parser( $grammar, $tokens, $cache );
+        $parser->next_query();
+        ++$filled;
+        if ( $cache->get_stats()['entries'] >= $cap ) {
+            break;
+        }
+    }
+    gc_collect_cycles();
+    $after = memory_get_usage();
+    $peak  = memory_get_peak_usage();
+    return array(
+        'entries'        => $cache->get_stats()['entries'],
+        'driven_queries' => $filled,
+        'cold_bytes'     => $baseline,
+        'full_bytes'     => $after,
+        'delta_bytes'    => $after - $baseline,
+        'peak_bytes'     => $peak,
+    );
+}
+
+// --- Run scenarios ---
+// Each selected scenario builds its workload, lexes it once, and runs ABAB with cap 200.
+$results = array();
+
+if ( in_array( 'miss', $scenarios, true ) ) {
+    $queries         = build_unique_miss_queries( 2000 );
+    $tokenized       = pretokenize( $queries );
+    $results['miss'] = run_abab( $tokenized, $grammar, 200, $rounds, $iters, $warmup, $sleep_between, 'miss', $php_version, $csv_path, $grammar_version );
+}
+
+if ( in_array( 'hit', $scenarios, true ) ) {
+    $queries = build_single_hit_query();
+    // Replicate to keep timing per-iter measurable.
+    $queries        = array_merge( ...array_fill( 0, 50000, $queries ) );
+    $tokenized      = pretokenize( $queries );
+    $results['hit'] = run_abab( $tokenized, $grammar, 200, $rounds, $iters, $warmup, $sleep_between, 'hit', $php_version, $csv_path, $grammar_version );
+}
+
+if ( in_array( 'corpus', $scenarios, true ) ) {
+    $queries           = load_corpus_queries( $corpus_limit );
+    $tokenized         = pretokenize( $queries );
+    $results['corpus'] = run_abab( $tokenized, $grammar, 200, $rounds, $iters, $warmup, $sleep_between, 'corpus', $php_version, $csv_path, $grammar_version );
+}
+
+if ( in_array( 'wp', $scenarios, true ) ) {
+    $queries       = build_wp_workload( 100 );
+    $tokenized     = pretokenize( $queries );
+    $results['wp'] = run_abab( $tokenized, $grammar, 200, $rounds, $iters, $warmup, $sleep_between, 'wp', $php_version, $csv_path, $grammar_version );
+}
+
+if ( in_array( 'cap_sweep', $scenarios, true ) ) {
+    // Cap-sweep needs enough *distinct* signatures to make caps actually
+    // bite. We synthesise 600 distinct shapes (templates) and replay each
+    // of them $cap_replays times in one uniformly shuffled stream (every
+    // shape is equally frequent) so the LRU policy is exercised.
+    $cap_distinct = 600;
+    $cap_replays  = 30;
+    $templates    = array();
+    for ( $i = 0; $i < $cap_distinct; ++$i ) {
+        $op = $i % 4;
+        if ( 0 === $op ) {
+            $templates[] = "SELECT col_$i FROM table_$i WHERE id = 1";
+        } elseif ( 1 === $op ) {
+            $templates[] = "INSERT INTO table_$i (a, b) VALUES (1, 'x')";
+        } elseif ( 2 === $op ) {
+            $templates[] = "UPDATE table_$i SET col_$i = 1 WHERE id = 1";
+        } else {
+            $templates[] = "SELECT col_$i, other_$i FROM table_$i t JOIN other_$i o ON t.id = o.id WHERE t.col_$i > 0";
+        }
+    }
+    $queries = array();
+    for ( $r = 0; $r < $cap_replays; ++$r ) {
+        foreach ( $templates as $t ) {
+            $queries[] = $t;
+        }
+    }
+    // Fixed seed so every cap sees the same stream order.
+    mt_srand( 13 );
+    shuffle( $queries );
+    $tokenized = pretokenize( $queries );
+    foreach ( array( 50, 100, 200, 500 ) as $cap ) {
+        $results[ "cap=$cap" ] = run_abab( $tokenized, $grammar, $cap, $rounds, $iters, $warmup, $sleep_between, "cap=$cap", $php_version, $csv_path, $grammar_version );
+    }
+}
+
+// --- Memory: separate (no ABAB needed). Use synthetic distinct shapes
+// so we can actually fill the cache to cap.
---
+// Single source for the cap so the measurement and the printed label cannot drift apart.
+$mem_cap        = 200;
+$memory_summary = null;
+if ( in_array( 'wp', $scenarios, true ) || in_array( 'cap_sweep', $scenarios, true ) || in_array( 'memory', $scenarios, true ) ) {
+    // 250 distinct shapes, cycling through four statement forms.
+    $mem_queries = array();
+    for ( $i = 0; $i < 250; ++$i ) {
+        $op = $i % 4;
+        if ( 0 === $op ) {
+            $mem_queries[] = "SELECT col_$i FROM table_$i WHERE id = 1";
+        } elseif ( 1 === $op ) {
+            $mem_queries[] = "INSERT INTO table_$i (a, b) VALUES (1, 'x')";
+        } elseif ( 2 === $op ) {
+            $mem_queries[] = "UPDATE table_$i SET col_$i = 1 WHERE id = 1";
+        } else {
+            $mem_queries[] = "SELECT col_$i, other_$i FROM table_$i t JOIN other_$i o ON t.id = o.id WHERE t.col_$i > 0";
+        }
+    }
+    $tokenized      = pretokenize( $mem_queries );
+    $memory_summary = measure_cache_memory( $grammar, $grammar_version, $tokenized, $mem_cap );
+}
+
+// --- Output table ---
+echo "\n";
+echo str_repeat( '=', 100 ) . "\n";
+echo "AST cache benchmark | PHP $php_version | rounds=$rounds iters=$iters warmup=$warmup sleep={$sleep_between}s\n";
+echo str_repeat( '=', 100 ) . "\n";
+
+// One row per scenario, in the order the scenarios were run.
+foreach ( $results as $name => $res ) {
+    $off_summary = summarize( $res['off'] );
+    $on_summary  = summarize( $res['on'] );
+    echo format_row( $name, $off_summary, $on_summary, $res['hit_rate'] );
+}
+
+if ( null !== $memory_summary ) {
+    echo "\n";
+    // The figures come from the synthetic shapes built above, not from the WP workload.
+    echo "Memory at cap=$mem_cap (synthetic distinct shapes):\n";
+    printf(
+        " entries filled = %d (after %d queries)\n delta = %.2f MB\n peak = %.2f MB\n",
+        $memory_summary['entries'],
+        $memory_summary['driven_queries'],
+        $memory_summary['delta_bytes'] / ( 1024 * 1024 ),
+        $memory_summary['peak_bytes'] / ( 1024 * 1024 )
+    );
+}
From fec4ae6693b78eb4947d5761152f1a4e59ed504d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Jake=C5=A1?=
Date: Sat, 6 Jun 2026 17:49:03 +0200
Subject: [PATCH 23/23] Add native Rust extension write-up

The one native path actually shipped (PR #381/#423/#378). ~1.33x over optimized
PHP under JIT (the original ~10x conflated lazy-AST + no-JIT). Code lives on
those branches; documented here.
---
 experiments/native-rust-extension/NOTES.md | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 experiments/native-rust-extension/NOTES.md

diff --git a/experiments/native-rust-extension/NOTES.md b/experiments/native-rust-extension/NOTES.md
new file mode 100644
index 000000000..1118c3a4a
--- /dev/null
+++ b/experiments/native-rust-extension/NOTES.md
@@ -0,0 +1,29 @@
+# Native Rust parser extension
+
+**Origin:** the one native path actually built and shipped — PR #381 (the
+extension), follow-ups #384–#394, #423 (native-backed AST nodes), and #378 (which
+reworked the AST to materialize eagerly). The crate lives at
+`packages/php-ext-wp-mysql-parser/` on those branches; not copied here to keep this
+branch a clean diff against trunk.
+
+**Idea:** skip the regex tricks; write the lexer and parser in Rust and ship them
+as an optional PHP extension for environments we control.
+
+**Result (parser-only, AST materialized, this machine):**
+
+| config                | trunk PHP | optimized PHP | native        |
+|-----------------------|-----------|---------------|---------------|
+| opcache + tracing JIT | ~28K QPS  | ~57K          | ~77K (~1.33×) |
+| no opcache / no JIT   | ~12K QPS  | ~34K          | ~75K (~2.19×) |
+
+The original "~10×" claim was too optimistic for two reasons, both confirmed:
+(1) the native pipeline loaded AST nodes LAZILY and the first benchmark never
+materialized them (materializing costs ~3.5×: ~265K → ~77K); (2) it compared
+native against pure PHP WITHOUT JIT. With JIT and materialization, native is only
+~1.33× over the optimized PHP parser.
+
+**Verdict:** native is faster, but only ~1.3× over optimized PHP under production
+JIT — questionable whether the native build is worth maintaining. Notably, the
+optimized PHP parser run in validate-only mode (no node materialization, ~246K) is
+in the same league as the native lazy path, so the cheapest remaining win is
+PHP-side (skip materialization), not native.