From d1edb798fadc87fb50d0aa876af38f7b1d0aec63 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 11:17:15 -0400 Subject: [PATCH 1/6] Add comment showing change in ClickBench files Signed-off-by: Andrew Duffy --- .github/workflows/sql-benchmarks.yml | 62 +++++++++++- scripts/capture-file-sizes.py | 80 +++++++++++++++ scripts/compare-file-sizes.py | 141 +++++++++++++++++++++++++++ 3 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 scripts/capture-file-sizes.py create mode 100644 scripts/compare-file-sizes.py diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 08afe175341..042ff53c3c4 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -223,7 +223,6 @@ jobs: ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }} - name: Install uv - if: inputs.mode == 'pr' uses: spiraldb/actions/.github/actions/setup-uv@0.18.5 with: sync: false @@ -260,6 +259,56 @@ jobs: # unique benchmark configuration must have a unique comment-tag. comment-tag: bench-pr-comment-${{ matrix.id }} + - name: Compare file sizes + if: inputs.mode == 'pr' + shell: bash + run: | + set -Eeu -o pipefail -x + + # Capture HEAD file sizes (vortex formats only) + uv run --no-project scripts/capture-file-sizes.py \ + vortex-bench/data \ + --benchmark ${{ matrix.subcommand }} \ + --commit ${{ github.event.pull_request.head.sha }} \ + -o head-sizes.json + + # Get base commit SHA (same as benchmark comparison) + base_commit_sha=$(\ + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \ + | jq -r '.workflow_runs[].head_sha' \ + ) + + # Download file sizes baseline + python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes.json.gz file-sizes.json.gz --no-sign-request || true + + # Generate comparison report + echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md + echo '' >> sizes-comment.md + + if [ -f file-sizes.json.gz ]; then + gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true + if [ -s base-sizes.json ]; then + uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \ + >> sizes-comment.md + else + echo '_No baseline file sizes found for base commit._' >> sizes-comment.md + fi + else + echo '_No baseline file sizes available yet._' >> sizes-comment.md + fi + + cat sizes-comment.md >> $GITHUB_STEP_SUMMARY + + - name: Comment PR with file sizes + if: inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false + uses: thollander/actions-comment-pull-request@v3 + with: + file-path: sizes-comment.md + comment-tag: file-sizes-${{ matrix.id }} + - name: Comment PR on failure if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false uses: thollander/actions-comment-pull-request@v3 @@ -276,6 +325,17 @@ jobs: run: | bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json + - name: Upload File Sizes + if: inputs.mode == 'develop' + shell: bash + run: | + uv run --no-project scripts/capture-file-sizes.py \ + vortex-bench/data \ + --benchmark ${{ matrix.subcommand }} \ + --commit ${{ github.sha }} \ + -o sizes.json + bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes.json.gz sizes.json + - name: Alert incident.io if: failure() && inputs.mode == 'develop' uses: ./.github/actions/alert-incident-io diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py new file mode 100644 index 00000000000..d6f7e5350e1 --- /dev/null +++ b/scripts/capture-file-sizes.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Capture file sizes from benchmark data directories and output as JSONL.""" + +import argparse +import json +import sys +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser( + description="Capture file sizes from benchmark data directories" + ) + parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)") + parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)") + parser.add_argument("--commit", required=True, help="Commit SHA") + parser.add_argument("-o", "--output", required=True, help="Output JSONL file path") + args = parser.parse_args() + + data_dir = Path(args.data_dir) + if not data_dir.exists(): + print(f"Data directory not found: {data_dir}", file=sys.stderr) + sys.exit(1) + + benchmark_dir = data_dir / args.benchmark + if not benchmark_dir.exists(): + print(f"Benchmark directory not found: {benchmark_dir}", file=sys.stderr) + sys.exit(1) + + # Formats to capture (vortex formats only, not parquet/duckdb) + formats_to_capture = {"vortex", "vortex-compact"} + + records = [] + + # Walk subdirectories looking for format directories + for format_dir in benchmark_dir.iterdir(): + if not format_dir.is_dir(): + continue + + format_name = format_dir.name + if format_name not in formats_to_capture: + continue + + # Capture all files in this format directory + for file_path in format_dir.rglob("*"): + if not file_path.is_file(): + continue + + size_bytes = file_path.stat().st_size + relative_path = file_path.relative_to(format_dir) + + records.append({ + "commit_id": args.commit, + "benchmark": args.benchmark, + "format": format_name, + "file": str(relative_path), + "size_bytes": size_bytes, + }) + + # Sort for deterministic output + records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"])) + + # Write JSONL output + with open(args.output, "w") as f: + for record in records: + f.write(json.dumps(record) + "\n") + + print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py new file mode 100644 index 00000000000..1087f9e9727 --- /dev/null +++ b/scripts/compare-file-sizes.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Compare file sizes between base and HEAD and generate markdown report.""" + +import argparse +import json +import sys +from collections import defaultdict + + +def format_size(size_bytes: int) -> str: + """Format bytes as human-readable size.""" + if size_bytes >= 1024 ** 3: + return f"{size_bytes / (1024 ** 3):.2f} GB" + elif size_bytes >= 1024 ** 2: + return f"{size_bytes / (1024 ** 2):.2f} MB" + elif size_bytes >= 1024: + return f"{size_bytes / 1024:.2f} KB" + else: + return f"{size_bytes} B" + + +def format_change(change_bytes: int) -> str: + """Format byte change with sign.""" + sign = "+" if change_bytes > 0 else "" + return f"{sign}{format_size(abs(change_bytes))}" + + +def format_pct_change(pct: float) -> str: + """Format percentage change with sign.""" + sign = "+" if pct > 0 else "" + return f"{sign}{pct:.1f}%" + + +def main(): + parser = argparse.ArgumentParser( + description="Compare file sizes between base and HEAD" + ) + parser.add_argument("base_file", help="Base JSONL file") + parser.add_argument("head_file", help="HEAD JSONL file") + args = parser.parse_args() + + # Load base and head data + base_data = {} + try: + with open(args.base_file) as f: + for line in f: + record = json.loads(line) + key = (record["benchmark"], record["format"], record["file"]) + base_data[key] = record["size_bytes"] + except FileNotFoundError: + print("_Base file sizes not found._") + sys.exit(0) + + head_data = {} + try: + with open(args.head_file) as f: + for line in f: + record = json.loads(line) + key = (record["benchmark"], record["format"], record["file"]) + head_data[key] = record["size_bytes"] + except FileNotFoundError: + print("_HEAD file sizes not found._") + sys.exit(0) + + # Compare sizes + comparisons = [] + format_totals = defaultdict(lambda: {"base": 0, "head": 0}) + + all_keys = set(base_data.keys()) | set(head_data.keys()) + for key in all_keys: + benchmark, fmt, file_name = key + base_size = base_data.get(key, 0) + head_size = head_data.get(key, 0) + + format_totals[fmt]["base"] += base_size + format_totals[fmt]["head"] += head_size + + change = head_size - base_size + if change == 0: + continue + + if base_size > 0: + pct_change = (head_size / base_size - 1) * 100 + elif head_size > 0: + pct_change = float("inf") + else: + pct_change = 0 + + comparisons.append({ + "file": file_name, + "format": fmt, + "base_size": base_size, + "head_size": head_size, + "change": change, + "pct_change": pct_change, + }) + + if not comparisons: + print("_No file size changes detected._") + return + + # Sort by pct_change descending (largest increases first) + comparisons.sort(key=lambda x: x["pct_change"], reverse=True) + + # Output markdown table + print("| File | Format | Base | HEAD | Change | % |") + print("|------|--------|------|------|--------|---|") + + for comp in comparisons: + pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new" + base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-" + print( + f"| {comp['file']} | {comp['format']} | {base_str} | " + f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |" + ) + + # Output totals + print("") + print("**Totals:**") + for fmt in sorted(format_totals.keys()): + totals = format_totals[fmt] + base_total = totals["base"] + head_total = totals["head"] + if base_total > 0: + total_pct = (head_total / base_total - 1) * 100 + pct_str = f" ({format_pct_change(total_pct)})" + else: + pct_str = "" + print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_str}") + + +if __name__ == "__main__": + main() From 6b785a8d143cbf8196a339522c3bfa1e2f413a6e Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 11:26:59 -0400 Subject: [PATCH 2/6] ruff format Signed-off-by: Andrew Duffy --- scripts/capture-file-sizes.py | 20 ++++++++++---------- scripts/compare-file-sizes.py | 30 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py index d6f7e5350e1..46417f80bcd 100644 --- a/scripts/capture-file-sizes.py +++ b/scripts/capture-file-sizes.py @@ -16,9 +16,7 @@ def main(): - parser = argparse.ArgumentParser( - description="Capture file sizes from benchmark data directories" - ) + parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories") parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)") parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)") parser.add_argument("--commit", required=True, help="Commit SHA") @@ -57,13 +55,15 @@ def main(): size_bytes = file_path.stat().st_size relative_path = file_path.relative_to(format_dir) - records.append({ - "commit_id": args.commit, - "benchmark": args.benchmark, - "format": format_name, - "file": str(relative_path), - "size_bytes": size_bytes, - }) + records.append( + { + "commit_id": args.commit, + "benchmark": args.benchmark, + "format": format_name, + "file": str(relative_path), + "size_bytes": size_bytes, + } + ) # Sort for deterministic output records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"])) diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py index 1087f9e9727..1e7274682fa 100644 --- a/scripts/compare-file-sizes.py +++ b/scripts/compare-file-sizes.py @@ -17,10 +17,10 @@ def format_size(size_bytes: int) -> str: """Format bytes as human-readable size.""" - if size_bytes >= 1024 ** 3: - return f"{size_bytes / (1024 ** 3):.2f} GB" - elif size_bytes >= 1024 ** 2: - return f"{size_bytes / (1024 ** 2):.2f} MB" + if size_bytes >= 1024**3: + return f"{size_bytes / (1024**3):.2f} GB" + elif size_bytes >= 1024**2: + return f"{size_bytes / (1024**2):.2f} MB" elif size_bytes >= 1024: return f"{size_bytes / 1024:.2f} KB" else: @@ -40,9 +40,7 @@ def format_pct_change(pct: float) -> str: def main(): - parser = argparse.ArgumentParser( - description="Compare file sizes between base and HEAD" - ) + parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD") parser.add_argument("base_file", help="Base JSONL file") parser.add_argument("head_file", help="HEAD JSONL file") args = parser.parse_args() @@ -94,14 +92,16 @@ def main(): else: pct_change = 0 - comparisons.append({ - "file": file_name, - "format": fmt, - "base_size": base_size, - "head_size": head_size, - "change": change, - "pct_change": pct_change, - }) + comparisons.append( + { + "file": file_name, + "format": fmt, + "base_size": base_size, + "head_size": head_size, + "change": change, + "pct_change": pct_change, + } + ) if not comparisons: print("_No file size changes detected._") From ae9f875fe53fdeff3ae1fcb721328f95a5fc5945 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 11:39:20 -0400 Subject: [PATCH 3/6] fix script to work with clickbench_partitioned, vortex-file-compressed Signed-off-by: Andrew Duffy --- scripts/capture-file-sizes.py | 63 ++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py index 46417f80bcd..8bdeca5e4b3 100644 --- a/scripts/capture-file-sizes.py +++ b/scripts/capture-file-sizes.py @@ -28,42 +28,53 @@ def main(): print(f"Data directory not found: {data_dir}", file=sys.stderr) sys.exit(1) - benchmark_dir = data_dir / args.benchmark - if not benchmark_dir.exists(): - print(f"Benchmark directory not found: {benchmark_dir}", file=sys.stderr) + # Find benchmark directories matching the name (handles flavors like clickbench_partitioned) + # Also handles exact match (e.g., tpch) + benchmark_dirs = [ + d + for d in data_dir.iterdir() + if d.is_dir() and (d.name == args.benchmark or d.name.startswith(f"{args.benchmark}_")) + ] + + if not benchmark_dirs: + print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr) sys.exit(1) # Formats to capture (vortex formats only, not parquet/duckdb) - formats_to_capture = {"vortex", "vortex-compact"} + # Note: "vortex" CLI arg maps to "vortex-file-compressed" directory name + formats_to_capture = {"vortex-file-compressed", "vortex-compact"} records = [] # Walk subdirectories looking for format directories - for format_dir in benchmark_dir.iterdir(): - if not format_dir.is_dir(): - continue - - format_name = format_dir.name - if format_name not in formats_to_capture: - continue + # Handle both direct format dirs (clickbench_partitioned/vortex-file-compressed/) + # and scale factor subdirs (tpch/1.0/vortex-file-compressed/) + for benchmark_dir in benchmark_dirs: + for format_dir in benchmark_dir.rglob("*"): + if not format_dir.is_dir(): + continue - # Capture all files in this format directory - for file_path in format_dir.rglob("*"): - if not file_path.is_file(): + format_name = format_dir.name + if format_name not in formats_to_capture: continue - size_bytes = file_path.stat().st_size - relative_path = file_path.relative_to(format_dir) - - records.append( - { - "commit_id": args.commit, - "benchmark": args.benchmark, - "format": format_name, - "file": str(relative_path), - "size_bytes": size_bytes, - } - ) + # Capture all files in this format directory + for file_path in format_dir.rglob("*"): + if not file_path.is_file(): + continue + + size_bytes = file_path.stat().st_size + relative_path = file_path.relative_to(format_dir) + + records.append( + { + "commit_id": args.commit, + "benchmark": args.benchmark, + "format": format_name, + "file": str(relative_path), + "size_bytes": size_bytes, + } + ) # Sort for deterministic output records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"])) From 2912d41bec7792ff47998b0b13a419f532b284e4 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 11:48:04 -0400 Subject: [PATCH 4/6] avoid duplicate outputs for s3 benches Signed-off-by: Andrew Duffy --- .github/workflows/sql-benchmarks.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 042ff53c3c4..9e7c6f86281 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -260,7 +260,7 @@ jobs: comment-tag: bench-pr-comment-${{ matrix.id }} - name: Compare file sizes - if: inputs.mode == 'pr' + if: inputs.mode == 'pr' && matrix.remote_storage == null shell: bash run: | set -Eeu -o pipefail -x @@ -303,7 +303,7 @@ jobs: cat sizes-comment.md >> $GITHUB_STEP_SUMMARY - name: Comment PR with file sizes - if: inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false + if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false uses: thollander/actions-comment-pull-request@v3 with: file-path: sizes-comment.md @@ -326,7 +326,7 @@ jobs: bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json - name: Upload File Sizes - if: inputs.mode == 'develop' + if: inputs.mode == 'develop' && matrix.remote_storage == null shell: bash run: | uv run --no-project scripts/capture-file-sizes.py \ From 7fa0c26a792d77214e0202784b41c151d539f363 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 12:20:28 -0400 Subject: [PATCH 5/6] save scale_factor Signed-off-by: Andrew Duffy --- scripts/capture-file-sizes.py | 8 +++++++- scripts/compare-file-sizes.py | 16 ++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py index 8bdeca5e4b3..754df1ee702 100644 --- a/scripts/capture-file-sizes.py +++ b/scripts/capture-file-sizes.py @@ -58,6 +58,11 @@ def main(): if format_name not in formats_to_capture: continue + # Extract scale factor from path (e.g., "1.0" for tpch/1.0/vortex-file-compressed) + # Default to "1.0" if no intermediate directory (e.g., clickbench) + path_between = format_dir.relative_to(benchmark_dir).parent + scale_factor = str(path_between) if str(path_between) != "." else "1.0" + # Capture all files in this format directory for file_path in format_dir.rglob("*"): if not file_path.is_file(): @@ -70,6 +75,7 @@ def main(): { "commit_id": args.commit, "benchmark": args.benchmark, + "scale_factor": scale_factor, "format": format_name, "file": str(relative_path), "size_bytes": size_bytes, @@ -77,7 +83,7 @@ def main(): ) # Sort for deterministic output - records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"])) + records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"])) # Write JSONL output with open(args.output, "w") as f: diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py index 1e7274682fa..f840576c097 100644 --- a/scripts/compare-file-sizes.py +++ b/scripts/compare-file-sizes.py @@ -51,7 +51,9 @@ def main(): with open(args.base_file) as f: for line in f: record = json.loads(line) - key = (record["benchmark"], record["format"], record["file"]) + # Support old records without scale_factor (default to "1.0") + scale_factor = record.get("scale_factor", "1.0") + key = (record["benchmark"], scale_factor, record["format"], record["file"]) base_data[key] = record["size_bytes"] except FileNotFoundError: print("_Base file sizes not found._") @@ -62,7 +64,8 @@ def main(): with open(args.head_file) as f: for line in f: record = json.loads(line) - key = (record["benchmark"], record["format"], record["file"]) + scale_factor = record.get("scale_factor", "1.0") + key = (record["benchmark"], scale_factor, record["format"], record["file"]) head_data[key] = record["size_bytes"] except FileNotFoundError: print("_HEAD file sizes not found._") @@ -74,7 +77,7 @@ def main(): all_keys = set(base_data.keys()) | set(head_data.keys()) for key in all_keys: - benchmark, fmt, file_name = key + benchmark, scale_factor, fmt, file_name = key base_size = base_data.get(key, 0) head_size = head_data.get(key, 0) @@ -95,6 +98,7 @@ def main(): comparisons.append( { "file": file_name, + "scale_factor": scale_factor, "format": fmt, "base_size": base_size, "head_size": head_size, @@ -111,14 +115,14 @@ def main(): comparisons.sort(key=lambda x: x["pct_change"], reverse=True) # Output markdown table - print("| File | Format | Base | HEAD | Change | % |") - print("|------|--------|------|------|--------|---|") + print("| File | Scale | Format | Base | HEAD | Change | % |") + print("|------|-------|--------|------|------|--------|---|") for comp in comparisons: pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new" base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-" print( - f"| {comp['file']} | {comp['format']} | {base_str} | " + f"| {comp['file']} | {comp['scale_factor']} | {comp['format']} | {base_str} | " f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |" ) From 0cf1a9e1f40b7e88d8e2cbc1fb9db48146aa8ba8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 2 Apr 2026 12:58:46 -0400 Subject: [PATCH 6/6] fix statpopgen Signed-off-by: Andrew Duffy --- vortex-bench/src/statpopgen/statpopgen_benchmark.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs index 941859dc90b..dc5fa6448ad 100644 --- a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs +++ b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs @@ -52,7 +52,7 @@ impl StatPopGenBenchmark { ) })?; - let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/")); + let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/")); let data_url = Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;