- name: Compare file sizes
  if: inputs.mode == 'pr' && matrix.remote_storage == null
  shell: bash
  env:
    # Pass expression values through the environment instead of splicing them
    # into the script text, so unusual characters cannot change the shell code.
    BENCHMARK: ${{ matrix.subcommand }}
    BENCHMARK_NAME: ${{ matrix.name }}
    HEAD_SHA: ${{ github.event.pull_request.head.sha }}
    GH_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    set -Eeu -o pipefail -x

    # Capture HEAD file sizes (vortex formats only)
    uv run --no-project scripts/capture-file-sizes.py \
      vortex-bench/data \
      --benchmark "$BENCHMARK" \
      --commit "$HEAD_SHA" \
      -o head-sizes.json

    # Get base commit SHA (same as benchmark comparison)
    base_commit_sha=$(\
      curl -L \
        -H "Accept: application/vnd.github+json" \
        -H "Authorization: Bearer $GH_API_TOKEN" \
        https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
      | jq -r '.workflow_runs[].head_sha' \
    )

    # Download file sizes baseline (best effort: it may not exist yet)
    python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes.json.gz file-sizes.json.gz --no-sign-request || true

    # Generate comparison report
    echo "# File Sizes: ${BENCHMARK_NAME}" > sizes-comment.md
    echo '' >> sizes-comment.md

    if [ -f file-sizes.json.gz ]; then
      # Keep only the records of this benchmark at the base commit. The
      # baseline may contain records uploaded by other matrix jobs; matching on
      # the commit alone would report their files as removed. An empty or
      # unknown SHA simply selects nothing.
      gzip -d -c file-sizes.json.gz \
        | jq -c --arg sha "$base_commit_sha" --arg bench "$BENCHMARK" \
            'select(.commit_id == $sha and .benchmark == $bench)' \
        > base-sizes.json || true
      if [ -s base-sizes.json ]; then
        uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
          >> sizes-comment.md
      else
        echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
      fi
    else
      echo '_No baseline file sizes available yet._' >> sizes-comment.md
    fi

    cat sizes-comment.md >> "$GITHUB_STEP_SUMMARY"
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Capture file sizes from benchmark data directories and output as JSONL."""

import argparse
import json
import sys
from pathlib import Path

# Formats to capture (vortex formats only, not parquet/duckdb).
# Note: the "vortex" CLI arg maps to the "vortex-file-compressed" directory name.
_FORMATS_TO_CAPTURE = frozenset({"vortex-file-compressed", "vortex-compact"})


def _collect_records(benchmark_dir, benchmark, commit):
    """Return one size record per file found under the captured format dirs.

    Handles both direct format dirs (clickbench_partitioned/vortex-file-compressed/)
    and scale factor subdirs (tpch/1.0/vortex-file-compressed/).
    """
    records = []
    for format_dir in benchmark_dir.rglob("*"):
        if not format_dir.is_dir() or format_dir.name not in _FORMATS_TO_CAPTURE:
            continue

        # Path between the benchmark dir and the format dir, e.g. "1.0" for
        # tpch/1.0/vortex-file-compressed; "." when there is none.
        path_between = format_dir.relative_to(benchmark_dir).parent

        # A format dir nested inside another captured format dir is already
        # covered by the outer dir's recursive walk; skipping it avoids
        # recording the same file twice.
        if any(part in _FORMATS_TO_CAPTURE for part in path_between.parts):
            continue

        # as_posix() keeps the recorded keys identical on every platform.
        scale_factor = path_between.as_posix() if path_between.parts else "1.0"

        for file_path in format_dir.rglob("*"):
            if not file_path.is_file():
                continue
            records.append(
                {
                    "commit_id": commit,
                    "benchmark": benchmark,
                    "scale_factor": scale_factor,
                    "format": format_dir.name,
                    "file": file_path.relative_to(format_dir).as_posix(),
                    "size_bytes": file_path.stat().st_size,
                }
            )
    return records


def main(argv=None):
    """Write a JSONL file describing the size of every captured benchmark file.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) reads them from ``sys.argv`` as before.

    Exits with status 1 when the data directory is missing (or is not a
    directory) or when no directory matches the benchmark name.
    """
    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
    parser.add_argument("--commit", required=True, help="Commit SHA")
    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
    args = parser.parse_args(argv)

    data_dir = Path(args.data_dir)
    # is_dir() rather than exists(): a regular file here would otherwise make
    # iterdir() fail with an unhandled NotADirectoryError.
    if not data_dir.is_dir():
        print(f"Data directory not found: {data_dir}", file=sys.stderr)
        sys.exit(1)

    # Find benchmark directories matching the name: either an exact match
    # (e.g., tpch) or a flavor of it (e.g., clickbench_partitioned).
    benchmark_dirs = [
        d
        for d in data_dir.iterdir()
        if d.is_dir() and (d.name == args.benchmark or d.name.startswith(f"{args.benchmark}_"))
    ]

    if not benchmark_dirs:
        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
        sys.exit(1)

    records = []
    for benchmark_dir in benchmark_dirs:
        records.extend(_collect_records(benchmark_dir, args.benchmark, args.commit))

    # Sort for deterministic output
    records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))

    # Write JSONL output
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

    print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr)
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Compare file sizes between base and HEAD and generate markdown report."""

import argparse
import json
import sys
from collections import defaultdict


def format_size(size_bytes: int) -> str:
    """Format a non-negative byte count as a human-readable size."""
    if size_bytes >= 1024**3:
        return f"{size_bytes / (1024**3):.2f} GB"
    elif size_bytes >= 1024**2:
        return f"{size_bytes / (1024**2):.2f} MB"
    elif size_bytes >= 1024:
        return f"{size_bytes / 1024:.2f} KB"
    else:
        return f"{size_bytes} B"


def format_change(change_bytes: int) -> str:
    """Format a byte delta with an explicit sign ("+" growth, "-" shrink)."""
    # The magnitude is formatted from abs(), so the minus sign has to be added
    # explicitly; otherwise a shrink would read like an unsigned size.
    if change_bytes > 0:
        sign = "+"
    elif change_bytes < 0:
        sign = "-"
    else:
        sign = ""
    return f"{sign}{format_size(abs(change_bytes))}"


def format_pct_change(pct: float) -> str:
    """Format percentage change with sign."""
    sign = "+" if pct > 0 else ""
    return f"{sign}{pct:.1f}%"


def _load_sizes(path):
    """Read a JSONL size file into {(benchmark, scale, format, file): bytes}.

    Blank lines are skipped. Raises FileNotFoundError if the file is missing.
    """
    sizes = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            # Support old records without scale_factor (default to "1.0")
            scale_factor = record.get("scale_factor", "1.0")
            key = (record["benchmark"], scale_factor, record["format"], record["file"])
            sizes[key] = record["size_bytes"]
    return sizes


def main(argv=None):
    """Print a markdown report of file size changes between base and HEAD.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) reads them from ``sys.argv`` as before.

    A missing input file is reported in the output and exits with status 0.
    """
    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
    parser.add_argument("base_file", help="Base JSONL file")
    parser.add_argument("head_file", help="HEAD JSONL file")
    args = parser.parse_args(argv)

    try:
        base_data = _load_sizes(args.base_file)
    except FileNotFoundError:
        print("_Base file sizes not found._")
        sys.exit(0)

    try:
        head_data = _load_sizes(args.head_file)
    except FileNotFoundError:
        print("_HEAD file sizes not found._")
        sys.exit(0)

    # The base file may hold records for benchmarks / scale factors that HEAD
    # never measured (e.g. a baseline shared by several jobs). Comparing those
    # would list every such file as removed and skew the totals, so the base is
    # restricted to the (benchmark, scale factor) pairs HEAD covers.
    head_scopes = {(benchmark, scale) for benchmark, scale, _, _ in head_data}
    if head_scopes:
        base_data = {key: size for key, size in base_data.items() if (key[0], key[1]) in head_scopes}

    # Compare sizes
    comparisons = []
    format_totals = defaultdict(lambda: {"base": 0, "head": 0})

    # Sorted iteration (plus the stable sort below) keeps row order
    # reproducible; plain set iteration order varies between runs.
    all_keys = sorted(set(base_data) | set(head_data), key=lambda k: tuple(map(str, k)))
    for key in all_keys:
        _, scale_factor, fmt, file_name = key
        base_size = base_data.get(key, 0)
        head_size = head_data.get(key, 0)

        format_totals[fmt]["base"] += base_size
        format_totals[fmt]["head"] += head_size

        change = head_size - base_size
        if change == 0:
            continue

        # A zero base with a non-zero change means the file is new at HEAD.
        pct_change = (head_size / base_size - 1) * 100 if base_size > 0 else float("inf")

        comparisons.append(
            {
                "file": file_name,
                "scale_factor": scale_factor,
                "format": fmt,
                "base_size": base_size,
                "head_size": head_size,
                "change": change,
                "pct_change": pct_change,
            }
        )

    if not comparisons:
        print("_No file size changes detected._")
        return

    # Sort by pct_change descending (largest increases first)
    comparisons.sort(key=lambda x: x["pct_change"], reverse=True)

    # Output markdown table
    print("| File | Scale | Format | Base | HEAD | Change | % |")
    print("|------|-------|--------|------|------|--------|---|")

    for comp in comparisons:
        pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new"
        base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-"
        print(
            f"| {comp['file']} | {comp['scale_factor']} | {comp['format']} | {base_str} | "
            f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |"
        )

    # Output totals
    print("")
    print("**Totals:**")
    for fmt in sorted(format_totals.keys()):
        totals = format_totals[fmt]
        base_total = totals["base"]
        head_total = totals["head"]
        if base_total > 0:
            total_pct = (head_total / base_total - 1) * 100
            pct_str = f" ({format_pct_change(total_pct)})"
        else:
            pct_str = ""
        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_str}")


if __name__ == "__main__":
    main()