Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ jobs:
${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}

- name: Install uv
if: inputs.mode == 'pr'
uses: spiraldb/actions/.github/actions/setup-uv@0.18.5
with:
sync: false
Expand Down Expand Up @@ -260,6 +259,56 @@ jobs:
# unique benchmark configuration must have a unique comment-tag.
comment-tag: bench-pr-comment-${{ matrix.id }}

- name: Compare file sizes
if: inputs.mode == 'pr' && matrix.remote_storage == null
shell: bash
run: |
set -Eeu -o pipefail -x

# Capture HEAD file sizes (vortex formats only)
uv run --no-project scripts/capture-file-sizes.py \
vortex-bench/data \
--benchmark ${{ matrix.subcommand }} \
--commit ${{ github.event.pull_request.head.sha }} \
-o head-sizes.json

# Get base commit SHA (same as benchmark comparison)
base_commit_sha=$(\
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
| jq -r '.workflow_runs[].head_sha' \
)

# Download file sizes baseline
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes.json.gz file-sizes.json.gz --no-sign-request || true

# Generate comparison report
echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md
echo '' >> sizes-comment.md

if [ -f file-sizes.json.gz ]; then
gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true
if [ -s base-sizes.json ]; then
uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
>> sizes-comment.md
else
echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
fi
else
echo '_No baseline file sizes available yet._' >> sizes-comment.md
fi

cat sizes-comment.md >> $GITHUB_STEP_SUMMARY

- name: Comment PR with file sizes
if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false
uses: thollander/actions-comment-pull-request@v3
with:
file-path: sizes-comment.md
comment-tag: file-sizes-${{ matrix.id }}

- name: Comment PR on failure
if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
uses: thollander/actions-comment-pull-request@v3
Expand All @@ -276,6 +325,17 @@ jobs:
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json

- name: Upload File Sizes
if: inputs.mode == 'develop' && matrix.remote_storage == null
shell: bash
run: |
uv run --no-project scripts/capture-file-sizes.py \
vortex-bench/data \
--benchmark ${{ matrix.subcommand }} \
--commit ${{ github.sha }} \
-o sizes.json
bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes.json.gz sizes.json

- name: Alert incident.io
if: failure() && inputs.mode == 'develop'
uses: ./.github/actions/alert-incident-io
Expand Down
97 changes: 97 additions & 0 deletions scripts/capture-file-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Capture file sizes from benchmark data directories and output as JSONL."""

import argparse
import json
import sys
from pathlib import Path


def main():
    """Scan benchmark data directories and emit one JSONL size record per file.

    Walks ``data_dir`` for directories belonging to the requested benchmark,
    collects the size of every file under the tracked vortex format
    directories, and writes sorted JSONL records to the output path.
    """
    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
    parser.add_argument("--commit", required=True, help="Commit SHA")
    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
    args = parser.parse_args()

    root = Path(args.data_dir)
    if not root.exists():
        print(f"Data directory not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Match both the bare benchmark name (e.g. "tpch") and flavored variants
    # such as "clickbench_partitioned".
    matches = [
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry.name == args.benchmark or entry.name.startswith(f"{args.benchmark}_"))
    ]

    if not matches:
        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
        sys.exit(1)

    # Only vortex formats are tracked (not parquet/duckdb).
    # Note: the "vortex" CLI arg maps to the "vortex-file-compressed" directory.
    wanted = {"vortex-file-compressed", "vortex-compact"}

    rows = []

    # Walk each benchmark directory for format directories. Layouts handled:
    #   direct:        clickbench_partitioned/vortex-file-compressed/
    #   scale-factor:  tpch/1.0/vortex-file-compressed/
    for bench_dir in matches:
        for candidate in bench_dir.rglob("*"):
            if not candidate.is_dir() or candidate.name not in wanted:
                continue

            # Any intermediate directory between benchmark and format encodes
            # the scale factor (e.g. "1.0"); direct layouts default to "1.0".
            between = str(candidate.relative_to(bench_dir).parent)
            scale = "1.0" if between == "." else between

            for item in candidate.rglob("*"):
                if not item.is_file():
                    continue
                rows.append(
                    {
                        "commit_id": args.commit,
                        "benchmark": args.benchmark,
                        "scale_factor": scale,
                        "format": candidate.name,
                        "file": str(item.relative_to(candidate)),
                        "size_bytes": item.stat().st_size,
                    }
                )

    # Deterministic ordering keeps output stable across runs.
    rows.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))

    with open(args.output, "w") as out:
        out.writelines(json.dumps(r) + "\n" for r in rows)

    print(f"Captured {len(rows)} file sizes to {args.output}", file=sys.stderr)


# Script entry point (also runnable via `uv run` per the inline metadata above).
if __name__ == "__main__":
    main()
145 changes: 145 additions & 0 deletions scripts/compare-file-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Compare file sizes between base and HEAD and generate markdown report."""

import argparse
import json
import sys
from collections import defaultdict


def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable size string.

    Uses binary units (1 KB = 1024 B) with two decimal places; values under
    1 KB are printed as plain bytes.
    """
    # Try the largest unit first; fall through to plain bytes.
    for exponent, unit in ((3, "GB"), (2, "MB"), (1, "KB")):
        threshold = 1024**exponent
        if size_bytes >= threshold:
            return f"{size_bytes / threshold:.2f} {unit}"
    return f"{size_bytes} B"


def format_change(change_bytes: int) -> str:
    """Format a byte delta with an explicit sign (e.g. "+1.50 KB", "-100 B").

    The magnitude is rendered via :func:`format_size` on the absolute value,
    so the sign must be prepended explicitly. Bug fix: negative deltas
    previously printed with no sign at all (``abs()`` dropped the minus and
    only positive values got a "+"), making shrinking files look identical
    to growing ones in the report.
    """
    if change_bytes > 0:
        sign = "+"
    elif change_bytes < 0:
        sign = "-"
    else:
        sign = ""
    return f"{sign}{format_size(abs(change_bytes))}"


def format_pct_change(pct: float) -> str:
    """Render a percentage delta with one decimal place and a leading sign.

    Negative values already carry "-" from float formatting, so only a
    strictly positive delta needs an explicit "+" prefix.
    """
    prefix = "+" if pct > 0 else ""
    return prefix + f"{pct:.1f}%"


def main():
    """Compare two JSONL size dumps and print a markdown report to stdout.

    Reads base and HEAD JSONL files (one record per file), prints a table of
    every size change sorted by percentage growth, then per-format totals.
    Exits cleanly with a placeholder message if either input is missing.
    """
    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
    parser.add_argument("base_file", help="Base JSONL file")
    parser.add_argument("head_file", help="HEAD JSONL file")
    args = parser.parse_args()

    def load_sizes(path):
        # Map (benchmark, scale_factor, format, file) -> size_bytes.
        # Older records may lack scale_factor; treat those as "1.0".
        sizes = {}
        with open(path) as fh:
            for raw in fh:
                rec = json.loads(raw)
                key = (rec["benchmark"], rec.get("scale_factor", "1.0"), rec["format"], rec["file"])
                sizes[key] = rec["size_bytes"]
        return sizes

    try:
        base_sizes = load_sizes(args.base_file)
    except FileNotFoundError:
        print("_Base file sizes not found._")
        sys.exit(0)

    try:
        head_sizes = load_sizes(args.head_file)
    except FileNotFoundError:
        print("_HEAD file sizes not found._")
        sys.exit(0)

    per_format = defaultdict(lambda: {"base": 0, "head": 0})
    deltas = []

    # Union of keys so that both added and removed files are accounted for;
    # a missing side counts as 0 bytes.
    for key in set(base_sizes) | set(head_sizes):
        _benchmark, scale, fmt, fname = key
        old_size = base_sizes.get(key, 0)
        new_size = head_sizes.get(key, 0)

        # Totals include unchanged files; the per-file table does not.
        per_format[fmt]["base"] += old_size
        per_format[fmt]["head"] += new_size

        diff = new_size - old_size
        if diff == 0:
            continue

        if old_size > 0:
            pct = (new_size / old_size - 1) * 100
        elif new_size > 0:
            # Brand-new file: no base to compare against.
            pct = float("inf")
        else:
            pct = 0

        deltas.append(
            {
                "file": fname,
                "scale_factor": scale,
                "format": fmt,
                "base_size": old_size,
                "head_size": new_size,
                "change": diff,
                "pct_change": pct,
            }
        )

    if not deltas:
        print("_No file size changes detected._")
        return

    # Largest relative growth first (new files sort to the top via +inf).
    deltas.sort(key=lambda d: d["pct_change"], reverse=True)

    print("| File | Scale | Format | Base | HEAD | Change | % |")
    print("|------|-------|--------|------|------|--------|---|")

    for d in deltas:
        pct_cell = "new" if d["pct_change"] == float("inf") else format_pct_change(d["pct_change"])
        base_cell = format_size(d["base_size"]) if d["base_size"] > 0 else "-"
        print(
            f"| {d['file']} | {d['scale_factor']} | {d['format']} | {base_cell} | "
            f"{format_size(d['head_size'])} | {format_change(d['change'])} | {pct_cell} |"
        )

    print("")
    print("**Totals:**")
    for fmt in sorted(per_format):
        base_total = per_format[fmt]["base"]
        head_total = per_format[fmt]["head"]
        if base_total > 0:
            suffix = f" ({format_pct_change((head_total / base_total - 1) * 100)})"
        else:
            suffix = ""
        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{suffix}")


# Script entry point (also runnable via `uv run` per the inline metadata above).
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion vortex-bench/src/statpopgen/statpopgen_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ impl StatPopGenBenchmark {
)
})?;

let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/"));

let data_url =
Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;
Expand Down
Loading