From d1edb798fadc87fb50d0aa876af38f7b1d0aec63 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 11:17:15 -0400
Subject: [PATCH 1/6] Add comment showing change in ClickBench files

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 .github/workflows/sql-benchmarks.yml |  62 +++++++++++-
 scripts/capture-file-sizes.py        |  80 +++++++++++++++
 scripts/compare-file-sizes.py        | 141 +++++++++++++++++++++++++++
 3 files changed, 282 insertions(+), 1 deletion(-)
 create mode 100644 scripts/capture-file-sizes.py
 create mode 100644 scripts/compare-file-sizes.py

diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
index 08afe175341..042ff53c3c4 100644
--- a/.github/workflows/sql-benchmarks.yml
+++ b/.github/workflows/sql-benchmarks.yml
@@ -223,7 +223,6 @@ jobs:
             ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
 
       - name: Install uv
-        if: inputs.mode == 'pr'
         uses: spiraldb/actions/.github/actions/setup-uv@0.18.5
         with:
           sync: false
@@ -260,6 +259,56 @@ jobs:
           # unique benchmark configuration must have a unique comment-tag.
           comment-tag: bench-pr-comment-${{ matrix.id }}
 
+      - name: Compare file sizes
+        if: inputs.mode == 'pr'
+        shell: bash
+        run: |
+          set -Eeu -o pipefail -x
+
+          # Capture HEAD file sizes (vortex formats only)
+          uv run --no-project scripts/capture-file-sizes.py \
+            vortex-bench/data \
+            --benchmark ${{ matrix.subcommand }} \
+            --commit ${{ github.event.pull_request.head.sha }} \
+            -o head-sizes.json
+
+          # Get base commit SHA (same as benchmark comparison)
+          base_commit_sha=$(\
+            curl -L \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
+            | jq -r '.workflow_runs[].head_sha' \
+          )
+
+          # Download file sizes baseline
+          python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes.json.gz file-sizes.json.gz --no-sign-request || true
+
+          # Generate comparison report
+          echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md
+          echo '' >> sizes-comment.md
+
+          if [ -f file-sizes.json.gz ]; then
+            gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true
+            if [ -s base-sizes.json ]; then
+              uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
+                >> sizes-comment.md
+            else
+              echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
+            fi
+          else
+            echo '_No baseline file sizes available yet._' >> sizes-comment.md
+          fi
+
+          cat sizes-comment.md >> $GITHUB_STEP_SUMMARY
+
+      - name: Comment PR with file sizes
+        if: inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
+        uses: thollander/actions-comment-pull-request@v3
+        with:
+          file-path: sizes-comment.md
+          comment-tag: file-sizes-${{ matrix.id }}
+
       - name: Comment PR on failure
         if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
         uses: thollander/actions-comment-pull-request@v3
@@ -276,6 +325,17 @@ jobs:
         run: |
           bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
 
+      - name: Upload File Sizes
+        if: inputs.mode == 'develop'
+        shell: bash
+        run: |
+          uv run --no-project scripts/capture-file-sizes.py \
+            vortex-bench/data \
+            --benchmark ${{ matrix.subcommand }} \
+            --commit ${{ github.sha }} \
+            -o sizes.json
+          bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes.json.gz sizes.json
+
       - name: Alert incident.io
         if: failure() && inputs.mode == 'develop'
         uses: ./.github/actions/alert-incident-io
diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py
new file mode 100644
index 00000000000..d6f7e5350e1
--- /dev/null
+++ b/scripts/capture-file-sizes.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.11"
+# dependencies = []
+# ///
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+"""Capture file sizes from benchmark data directories and output as JSONL."""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Capture file sizes from benchmark data directories"
+    )
+    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
+    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
+    parser.add_argument("--commit", required=True, help="Commit SHA")
+    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
+    args = parser.parse_args()
+
+    data_dir = Path(args.data_dir)
+    if not data_dir.exists():
+        print(f"Data directory not found: {data_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    benchmark_dir = data_dir / args.benchmark
+    if not benchmark_dir.exists():
+        print(f"Benchmark directory not found: {benchmark_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    # Formats to capture (vortex formats only, not parquet/duckdb)
+    formats_to_capture = {"vortex", "vortex-compact"}
+
+    records = []
+
+    # Walk subdirectories looking for format directories
+    for format_dir in benchmark_dir.iterdir():
+        if not format_dir.is_dir():
+            continue
+
+        format_name = format_dir.name
+        if format_name not in formats_to_capture:
+            continue
+
+        # Capture all files in this format directory
+        for file_path in format_dir.rglob("*"):
+            if not file_path.is_file():
+                continue
+
+            size_bytes = file_path.stat().st_size
+            relative_path = file_path.relative_to(format_dir)
+
+            records.append({
+                "commit_id": args.commit,
+                "benchmark": args.benchmark,
+                "format": format_name,
+                "file": str(relative_path),
+                "size_bytes": size_bytes,
+            })
+
+    # Sort for deterministic output
+    records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"]))
+
+    # Write JSONL output
+    with open(args.output, "w") as f:
+        for record in records:
+            f.write(json.dumps(record) + "\n")
+
+    print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py
new file mode 100644
index 00000000000..1087f9e9727
--- /dev/null
+++ b/scripts/compare-file-sizes.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.11"
+# dependencies = []
+# ///
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+"""Compare file sizes between base and HEAD and generate markdown report."""
+
+import argparse
+import json
+import sys
+from collections import defaultdict
+
+
+def format_size(size_bytes: int) -> str:
+    """Format bytes as human-readable size."""
+    if size_bytes >= 1024 ** 3:
+        return f"{size_bytes / (1024 ** 3):.2f} GB"
+    elif size_bytes >= 1024 ** 2:
+        return f"{size_bytes / (1024 ** 2):.2f} MB"
+    elif size_bytes >= 1024:
+        return f"{size_bytes / 1024:.2f} KB"
+    else:
+        return f"{size_bytes} B"
+
+
+def format_change(change_bytes: int) -> str:
+    """Format byte change with sign."""
+    sign = "+" if change_bytes > 0 else ""
+    return f"{sign}{format_size(abs(change_bytes))}"
+
+
+def format_pct_change(pct: float) -> str:
+    """Format percentage change with sign."""
+    sign = "+" if pct > 0 else ""
+    return f"{sign}{pct:.1f}%"
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare file sizes between base and HEAD"
+    )
+    parser.add_argument("base_file", help="Base JSONL file")
+    parser.add_argument("head_file", help="HEAD JSONL file")
+    args = parser.parse_args()
+
+    # Load base and head data
+    base_data = {}
+    try:
+        with open(args.base_file) as f:
+            for line in f:
+                record = json.loads(line)
+                key = (record["benchmark"], record["format"], record["file"])
+                base_data[key] = record["size_bytes"]
+    except FileNotFoundError:
+        print("_Base file sizes not found._")
+        sys.exit(0)
+
+    head_data = {}
+    try:
+        with open(args.head_file) as f:
+            for line in f:
+                record = json.loads(line)
+                key = (record["benchmark"], record["format"], record["file"])
+                head_data[key] = record["size_bytes"]
+    except FileNotFoundError:
+        print("_HEAD file sizes not found._")
+        sys.exit(0)
+
+    # Compare sizes
+    comparisons = []
+    format_totals = defaultdict(lambda: {"base": 0, "head": 0})
+
+    all_keys = set(base_data.keys()) | set(head_data.keys())
+    for key in all_keys:
+        benchmark, fmt, file_name = key
+        base_size = base_data.get(key, 0)
+        head_size = head_data.get(key, 0)
+
+        format_totals[fmt]["base"] += base_size
+        format_totals[fmt]["head"] += head_size
+
+        change = head_size - base_size
+        if change == 0:
+            continue
+
+        if base_size > 0:
+            pct_change = (head_size / base_size - 1) * 100
+        elif head_size > 0:
+            pct_change = float("inf")
+        else:
+            pct_change = 0
+
+        comparisons.append({
+            "file": file_name,
+            "format": fmt,
+            "base_size": base_size,
+            "head_size": head_size,
+            "change": change,
+            "pct_change": pct_change,
+        })
+
+    if not comparisons:
+        print("_No file size changes detected._")
+        return
+
+    # Sort by pct_change descending (largest increases first)
+    comparisons.sort(key=lambda x: x["pct_change"], reverse=True)
+
+    # Output markdown table
+    print("| File | Format | Base | HEAD | Change | % |")
+    print("|------|--------|------|------|--------|---|")
+
+    for comp in comparisons:
+        pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new"
+        base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-"
+        print(
+            f"| {comp['file']} | {comp['format']} | {base_str} | "
+            f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |"
+        )
+
+    # Output totals
+    print("")
+    print("**Totals:**")
+    for fmt in sorted(format_totals.keys()):
+        totals = format_totals[fmt]
+        base_total = totals["base"]
+        head_total = totals["head"]
+        if base_total > 0:
+            total_pct = (head_total / base_total - 1) * 100
+            pct_str = f" ({format_pct_change(total_pct)})"
+        else:
+            pct_str = ""
+        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_str}")
+
+
+if __name__ == "__main__":
+    main()

From 6b785a8d143cbf8196a339522c3bfa1e2f413a6e Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 11:26:59 -0400
Subject: [PATCH 2/6] ruff format

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 scripts/capture-file-sizes.py | 20 ++++++++++----------
 scripts/compare-file-sizes.py | 30 +++++++++++++++---------------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py
index d6f7e5350e1..46417f80bcd 100644
--- a/scripts/capture-file-sizes.py
+++ b/scripts/capture-file-sizes.py
@@ -16,9 +16,7 @@
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Capture file sizes from benchmark data directories"
-    )
+    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
     parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
     parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
     parser.add_argument("--commit", required=True, help="Commit SHA")
@@ -57,13 +55,15 @@ def main():
             size_bytes = file_path.stat().st_size
             relative_path = file_path.relative_to(format_dir)
 
-            records.append({
-                "commit_id": args.commit,
-                "benchmark": args.benchmark,
-                "format": format_name,
-                "file": str(relative_path),
-                "size_bytes": size_bytes,
-            })
+            records.append(
+                {
+                    "commit_id": args.commit,
+                    "benchmark": args.benchmark,
+                    "format": format_name,
+                    "file": str(relative_path),
+                    "size_bytes": size_bytes,
+                }
+            )
 
     # Sort for deterministic output
     records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"]))
diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py
index 1087f9e9727..1e7274682fa 100644
--- a/scripts/compare-file-sizes.py
+++ b/scripts/compare-file-sizes.py
@@ -17,10 +17,10 @@
 
 def format_size(size_bytes: int) -> str:
     """Format bytes as human-readable size."""
-    if size_bytes >= 1024 ** 3:
-        return f"{size_bytes / (1024 ** 3):.2f} GB"
-    elif size_bytes >= 1024 ** 2:
-        return f"{size_bytes / (1024 ** 2):.2f} MB"
+    if size_bytes >= 1024**3:
+        return f"{size_bytes / (1024**3):.2f} GB"
+    elif size_bytes >= 1024**2:
+        return f"{size_bytes / (1024**2):.2f} MB"
     elif size_bytes >= 1024:
         return f"{size_bytes / 1024:.2f} KB"
     else:
@@ -40,9 +40,7 @@ def format_pct_change(pct: float) -> str:
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Compare file sizes between base and HEAD"
-    )
+    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
     parser.add_argument("base_file", help="Base JSONL file")
     parser.add_argument("head_file", help="HEAD JSONL file")
     args = parser.parse_args()
@@ -94,14 +92,16 @@ def main():
         else:
             pct_change = 0
 
-        comparisons.append({
-            "file": file_name,
-            "format": fmt,
-            "base_size": base_size,
-            "head_size": head_size,
-            "change": change,
-            "pct_change": pct_change,
-        })
+        comparisons.append(
+            {
+                "file": file_name,
+                "format": fmt,
+                "base_size": base_size,
+                "head_size": head_size,
+                "change": change,
+                "pct_change": pct_change,
+            }
+        )
 
     if not comparisons:
         print("_No file size changes detected._")

From ae9f875fe53fdeff3ae1fcb721328f95a5fc5945 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 11:39:20 -0400
Subject: [PATCH 3/6] fix script to work with clickbench_partitioned,
 vortex-file-compressed

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 scripts/capture-file-sizes.py | 63 ++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py
index 46417f80bcd..8bdeca5e4b3 100644
--- a/scripts/capture-file-sizes.py
+++ b/scripts/capture-file-sizes.py
@@ -28,42 +28,53 @@ def main():
         print(f"Data directory not found: {data_dir}", file=sys.stderr)
         sys.exit(1)
 
-    benchmark_dir = data_dir / args.benchmark
-    if not benchmark_dir.exists():
-        print(f"Benchmark directory not found: {benchmark_dir}", file=sys.stderr)
+    # Find benchmark directories matching the name (handles flavors like clickbench_partitioned)
+    # Also handles exact match (e.g., tpch)
+    benchmark_dirs = [
+        d
+        for d in data_dir.iterdir()
+        if d.is_dir() and (d.name == args.benchmark or d.name.startswith(f"{args.benchmark}_"))
+    ]
+
+    if not benchmark_dirs:
+        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
         sys.exit(1)
 
     # Formats to capture (vortex formats only, not parquet/duckdb)
-    formats_to_capture = {"vortex", "vortex-compact"}
+    # Note: "vortex" CLI arg maps to "vortex-file-compressed" directory name
+    formats_to_capture = {"vortex-file-compressed", "vortex-compact"}
 
     records = []
 
     # Walk subdirectories looking for format directories
-    for format_dir in benchmark_dir.iterdir():
-        if not format_dir.is_dir():
-            continue
-
-        format_name = format_dir.name
-        if format_name not in formats_to_capture:
-            continue
+    # Handle both direct format dirs (clickbench_partitioned/vortex-file-compressed/)
+    # and scale factor subdirs (tpch/1.0/vortex-file-compressed/)
+    for benchmark_dir in benchmark_dirs:
+        for format_dir in benchmark_dir.rglob("*"):
+            if not format_dir.is_dir():
+                continue
 
-        # Capture all files in this format directory
-        for file_path in format_dir.rglob("*"):
-            if not file_path.is_file():
+            format_name = format_dir.name
+            if format_name not in formats_to_capture:
                 continue
 
-            size_bytes = file_path.stat().st_size
-            relative_path = file_path.relative_to(format_dir)
-
-            records.append(
-                {
-                    "commit_id": args.commit,
-                    "benchmark": args.benchmark,
-                    "format": format_name,
-                    "file": str(relative_path),
-                    "size_bytes": size_bytes,
-                }
-            )
+            # Capture all files in this format directory
+            for file_path in format_dir.rglob("*"):
+                if not file_path.is_file():
+                    continue
+
+                size_bytes = file_path.stat().st_size
+                relative_path = file_path.relative_to(format_dir)
+
+                records.append(
+                    {
+                        "commit_id": args.commit,
+                        "benchmark": args.benchmark,
+                        "format": format_name,
+                        "file": str(relative_path),
+                        "size_bytes": size_bytes,
+                    }
+                )
 
     # Sort for deterministic output
     records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"]))

From 2912d41bec7792ff47998b0b13a419f532b284e4 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 11:48:04 -0400
Subject: [PATCH 4/6] avoid duplicate outputs for s3 benches

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 .github/workflows/sql-benchmarks.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
index 042ff53c3c4..9e7c6f86281 100644
--- a/.github/workflows/sql-benchmarks.yml
+++ b/.github/workflows/sql-benchmarks.yml
@@ -260,7 +260,7 @@ jobs:
           comment-tag: bench-pr-comment-${{ matrix.id }}
 
       - name: Compare file sizes
-        if: inputs.mode == 'pr'
+        if: inputs.mode == 'pr' && matrix.remote_storage == null
         shell: bash
         run: |
           set -Eeu -o pipefail -x
@@ -303,7 +303,7 @@ jobs:
           cat sizes-comment.md >> $GITHUB_STEP_SUMMARY
 
       - name: Comment PR with file sizes
-        if: inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
+        if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false
         uses: thollander/actions-comment-pull-request@v3
         with:
           file-path: sizes-comment.md
@@ -326,7 +326,7 @@ jobs:
           bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
 
       - name: Upload File Sizes
-        if: inputs.mode == 'develop'
+        if: inputs.mode == 'develop' && matrix.remote_storage == null
         shell: bash
         run: |
           uv run --no-project scripts/capture-file-sizes.py \

From 7fa0c26a792d77214e0202784b41c151d539f363 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 12:20:28 -0400
Subject: [PATCH 5/6] save scale_factor

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 scripts/capture-file-sizes.py |  8 +++++++-
 scripts/compare-file-sizes.py | 16 ++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py
index 8bdeca5e4b3..754df1ee702 100644
--- a/scripts/capture-file-sizes.py
+++ b/scripts/capture-file-sizes.py
@@ -58,6 +58,11 @@ def main():
             if format_name not in formats_to_capture:
                 continue
 
+            # Extract scale factor from path (e.g., "1.0" for tpch/1.0/vortex-file-compressed)
+            # Default to "1.0" if no intermediate directory (e.g., clickbench)
+            path_between = format_dir.relative_to(benchmark_dir).parent
+            scale_factor = str(path_between) if str(path_between) != "." else "1.0"
+
             # Capture all files in this format directory
             for file_path in format_dir.rglob("*"):
                 if not file_path.is_file():
@@ -70,6 +75,7 @@ def main():
                     {
                         "commit_id": args.commit,
                         "benchmark": args.benchmark,
+                        "scale_factor": scale_factor,
                         "format": format_name,
                         "file": str(relative_path),
                         "size_bytes": size_bytes,
@@ -77,7 +83,7 @@ def main():
                 )
 
     # Sort for deterministic output
-    records.sort(key=lambda r: (r["benchmark"], r["format"], r["file"]))
+    records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))
 
     # Write JSONL output
     with open(args.output, "w") as f:
diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py
index 1e7274682fa..f840576c097 100644
--- a/scripts/compare-file-sizes.py
+++ b/scripts/compare-file-sizes.py
@@ -51,7 +51,9 @@ def main():
         with open(args.base_file) as f:
             for line in f:
                 record = json.loads(line)
-                key = (record["benchmark"], record["format"], record["file"])
+                # Support old records without scale_factor (default to "1.0")
+                scale_factor = record.get("scale_factor", "1.0")
+                key = (record["benchmark"], scale_factor, record["format"], record["file"])
                 base_data[key] = record["size_bytes"]
     except FileNotFoundError:
         print("_Base file sizes not found._")
@@ -62,7 +64,8 @@ def main():
         with open(args.head_file) as f:
             for line in f:
                 record = json.loads(line)
-                key = (record["benchmark"], record["format"], record["file"])
+                scale_factor = record.get("scale_factor", "1.0")
+                key = (record["benchmark"], scale_factor, record["format"], record["file"])
                 head_data[key] = record["size_bytes"]
     except FileNotFoundError:
         print("_HEAD file sizes not found._")
@@ -74,7 +77,7 @@ def main():
 
     all_keys = set(base_data.keys()) | set(head_data.keys())
     for key in all_keys:
-        benchmark, fmt, file_name = key
+        benchmark, scale_factor, fmt, file_name = key
         base_size = base_data.get(key, 0)
         head_size = head_data.get(key, 0)
 
@@ -95,6 +98,7 @@ def main():
         comparisons.append(
             {
                 "file": file_name,
+                "scale_factor": scale_factor,
                 "format": fmt,
                 "base_size": base_size,
                 "head_size": head_size,
@@ -111,14 +115,14 @@ def main():
     comparisons.sort(key=lambda x: x["pct_change"], reverse=True)
 
     # Output markdown table
-    print("| File | Format | Base | HEAD | Change | % |")
-    print("|------|--------|------|------|--------|---|")
+    print("| File | Scale | Format | Base | HEAD | Change | % |")
+    print("|------|-------|--------|------|------|--------|---|")
 
     for comp in comparisons:
         pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new"
         base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-"
         print(
-            f"| {comp['file']} | {comp['format']} | {base_str} | "
+            f"| {comp['file']} | {comp['scale_factor']} | {comp['format']} | {base_str} | "
             f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |"
         )
 

From 0cf1a9e1f40b7e88d8e2cbc1fb9db48146aa8ba8 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Thu, 2 Apr 2026 12:58:46 -0400
Subject: [PATCH 6/6] fix statpopgen

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 vortex-bench/src/statpopgen/statpopgen_benchmark.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
index 941859dc90b..dc5fa6448ad 100644
--- a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
+++ b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
@@ -52,7 +52,7 @@ impl StatPopGenBenchmark {
             )
         })?;
 
-        let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
+        let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/"));
 
         let data_url =
             Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;