Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ jobs:
${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}

- name: Install uv
if: inputs.mode == 'pr'
uses: spiraldb/actions/.github/actions/setup-uv@0.18.5
with:
sync: false
Expand Down Expand Up @@ -260,6 +259,56 @@ jobs:
# unique benchmark configuration must have a unique comment-tag.
comment-tag: bench-pr-comment-${{ matrix.id }}

- name: Compare file sizes
if: inputs.mode == 'pr' && matrix.remote_storage == null
shell: bash
run: |
set -Eeu -o pipefail -x

# Capture HEAD file sizes (vortex formats only)
uv run --no-project scripts/capture-file-sizes.py \
vortex-bench/data \
--benchmark ${{ matrix.subcommand }} \
--commit ${{ github.event.pull_request.head.sha }} \
-o head-sizes.json

# Get base commit SHA (same as benchmark comparison)
base_commit_sha=$(\
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
| jq -r '.workflow_runs[].head_sha' \
)

# Download file sizes baseline
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes.json.gz file-sizes.json.gz --no-sign-request || true

# Generate comparison report
echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md
echo '' >> sizes-comment.md

if [ -f file-sizes.json.gz ]; then
gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true
if [ -s base-sizes.json ]; then
uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
>> sizes-comment.md
else
echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
fi
else
echo '_No baseline file sizes available yet._' >> sizes-comment.md
fi

cat sizes-comment.md >> $GITHUB_STEP_SUMMARY

- name: Comment PR with file sizes
if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false
uses: thollander/actions-comment-pull-request@v3
with:
file-path: sizes-comment.md
comment-tag: file-sizes-${{ matrix.id }}

- name: Comment PR on failure
if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
uses: thollander/actions-comment-pull-request@v3
Expand All @@ -276,6 +325,17 @@ jobs:
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json

- name: Upload File Sizes
if: inputs.mode == 'develop' && matrix.remote_storage == null
shell: bash
run: |
uv run --no-project scripts/capture-file-sizes.py \
vortex-bench/data \
--benchmark ${{ matrix.subcommand }} \
--commit ${{ github.sha }} \
-o sizes.json
bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes.json.gz sizes.json

- name: Alert incident.io
if: failure() && inputs.mode == 'develop'
uses: ./.github/actions/alert-incident-io
Expand Down
97 changes: 97 additions & 0 deletions scripts/capture-file-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Capture file sizes from benchmark data directories and output as JSONL."""

import argparse
import json
import sys
from pathlib import Path


def main():
    """Scan benchmark data directories and emit one JSONL size record per file.

    Walks ``data_dir`` for directories belonging to the requested benchmark,
    collects the size of every file under the tracked vortex format
    directories, and writes sorted JSONL records to the output path.
    """
    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
    parser.add_argument("--commit", required=True, help="Commit SHA")
    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
    args = parser.parse_args()

    root = Path(args.data_dir)
    if not root.exists():
        print(f"Data directory not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Match both the bare benchmark name (e.g. "tpch") and flavored variants
    # such as "clickbench_partitioned".
    matches = [
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry.name == args.benchmark or entry.name.startswith(f"{args.benchmark}_"))
    ]

    if not matches:
        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
        sys.exit(1)

    # Only vortex formats are tracked (not parquet/duckdb).
    # Note: the "vortex" CLI arg maps to the "vortex-file-compressed" directory.
    wanted = {"vortex-file-compressed", "vortex-compact"}

    rows = []

    # Walk each benchmark directory for format directories. Layouts handled:
    #   direct:        clickbench_partitioned/vortex-file-compressed/
    #   scale-factor:  tpch/1.0/vortex-file-compressed/
    for bench_dir in matches:
        for candidate in bench_dir.rglob("*"):
            if not candidate.is_dir() or candidate.name not in wanted:
                continue

            # Any intermediate directory between benchmark and format encodes
            # the scale factor (e.g. "1.0"); direct layouts default to "1.0".
            between = str(candidate.relative_to(bench_dir).parent)
            scale = "1.0" if between == "." else between

            for item in candidate.rglob("*"):
                if not item.is_file():
                    continue
                rows.append(
                    {
                        "commit_id": args.commit,
                        "benchmark": args.benchmark,
                        "scale_factor": scale,
                        "format": candidate.name,
                        "file": str(item.relative_to(candidate)),
                        "size_bytes": item.stat().st_size,
                    }
                )

    # Deterministic ordering keeps output stable across runs.
    rows.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))

    with open(args.output, "w") as out:
        out.writelines(json.dumps(r) + "\n" for r in rows)

    print(f"Captured {len(rows)} file sizes to {args.output}", file=sys.stderr)


# Script entry point (also runnable via `uv run` per the inline metadata above).
if __name__ == "__main__":
    main()
145 changes: 145 additions & 0 deletions scripts/compare-file-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Compare file sizes between base and HEAD and generate markdown report."""

import argparse
import json
import sys
from collections import defaultdict


def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable size string.

    Uses binary units (1 KB = 1024 B) with two decimal places; values under
    1 KB are printed as plain bytes.
    """
    # Try the largest unit first; fall through to plain bytes.
    for exponent, unit in ((3, "GB"), (2, "MB"), (1, "KB")):
        threshold = 1024**exponent
        if size_bytes >= threshold:
            return f"{size_bytes / threshold:.2f} {unit}"
    return f"{size_bytes} B"


def format_change(change_bytes: int) -> str:
    """Format a byte delta with an explicit sign (e.g. "+1.50 KB", "-100 B").

    The magnitude is rendered via :func:`format_size` on the absolute value,
    so the sign must be prepended explicitly. Bug fix: negative deltas
    previously printed with no sign at all (``abs()`` dropped the minus and
    only positive values got a "+"), making shrinking files look identical
    to growing ones in the report.
    """
    if change_bytes > 0:
        sign = "+"
    elif change_bytes < 0:
        sign = "-"
    else:
        sign = ""
    return f"{sign}{format_size(abs(change_bytes))}"


def format_pct_change(pct: float) -> str:
    """Render a percentage delta with one decimal place and a leading sign.

    Negative values already carry "-" from float formatting, so only a
    strictly positive delta needs an explicit "+" prefix.
    """
    prefix = "+" if pct > 0 else ""
    return prefix + f"{pct:.1f}%"


def main():
    """Compare two JSONL size dumps and print a markdown report to stdout.

    Reads base and HEAD JSONL files (one record per file), prints a table of
    every size change sorted by percentage growth, then per-format totals.
    Exits cleanly with a placeholder message if either input is missing.
    """
    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
    parser.add_argument("base_file", help="Base JSONL file")
    parser.add_argument("head_file", help="HEAD JSONL file")
    args = parser.parse_args()

    def load_sizes(path):
        # Map (benchmark, scale_factor, format, file) -> size_bytes.
        # Older records may lack scale_factor; treat those as "1.0".
        sizes = {}
        with open(path) as fh:
            for raw in fh:
                rec = json.loads(raw)
                key = (rec["benchmark"], rec.get("scale_factor", "1.0"), rec["format"], rec["file"])
                sizes[key] = rec["size_bytes"]
        return sizes

    try:
        base_sizes = load_sizes(args.base_file)
    except FileNotFoundError:
        print("_Base file sizes not found._")
        sys.exit(0)

    try:
        head_sizes = load_sizes(args.head_file)
    except FileNotFoundError:
        print("_HEAD file sizes not found._")
        sys.exit(0)

    per_format = defaultdict(lambda: {"base": 0, "head": 0})
    deltas = []

    # Union of keys so that both added and removed files are accounted for;
    # a missing side counts as 0 bytes.
    for key in set(base_sizes) | set(head_sizes):
        _benchmark, scale, fmt, fname = key
        old_size = base_sizes.get(key, 0)
        new_size = head_sizes.get(key, 0)

        # Totals include unchanged files; the per-file table does not.
        per_format[fmt]["base"] += old_size
        per_format[fmt]["head"] += new_size

        diff = new_size - old_size
        if diff == 0:
            continue

        if old_size > 0:
            pct = (new_size / old_size - 1) * 100
        elif new_size > 0:
            # Brand-new file: no base to compare against.
            pct = float("inf")
        else:
            pct = 0

        deltas.append(
            {
                "file": fname,
                "scale_factor": scale,
                "format": fmt,
                "base_size": old_size,
                "head_size": new_size,
                "change": diff,
                "pct_change": pct,
            }
        )

    if not deltas:
        print("_No file size changes detected._")
        return

    # Largest relative growth first (new files sort to the top via +inf).
    deltas.sort(key=lambda d: d["pct_change"], reverse=True)

    print("| File | Scale | Format | Base | HEAD | Change | % |")
    print("|------|-------|--------|------|------|--------|---|")

    for d in deltas:
        pct_cell = "new" if d["pct_change"] == float("inf") else format_pct_change(d["pct_change"])
        base_cell = format_size(d["base_size"]) if d["base_size"] > 0 else "-"
        print(
            f"| {d['file']} | {d['scale_factor']} | {d['format']} | {base_cell} | "
            f"{format_size(d['head_size'])} | {format_change(d['change'])} | {pct_cell} |"
        )

    print("")
    print("**Totals:**")
    for fmt in sorted(per_format):
        base_total = per_format[fmt]["base"]
        head_total = per_format[fmt]["head"]
        if base_total > 0:
            suffix = f" ({format_pct_change((head_total / base_total - 1) * 100)})"
        else:
            suffix = ""
        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{suffix}")


# Script entry point (also runnable via `uv run` per the inline metadata above).
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion vortex-bench/src/statpopgen/statpopgen_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ impl StatPopGenBenchmark {
)
})?;

let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/"));

let data_url =
Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;
Expand Down
Loading