diff --git a/Makefile b/Makefile index 86f3611a..cc1bb81f 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test test-unit test-integration install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch +.PHONY: all format test test-unit test-integration install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch benchmarking-install-python benchmarking-install-r benchmarking-export benchmarking-run-l0 benchmarking-run-greg benchmarking-run-ipf SOI_SOURCE_YEAR ?= 2021 SOI_TARGET_YEAR ?= 2023 @@ -13,6 +13,8 @@ BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD) NUM_WORKERS ?= 8 N_CLONES ?= 430 VERSION ?= +MANIFEST ?= +RUN_DIR ?= SOI_SOURCE_YEAR ?= 2021 SOI_TARGET_YEAR ?= 2023 @@ -37,6 +39,32 @@ install: pip install policyengine-us pip install -e ".[dev]" --config-settings editable_mode=compat +benchmarking-install-python: + pip install -r paper-l0/benchmarking/requirements-python.txt + +benchmarking-install-r: + Rscript paper-l0/benchmarking/install_r_packages.R + +benchmarking-export: + python 
paper-l0/benchmarking/benchmark_cli.py export \ + --manifest $(MANIFEST) \ + --output-dir $(RUN_DIR) + +benchmarking-run-l0: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method l0 \ + --run-dir $(RUN_DIR) + +benchmarking-run-greg: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method greg \ + --run-dir $(RUN_DIR) + +benchmarking-run-ipf: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method ipf \ + --run-dir $(RUN_DIR) + changelog: python .github/bump_version.py towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))") diff --git a/paper-l0/.gitignore b/paper-l0/.gitignore new file mode 100644 index 00000000..02d9b69b --- /dev/null +++ b/paper-l0/.gitignore @@ -0,0 +1,35 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb +*.nav +*.snm +*.vrb + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync diff --git a/paper-l0/benchmarking/README.md b/paper-l0/benchmarking/README.md new file mode 100644 index 00000000..8d68b360 --- /dev/null +++ b/paper-l0/benchmarking/README.md @@ -0,0 +1,240 @@ +# Benchmarking Scaffold + +This directory contains the implementation scaffold for benchmarking the +`L0` calibration pipeline against: + +- `GREG` via R's `survey` package +- `IPF` via R's `surveysd` package + +## Experimental Setup + +The benchmark is organized around one shared exported bundle and multiple +method adapters. + +- `L0` and `GREG` are compared on the shared calibration representation: + a sparse target-by-unit matrix, the selected target table, and + initial .npy weights. 
+- `IPF` is benchmarked from the same target selection, but it requires a + conversion step because `surveysd::ipf` consumes a microdata table plus + IPF constraints rather than a generic sparse linear system. +- The intended benchmark tiers are: + - a practical reduced-size comparison tier, used for like-for-like `L0` + versus `GREG` runs that are small enough to execute routinely during + development + - an IPF-focused reduced-size tier on count-style targets, used because + classical `IPF` is most naturally evaluated on count or indicator margins + rather than the full arbitrary target set + - a scaling ladder over increasing target counts, used to show how runtime, + memory use, convergence, and outright failure change as the benchmark moves + from small target subsets toward the full calibration problem + - a production-feasibility tier, used to test which methods can still run at + something close to the full production clone count and target volume + +Methodologically, the benchmark treats the methods as related but not +identical: + +- `L0` and `GREG` can consume arbitrary linear calibration targets. +- `IPF` is most natural for count-style or indicator-style targets, so the + current automatic conversion path supports `person_count` and + `household_count`. + +The core workflow is: + +1. select a benchmark target subset with a manifest +2. export a shared benchmark bundle from a saved calibration package +3. auto-convert the bundle to IPF inputs when needed +4. run `L0`, `GREG`, or `IPF` +5. score all fitted weights against the same shared target matrix + +## Layout + +- `benchmark_cli.py` + Main CLI for exporting benchmark bundles and running methods. +- `benchmark_manifest.py` + Manifest schema and target-filter logic. +- `benchmark_export.py` + Export utilities for shared benchmark artifacts. +- `ipf_conversion.py` + Automatic conversion from the saved calibration package to IPF-ready + unit and target metadata. 
+- `benchmark_metrics.py` + Common diagnostics and summary generation. +- `runners/greg_runner.R` + R backend for `survey`-based GREG. +- `runners/ipf_runner.R` + R backend for `surveysd`-based IPF. +- `runners/read_npy.R` + Minimal `.npy` reader used by the R scripts. +- `requirements-python.txt` + Python dependencies for the benchmarking scaffold. +- `install_r_packages.R` + Installs the required R packages for the benchmark runners. +- `manifests/*.example.json` + Example benchmark manifests. + +## Environment Setup + +Python: + +```bash +pip install -r paper-l0/benchmarking/requirements-python.txt +``` + +R: + +```bash +Rscript paper-l0/benchmarking/install_r_packages.R +``` + +Or, from the repo root: + +```bash +make benchmarking-install-python +make benchmarking-install-r +``` + +## Chosen Interchange Formats + +- sparse matrix: Matrix Market `.mtx` +- target metadata: `.csv` +- unit metadata: `.csv` +- initial weights: `.npy` +- benchmark manifest: `.json` +- method result summary: `.json` +- fitted weights: `.npy` + +## Notes + +### Shared calibration package + +The exporter reads the saved calibration package directly from pickle rather +than importing the full calibration CLI. This keeps the benchmark I/O path +lightweight. + +### IPF inputs + +The exporter now auto-generates IPF inputs when the manifest includes `ipf` +and no external overrides are supplied. It reconstructs an IPF microdata table +from: + +- the saved calibration package +- the package metadata's `dataset_path` +- the package metadata's `db_path` +- the selected count-like targets and their stratum constraints + +The generated `unit_metadata.csv` is currently built for `person_count` and +`household_count` targets. It expands cloned households to a person-level table +when person targets are present, carries a repeated household `unit_index`, and +adds one derived indicator column per selected target. 
The generated +`ipf_target_metadata.csv` then references those indicator columns as numerical +IPF totals. + +External CSVs are still supported through `external_inputs.*` and override the +automatic conversion path when provided. + +### IPF conversion step by step + +The IPF conversion is implemented in +[ipf_conversion.py](/Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/paper-l0/benchmarking/ipf_conversion.py) +and runs during `benchmark_cli.py export`. + +1. Load the saved calibration package and apply the manifest target filters. +2. Read `dataset_path`, `db_path`, and `n_clones` from the package metadata. +3. Query `stratum_constraints` for the selected targets from the target DB. +4. Identify the source variables needed to evaluate those constraints, such as + `age`, `snap`, or `medicaid_enrolled`. +5. Reconstruct the cloned household universe from `initial_weights`, + `block_geoid`, and `cd_geoid`. This yields one benchmark unit per matrix + column. +6. If any selected IPF target is `person_count`, expand that cloned household + universe to a person-level table using the source dataset's person-to- + household links. Multiple person rows may therefore share the same + household-clone `unit_index`. +7. Calculate the needed source variables from the dataset and attach them to + the IPF unit table. +8. For each selected target, evaluate its original stratum logic row by row and + materialize the result as a derived indicator column such as + `ipf_indicator_00000`. +9. Write `ipf_target_metadata.csv` so each selected target becomes a + `numeric_total` IPF constraint over one of those derived indicator columns. +10. Run `surveysd::ipf` on the generated unit table and target metadata. +11. Collapse the fitted IPF row weights back to one weight per shared benchmark + `unit_index`, so the fitted result can be scored against the same sparse + calibration matrix used by `L0` and `GREG`. 
+ +This means the benchmark uses one common scoring space even though `IPF` +requires a richer input representation than `L0` and `GREG`. + +### Why the IPF conversion exists + +`L0` and `GREG` can work directly with a sparse linear system of the form +`X w = t`. + +Classical `IPF` does not start from that object. It expects: + +- a unit-record table +- categorical or indicator variables on that table +- target totals over those variables + +So the benchmark exporter translates selected count-style calibration targets +into that IPF-friendly representation instead of trying to feed the sparse +matrix directly into `surveysd::ipf`. + +### IPF target metadata schema + +`ipf_runner.R` supports two target metadata encodings: + +- `numeric_total` + One row per target with: + - `scope`: `person` or `household` + - `target_type`: `numeric_total` + - `value_column`: unit-data column to calibrate + - `variables`: grouping variables used to wrap the numeric total in a one-cell + or multi-cell array + - `cell`: pipe-separated assignments for the target cell + - `target_value`: numeric total +- `categorical_margin` + One row per margin cell with: + - `scope`: `person` or `household` + - `target_type`: `categorical_margin` + - `margin_id`: identifier for a margin table + - `variables`: pipe-separated variable names, e.g. `district_id|age_bin` + - `cell`: pipe-separated assignments, e.g. + `district_id=0601|age_bin=18_24` + - `target_value`: numeric target + +The automatic conversion path currently emits `numeric_total` rows. 
+ +## Example Commands + +Export a benchmark bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py export \ + --manifest paper-l0/benchmarking/manifests/greg_demo_small.example.json \ + --output-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Run a GREG benchmark from an exported bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py run \ + --method greg \ + --run-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Run `L0` on an exported bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py run \ + --method l0 \ + --run-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Equivalent root Make targets: + +```bash +make benchmarking-export MANIFEST=paper-l0/benchmarking/manifests/greg_demo_small.example.json RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +make benchmarking-run-greg RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +make benchmarking-run-l0 RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +``` diff --git a/paper-l0/benchmarking/benchmark_cli.py b/paper-l0/benchmarking/benchmark_cli.py new file mode 100644 index 00000000..79f59050 --- /dev/null +++ b/paper-l0/benchmarking/benchmark_cli.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from benchmark_export import export_bundle +from benchmark_manifest import load_manifest +from benchmark_metrics import ( + compute_common_metrics, + load_targets_csv, + write_method_summary, +) + + +ROOT = Path(__file__).resolve().parent +RUNNERS_DIR = ROOT / "runners" + + +def _run_subprocess(cmd, cwd=None): + started = time.time() + proc = subprocess.run(cmd, cwd=cwd, check=False) + elapsed = time.time() - started + return proc, elapsed + + +def cmd_export(args): + manifest = load_manifest(args.manifest) + output_dir, info = export_bundle(manifest=manifest, output_dir=args.output_dir) + 
print(json.dumps({"output_dir": str(output_dir), **info}, indent=2, sort_keys=True)) + return 0 + + +def _run_l0(run_dir: Path): + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + + from scipy.io import mmread + from policyengine_us_data.calibration.unified_calibration import fit_l0_weights + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + + options = manifest.get("method_options", {}).get("l0", {}) + X_sparse = mmread(str(inputs / "X_targets_by_units.mtx")).tocsr() + targets_df = pd.read_csv(inputs / "target_metadata.csv") + initial_weights = np.load(inputs / "initial_weights.npy") + + weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets_df["value"].to_numpy(dtype=np.float64), + lambda_l0=float(options.get("lambda_l0", 1e-8)), + epochs=int(options.get("epochs", 1000)), + device=str(options.get("device", "cpu")), + beta=float(options.get("beta", 0.65)), + lambda_l2=float(options.get("lambda_l2", 1e-12)), + learning_rate=float(options.get("learning_rate", 0.15)), + target_names=targets_df["target_name"].tolist(), + initial_weights=initial_weights, + targets_df=targets_df, + ) + + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, weights.astype(np.float64)) + return weights_path + + +def _run_greg(run_dir: Path): + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + temp_csv = outputs / "_greg_weights.csv" + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + options = manifest.get("method_options", {}).get("greg", {}) + + cmd = [ + "Rscript", + str(RUNNERS_DIR / "greg_runner.R"), + str(inputs / "X_targets_by_units.mtx"), + str(inputs / "target_metadata.csv"), + str(inputs / "initial_weights.npy"), + str(temp_csv), + str(int(options.get("maxit", 50))), + str(float(options.get("epsilon", 1e-7))), + ] + proc, elapsed = _run_subprocess(cmd) + if proc.returncode != 0: + raise RuntimeError(f"GREG runner failed with exit code {proc.returncode}") + + 
weights = pd.read_csv(temp_csv)["fitted_weight"].to_numpy(dtype=np.float64) + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, weights) + temp_csv.unlink(missing_ok=True) + return weights_path, elapsed + + +def _run_ipf(run_dir: Path): + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + temp_csv = outputs / "_ipf_weights.csv" + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + options = manifest.get("method_options", {}).get("ipf", {}) + + target_metadata_path = inputs / "ipf_target_metadata.csv" + if not target_metadata_path.exists(): + raise FileNotFoundError( + "IPF run requires inputs/ipf_target_metadata.csv. " + "Provide external_inputs.ipf_target_metadata_csv in the manifest." + ) + + cmd = [ + "Rscript", + str(RUNNERS_DIR / "ipf_runner.R"), + str(inputs / "unit_metadata.csv"), + str(target_metadata_path), + str(inputs / "initial_weights.npy"), + str(temp_csv), + str(int(options.get("max_iter", 200))), + str(float(options.get("bound", 4.0))), + str(float(options.get("epsP", 1e-6))), + str(float(options.get("epsH", 1e-2))), + str(options.get("household_id_col", "household_id")), + str(options.get("weight_col", "base_weight")), + ] + proc, elapsed = _run_subprocess(cmd) + if proc.returncode != 0: + raise RuntimeError(f"IPF runner failed with exit code {proc.returncode}") + + raw_weights = pd.read_csv(temp_csv) + if "unit_index" not in raw_weights.columns: + raise RuntimeError("IPF runner output must include a unit_index column") + if raw_weights["unit_index"].isna().any(): + raise RuntimeError("IPF runner output contains missing unit_index values") + + raw_weights["unit_index"] = raw_weights["unit_index"].astype(np.int64) + n_units = len(np.load(inputs / "initial_weights.npy")) + if (raw_weights["unit_index"] < 0).any() or ( + raw_weights["unit_index"] >= n_units + ).any(): + raise RuntimeError("IPF runner output contains out-of-range unit_index values") + + per_unit_spread = 
raw_weights.groupby("unit_index", sort=True)["fitted_weight"].agg( + lambda series: float(series.max() - series.min()) + ) + inconsistent_units = per_unit_spread[per_unit_spread > 1e-9] + if not inconsistent_units.empty: + raise RuntimeError( + "IPF runner produced inconsistent fitted weights within the same unit_index" + ) + + weights_by_unit = ( + raw_weights.groupby("unit_index", sort=True)["fitted_weight"] + .first() + .reindex(np.arange(n_units, dtype=np.int64)) + ) + if weights_by_unit.isna().any(): + raise RuntimeError( + "Aggregated IPF weights do not cover the full benchmark unit range" + ) + weights = weights_by_unit.to_numpy(dtype=np.float64) + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, weights) + temp_csv.unlink(missing_ok=True) + return weights_path, elapsed + + +def cmd_run(args): + run_dir = Path(args.run_dir) + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + outputs.mkdir(parents=True, exist_ok=True) + targets_df = load_targets_csv(inputs / "target_metadata.csv") + + started = time.time() + if args.method == "l0": + weights_path = _run_l0(run_dir) + elif args.method == "greg": + weights_path, _ = _run_greg(run_dir) + elif args.method == "ipf": + weights_path, _ = _run_ipf(run_dir) + else: + raise ValueError(f"Unsupported method: {args.method}") + elapsed = time.time() - started + + weights = np.load(weights_path) + summary = compute_common_metrics( + weights=weights, + targets_df=targets_df, + matrix_path=inputs / "X_targets_by_units.mtx", + ) + summary["method"] = args.method + summary["run_dir"] = str(run_dir.resolve()) + summary["runtime_seconds"] = elapsed + write_method_summary(summary, outputs / f"{args.method}_summary.json") + print(json.dumps(summary, indent=2, sort_keys=True)) + return 0 + + +def build_parser(): + parser = argparse.ArgumentParser(description="Benchmark scaffold CLI") + subparsers = parser.add_subparsers(dest="command", required=True) + + export_parser = 
subparsers.add_parser("export", help="Export a benchmark bundle") + export_parser.add_argument( + "--manifest", required=True, help="Path to benchmark manifest JSON" + ) + export_parser.add_argument( + "--output-dir", required=True, help="Output bundle directory" + ) + export_parser.set_defaults(func=cmd_export) + + run_parser = subparsers.add_parser( + "run", help="Run one method on an exported bundle" + ) + run_parser.add_argument("--method", required=True, choices=["l0", "greg", "ipf"]) + run_parser.add_argument( + "--run-dir", required=True, help="Exported benchmark bundle directory" + ) + run_parser.set_defaults(func=cmd_run) + + return parser + + +def main(argv=None): + parser = build_parser() + args = parser.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/paper-l0/benchmarking/benchmark_export.py b/paper-l0/benchmarking/benchmark_export.py new file mode 100644 index 00000000..ff81da2f --- /dev/null +++ b/paper-l0/benchmarking/benchmark_export.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import json +import pickle +import shutil +from dataclasses import asdict +from pathlib import Path +from typing import Dict, Tuple + +import numpy as np +import pandas as pd +from scipy.io import mmwrite + +from benchmark_manifest import BenchmarkManifest, filter_targets +from ipf_conversion import build_ipf_inputs + + +def load_calibration_package_raw(path: str | Path) -> Dict: + with open(path, "rb") as f: + return pickle.load(f) + + +def build_shared_unit_metadata(package: Dict) -> pd.DataFrame: + initial_weights = package.get("initial_weights") + n_units = ( + int(initial_weights.shape[0]) + if initial_weights is not None + else int(package["X_sparse"].shape[1]) + ) + data = {"unit_index": np.arange(n_units, dtype=np.int64)} + + if initial_weights is not None: + data["base_weight"] = np.asarray(initial_weights, dtype=np.float64) + + if package.get("cd_geoid") is not None: + data["cd_geoid"] 
= np.asarray(package["cd_geoid"]).astype(str) + + if package.get("block_geoid") is not None: + data["block_geoid"] = np.asarray(package["block_geoid"]).astype(str) + + return pd.DataFrame(data) + + +def export_bundle( + manifest: BenchmarkManifest, output_dir: str | Path +) -> Tuple[Path, Dict]: + output_dir = Path(output_dir) + inputs_dir = output_dir / "inputs" + outputs_dir = output_dir / "outputs" + inputs_dir.mkdir(parents=True, exist_ok=True) + outputs_dir.mkdir(parents=True, exist_ok=True) + + package = load_calibration_package_raw(manifest.package_path) + targets_df = package["targets_df"].copy() + target_names = list(package["target_names"]) + X_sparse = package["X_sparse"] + + filtered_targets, filtered_names, filtered_matrix, kept_indices = filter_targets( + targets_df=targets_df, + target_names=target_names, + X_sparse=X_sparse, + filters=manifest.target_filters, + ) + + filtered_targets.to_csv(inputs_dir / "target_metadata.csv", index=False) + mmwrite(str(inputs_dir / "X_targets_by_units.mtx"), filtered_matrix) + + initial_weights = np.asarray(package["initial_weights"], dtype=np.float64) + np.save(inputs_dir / "initial_weights.npy", initial_weights) + + unit_metadata = build_shared_unit_metadata(package) + has_external_ipf_inputs = bool( + manifest.external_inputs.ipf_unit_metadata_csv + or manifest.external_inputs.ipf_target_metadata_csv + ) + has_partial_external_ipf_inputs = bool( + manifest.external_inputs.ipf_unit_metadata_csv + ) != bool(manifest.external_inputs.ipf_target_metadata_csv) + if has_partial_external_ipf_inputs: + raise ValueError( + "IPF external input overrides must provide both " + "ipf_unit_metadata_csv and ipf_target_metadata_csv" + ) + + if manifest.external_inputs.ipf_unit_metadata_csv: + shutil.copyfile( + manifest.external_inputs.ipf_unit_metadata_csv, + inputs_dir / "unit_metadata.csv", + ) + elif "ipf" in manifest.methods and not has_external_ipf_inputs: + ipf_unit_metadata, ipf_target_metadata = build_ipf_inputs( + 
package=package, + manifest=manifest, + filtered_targets=filtered_targets, + ) + ipf_unit_metadata.to_csv(inputs_dir / "unit_metadata.csv", index=False) + ipf_target_metadata.to_csv(inputs_dir / "ipf_target_metadata.csv", index=False) + else: + unit_metadata.to_csv(inputs_dir / "unit_metadata.csv", index=False) + + if manifest.external_inputs.ipf_target_metadata_csv: + shutil.copyfile( + manifest.external_inputs.ipf_target_metadata_csv, + inputs_dir / "ipf_target_metadata.csv", + ) + + runtime_manifest = manifest.to_dict() + runtime_manifest["resolved"] = { + "output_dir": str(output_dir.resolve()), + "inputs_dir": str(inputs_dir.resolve()), + "outputs_dir": str(outputs_dir.resolve()), + "n_targets": int(filtered_matrix.shape[0]), + "n_units": int(filtered_matrix.shape[1]), + "kept_target_indices": [int(i) for i in kept_indices.tolist()], + "target_names": filtered_names, + "package_metadata": package.get("metadata", {}), + } + + with open(inputs_dir / "benchmark_manifest.json", "w") as f: + json.dump(runtime_manifest, f, indent=2, sort_keys=True) + + export_info = { + "n_targets": int(filtered_matrix.shape[0]), + "n_units": int(filtered_matrix.shape[1]), + "inputs_dir": str(inputs_dir), + "outputs_dir": str(outputs_dir), + } + return output_dir, export_info diff --git a/paper-l0/benchmarking/benchmark_manifest.py b/paper-l0/benchmarking/benchmark_manifest.py new file mode 100644 index 00000000..4fad528a --- /dev/null +++ b/paper-l0/benchmarking/benchmark_manifest.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +import numpy as np +import pandas as pd + + +COUNT_LIKE_VARIABLES = { + "person_count", + "household_count", + "tax_unit_count", + "spm_unit_count", + "family_count", + "marital_unit_count", +} + + +def _normalize_string_list(values: Optional[List[str]]) -> Optional[List[str]]: + if values is None: + return None 
+ return [str(v) for v in values] + + +@dataclass +class TargetFilters: + include_geo_levels: Optional[List[str]] = None + include_national: bool = True + state_ids: Optional[List[str]] = None + district_ids: Optional[List[str]] = None + variables: Optional[List[str]] = None + domain_variables: Optional[List[str]] = None + count_like_only: bool = False + max_targets: Optional[int] = None + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "TargetFilters": + raw = raw or {} + return cls( + include_geo_levels=_normalize_string_list(raw.get("include_geo_levels")), + include_national=bool(raw.get("include_national", True)), + state_ids=_normalize_string_list(raw.get("state_ids")), + district_ids=_normalize_string_list(raw.get("district_ids")), + variables=_normalize_string_list(raw.get("variables")), + domain_variables=_normalize_string_list(raw.get("domain_variables")), + count_like_only=bool(raw.get("count_like_only", False)), + max_targets=raw.get("max_targets"), + ) + + +@dataclass +class ExternalInputs: + ipf_unit_metadata_csv: Optional[str] = None + ipf_target_metadata_csv: Optional[str] = None + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "ExternalInputs": + raw = raw or {} + return cls( + ipf_unit_metadata_csv=raw.get("ipf_unit_metadata_csv"), + ipf_target_metadata_csv=raw.get("ipf_target_metadata_csv"), + ) + + +@dataclass +class MethodOptions: + l0: Dict[str, Any] = field(default_factory=dict) + greg: Dict[str, Any] = field(default_factory=dict) + ipf: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "MethodOptions": + raw = raw or {} + return cls( + l0=dict(raw.get("l0", {})), + greg=dict(raw.get("greg", {})), + ipf=dict(raw.get("ipf", {})), + ) + + +@dataclass +class BenchmarkManifest: + name: str + tier: str + description: str + package_path: str + methods: List[str] + target_filters: TargetFilters = field(default_factory=TargetFilters) + 
external_inputs: ExternalInputs = field(default_factory=ExternalInputs) + method_options: MethodOptions = field(default_factory=MethodOptions) + + @classmethod + def from_dict(cls, raw: Dict[str, Any]) -> "BenchmarkManifest": + return cls( + name=str(raw["name"]), + tier=str(raw["tier"]), + description=str(raw.get("description", "")), + package_path=str(raw["package_path"]), + methods=[str(m) for m in raw.get("methods", [])], + target_filters=TargetFilters.from_dict(raw.get("target_filters")), + external_inputs=ExternalInputs.from_dict(raw.get("external_inputs")), + method_options=MethodOptions.from_dict(raw.get("method_options")), + ) + + def to_dict(self) -> Dict[str, Any]: + payload = asdict(self) + payload["package_path"] = str(self.package_path) + return payload + + +def load_manifest(path: str | Path) -> BenchmarkManifest: + with open(path) as f: + return BenchmarkManifest.from_dict(json.load(f)) + + +def save_manifest(manifest: BenchmarkManifest, path: str | Path) -> None: + with open(path, "w") as f: + json.dump(manifest.to_dict(), f, indent=2, sort_keys=True) + + +def is_count_like_variable(variable: str) -> bool: + return variable in COUNT_LIKE_VARIABLES or variable.endswith("_count") + + +def _build_geo_mask(targets_df: pd.DataFrame, filters: TargetFilters) -> np.ndarray: + geo_level = targets_df["geo_level"].astype(str) + geographic_id = targets_df["geographic_id"].astype(str) + mask = np.ones(len(targets_df), dtype=bool) + + if filters.include_geo_levels: + mask &= geo_level.isin(filters.include_geo_levels).to_numpy() + + geo_keep = np.zeros(len(targets_df), dtype=bool) + national_mask = geo_level.eq("national").to_numpy() + state_mask = geo_level.eq("state").to_numpy() + district_mask = geo_level.eq("district").to_numpy() + + if filters.include_national: + geo_keep |= national_mask + + if filters.state_ids: + geo_keep |= state_mask & geographic_id.isin(filters.state_ids).to_numpy() + else: + geo_keep |= state_mask + + if filters.district_ids: + 
geo_keep |= district_mask & geographic_id.isin(filters.district_ids).to_numpy() + else: + geo_keep |= district_mask + + other_mask = ~(national_mask | state_mask | district_mask) + geo_keep |= other_mask + return mask & geo_keep + + +def filter_targets( + targets_df: pd.DataFrame, + target_names: List[str], + X_sparse, + filters: TargetFilters, +): + mask = _build_geo_mask(targets_df, filters) + + if filters.variables: + mask &= targets_df["variable"].astype(str).isin(filters.variables).to_numpy() + + if filters.domain_variables: + domain_series = targets_df.get( + "domain_variable", pd.Series("", index=targets_df.index) + ) + mask &= ( + domain_series.fillna("") + .astype(str) + .isin(filters.domain_variables) + .to_numpy() + ) + + if filters.count_like_only: + mask &= ( + targets_df["variable"].astype(str).map(is_count_like_variable).to_numpy() + ) + + indices = np.where(mask)[0] + if filters.max_targets is not None: + indices = indices[: int(filters.max_targets)] + + filtered_targets = targets_df.iloc[indices].reset_index(drop=True).copy() + filtered_targets["target_name"] = [target_names[i] for i in indices] + filtered_names = [target_names[i] for i in indices] + filtered_matrix = X_sparse[indices, :] + return filtered_targets, filtered_names, filtered_matrix, indices diff --git a/paper-l0/benchmarking/benchmark_metrics.py b/paper-l0/benchmarking/benchmark_metrics.py new file mode 100644 index 00000000..34111720 --- /dev/null +++ b/paper-l0/benchmarking/benchmark_metrics.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, Iterable + +import numpy as np +import pandas as pd +from scipy.io import mmread + + +def load_targets_csv(path: str | Path) -> pd.DataFrame: + return pd.read_csv(path) + + +def compute_common_metrics( + weights: np.ndarray, + targets_df: pd.DataFrame, + matrix_path: str | Path, +) -> Dict: + X_sparse = mmread(str(matrix_path)).tocsr() + estimates = X_sparse.dot(weights) + 
true_values = targets_df["value"].to_numpy(dtype=np.float64) + rel_errors = np.where( + np.abs(true_values) > 0, + (estimates - true_values) / np.abs(true_values), + 0.0, + ) + abs_rel = np.abs(rel_errors) + achievable = np.asarray(X_sparse.sum(axis=1)).ravel() > 0 + + active_weights = weights[weights > 0] + weight_sum = float(weights.sum()) + ess = ( + float(weight_sum**2 / np.square(weights).sum()) + if np.square(weights).sum() > 0 + else 0.0 + ) + + metrics = { + "n_targets": int(len(true_values)), + "n_units": int(len(weights)), + "n_achievable_targets": int(achievable.sum()), + "mean_abs_rel_error": float(abs_rel.mean()), + "median_abs_rel_error": float(np.median(abs_rel)), + "p95_abs_rel_error": float(np.quantile(abs_rel, 0.95)), + "max_abs_rel_error": float(abs_rel.max()), + "ess": ess, + "active_record_count": int((weights > 0).sum()), + "negative_weight_share": float((weights < 0).mean()), + "weight_min": float(weights.min()), + "weight_max": float(weights.max()), + "weight_mean": float(weights.mean()), + "weight_median": float(np.median(weights)), + "nonzero_weight_min": float(active_weights.min()) + if len(active_weights) + else 0.0, + "nonzero_weight_max": float(active_weights.max()) + if len(active_weights) + else 0.0, + } + + if "geo_level" in targets_df.columns: + by_geo = {} + for geo_level, group in targets_df.assign(abs_rel_error=abs_rel).groupby( + "geo_level" + ): + vals = group["abs_rel_error"].to_numpy(dtype=np.float64) + by_geo[str(geo_level)] = { + "n_targets": int(len(vals)), + "mean_abs_rel_error": float(vals.mean()), + "median_abs_rel_error": float(np.median(vals)), + "p95_abs_rel_error": float(np.quantile(vals, 0.95)), + "max_abs_rel_error": float(vals.max()), + } + metrics["by_geo_level"] = by_geo + + if "variable" in targets_df.columns: + by_variable = {} + enriched = targets_df.assign(abs_rel_error=abs_rel) + for variable, group in enriched.groupby("variable"): + vals = group["abs_rel_error"].to_numpy(dtype=np.float64) + 
by_variable[str(variable)] = { + "n_targets": int(len(vals)), + "mean_abs_rel_error": float(vals.mean()), + "median_abs_rel_error": float(np.median(vals)), + } + metrics["by_variable"] = by_variable + + return metrics + + +def write_method_summary(summary: Dict, path: str | Path) -> None: + with open(path, "w") as f: + json.dump(summary, f, indent=2, sort_keys=True) diff --git a/paper-l0/benchmarking/install_r_packages.R b/paper-l0/benchmarking/install_r_packages.R new file mode 100644 index 00000000..6ba0e608 --- /dev/null +++ b/paper-l0/benchmarking/install_r_packages.R @@ -0,0 +1,23 @@ +packages <- c( + "Matrix", + "survey", + "surveysd" +) + +repos <- "https://cloud.r-project.org" + +missing <- packages[!vapply(packages, requireNamespace, logical(1), quietly = TRUE)] +if (!length(missing)) { + cat("All benchmarking R packages are already installed.\n") + quit(save = "no", status = 0) +} + +cat("Installing missing R packages:", paste(missing, collapse = ", "), "\n") +install.packages(missing, repos = repos) + +failed <- missing[!vapply(missing, requireNamespace, logical(1), quietly = TRUE)] +if (length(failed)) { + stop(sprintf("Failed to install: %s", paste(failed, collapse = ", "))) +} + +cat("Benchmarking R packages installed successfully.\n") diff --git a/paper-l0/benchmarking/ipf_conversion.py b/paper-l0/benchmarking/ipf_conversion.py new file mode 100644 index 00000000..8f765b1c --- /dev/null +++ b/paper-l0/benchmarking/ipf_conversion.py @@ -0,0 +1,295 @@ +from __future__ import annotations + +import sqlite3 +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +import numpy as np +import pandas as pd + +from benchmark_manifest import BenchmarkManifest +from policyengine_us_data.calibration.calibration_utils import apply_op + + +_GEO_VARS = {"state_fips", "congressional_district_geoid"} +_SUPPORTED_TARGET_VARIABLES = {"person_count", "household_count"} + + +def _detect_time_period(sim) -> int: + raw_keys = 
sim.dataset.load_dataset()["household_id"] + try: + return int(next(iter(raw_keys))) + except Exception: + return 2024 + + +def _load_stratum_constraints( + db_path: str | Path, + stratum_ids: Iterable[int], +) -> Dict[int, List[dict]]: + ids = sorted({int(sid) for sid in stratum_ids}) + if not ids: + return {} + placeholders = ",".join("?" for _ in ids) + query = f""" + SELECT + stratum_id, + constraint_variable AS variable, + operation, + value + FROM stratum_constraints + WHERE stratum_id IN ({placeholders}) + ORDER BY stratum_id + """ + with sqlite3.connect(str(db_path)) as conn: + rows = pd.read_sql_query(query, conn, params=ids) + grouped: Dict[int, List[dict]] = {} + for stratum_id, group in rows.groupby("stratum_id", sort=False): + grouped[int(stratum_id)] = group[["variable", "operation", "value"]].to_dict( + "records" + ) + return grouped + + +def _ensure_supported_targets(targets_df: pd.DataFrame) -> None: + unsupported = sorted( + set(targets_df["variable"].astype(str)) - _SUPPORTED_TARGET_VARIABLES + ) + if unsupported: + raise ValueError( + "Automatic IPF conversion currently supports only " + f"{sorted(_SUPPORTED_TARGET_VARIABLES)} targets. 
" + f"Unsupported target variables in manifest selection: {unsupported}" + ) + + +def _required_constraint_variables( + stratum_constraints: Dict[int, List[dict]], +) -> List[str]: + variables = set() + for constraints in stratum_constraints.values(): + for constraint in constraints: + variable = str(constraint["variable"]) + if variable not in _GEO_VARS: + variables.add(variable) + return sorted(variables) + + +def _evaluate_constraints( + constraints: List[dict], + columns: Dict[str, np.ndarray], +) -> np.ndarray: + n_rows = len(next(iter(columns.values()))) + mask = np.ones(n_rows, dtype=bool) + for constraint in constraints: + variable = str(constraint["variable"]) + if variable not in columns: + raise KeyError(f"Missing column for constraint variable: {variable}") + mask &= apply_op( + np.asarray(columns[variable]), + str(constraint["operation"]), + str(constraint["value"]), + ) + return mask + + +def _build_household_clone_arrays( + package: Dict, + sim, +) -> Tuple[pd.DataFrame, np.ndarray]: + household_ids = sim.calculate("household_id", map_to="household").values + n_households = len(household_ids) + n_clones = int(package["metadata"]["n_clones"]) + expected_units = n_households * n_clones + initial_weights = np.asarray(package["initial_weights"], dtype=np.float64) + if len(initial_weights) != expected_units: + raise ValueError( + "Initial weight length does not match dataset households x n_clones: " + f"{len(initial_weights)} != {n_households} * {n_clones}" + ) + + if package.get("cd_geoid") is None or package.get("block_geoid") is None: + raise ValueError( + "Automatic IPF conversion requires cd_geoid and block_geoid in the package" + ) + + unit_index = np.arange(expected_units, dtype=np.int64) + block_geoid = np.asarray(package["block_geoid"]).astype(str) + cd_geoid = np.asarray(package["cd_geoid"]).astype(str) + if len(block_geoid) != expected_units or len(cd_geoid) != expected_units: + raise ValueError("Geography arrays do not match expected cloned 
unit count") + + household_df = pd.DataFrame( + { + "unit_index": unit_index, + "household_id": unit_index, + "base_weight": initial_weights, + "benchmark_all": "all", + "state_fips": block_geoid, + "congressional_district_geoid": cd_geoid, + } + ) + household_df["state_fips"] = household_df["state_fips"].str.slice(0, 2).astype(int) + household_df["congressional_district_geoid"] = household_df[ + "congressional_district_geoid" + ].astype(int) + return household_df, household_ids + + +def _load_sim_columns( + sim, + variables: List[str], + level: str, +) -> Dict[str, np.ndarray]: + columns: Dict[str, np.ndarray] = {} + for variable in variables: + try: + values = sim.calculate(variable, map_to=level).values + except Exception as exc: + raise RuntimeError( + f"Failed to calculate benchmark variable '{variable}' at level '{level}'" + ) from exc + values = np.asarray(values) + if hasattr(values, "decode_to_str"): + values = values.decode_to_str() + if values.dtype.kind == "S": + values = values.astype(str) + columns[variable] = values + return columns + + +def _build_person_level_unit_data( + package: Dict, + household_df: pd.DataFrame, + sim, + needed_variables: List[str], +) -> pd.DataFrame: + household_ids = sim.calculate("household_id", map_to="household").values + person_hh_ids = sim.calculate("household_id", map_to="person").values + hh_index = {int(hid): idx for idx, hid in enumerate(household_ids)} + person_hh_index = np.array( + [hh_index[int(hid)] for hid in person_hh_ids], dtype=np.int64 + ) + + n_households = len(household_ids) + n_clones = int(package["metadata"]["n_clones"]) + + person_columns = _load_sim_columns(sim, needed_variables, level="person") + person_frames = [] + for clone_idx in range(n_clones): + unit_index = person_hh_index + clone_idx * n_households + frame = pd.DataFrame( + { + "unit_index": unit_index, + "household_id": unit_index, + "base_weight": household_df["base_weight"].to_numpy()[unit_index], + "benchmark_all": 
household_df["benchmark_all"].to_numpy()[unit_index], + "state_fips": household_df["state_fips"].to_numpy()[unit_index], + "congressional_district_geoid": household_df[ + "congressional_district_geoid" + ].to_numpy()[unit_index], + } + ) + for variable, values in person_columns.items(): + frame[variable] = values + person_frames.append(frame) + return pd.concat(person_frames, ignore_index=True) + + +def _build_household_level_unit_data( + household_df: pd.DataFrame, + sim, + needed_variables: List[str], +) -> pd.DataFrame: + frame = household_df.copy() + household_columns = _load_sim_columns(sim, needed_variables, level="household") + repeated_columns = { + name: np.tile(values, len(frame) // len(values)) + for name, values in household_columns.items() + } + for name, values in repeated_columns.items(): + frame[name] = values + return frame + + +def _target_scope(target_variable: str) -> str: + if target_variable == "person_count": + return "person" + if target_variable == "household_count": + return "household" + raise ValueError(f"Unsupported IPF target variable: {target_variable}") + + +def build_ipf_inputs( + package: Dict, + manifest: BenchmarkManifest, + filtered_targets: pd.DataFrame, +) -> Tuple[pd.DataFrame, pd.DataFrame]: + _ensure_supported_targets(filtered_targets) + + metadata = package.get("metadata", {}) + dataset_path = metadata.get("dataset_path") + db_path = metadata.get("db_path") + if not dataset_path or not Path(dataset_path).exists(): + raise FileNotFoundError( + "Automatic IPF conversion requires metadata.dataset_path to exist locally" + ) + if not db_path or not Path(db_path).exists(): + raise FileNotFoundError( + "Automatic IPF conversion requires metadata.db_path to exist locally" + ) + + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=str(dataset_path)) + _ = _detect_time_period(sim) + + stratum_constraints = _load_stratum_constraints( + db_path=db_path, + 
stratum_ids=filtered_targets["stratum_id"].astype(int).tolist(), + ) + needed_variables = _required_constraint_variables(stratum_constraints) + has_person_targets = ( + filtered_targets["variable"].astype(str).eq("person_count").any() + ) + + household_df, _ = _build_household_clone_arrays(package, sim) + if has_person_targets: + unit_data = _build_person_level_unit_data( + package=package, + household_df=household_df, + sim=sim, + needed_variables=needed_variables, + ) + else: + unit_data = _build_household_level_unit_data( + household_df=household_df, + sim=sim, + needed_variables=needed_variables, + ) + + eval_columns = { + column: unit_data[column].to_numpy() for column in unit_data.columns + } + ipf_target_rows = [] + for row_idx, row in filtered_targets.reset_index(drop=True).iterrows(): + constraints = stratum_constraints.get(int(row["stratum_id"]), []) + indicator_column = f"ipf_indicator_{row_idx:05d}" + mask = _evaluate_constraints(constraints, eval_columns) + unit_data[indicator_column] = mask.astype(np.int8) + eval_columns[indicator_column] = unit_data[indicator_column].to_numpy() + ipf_target_rows.append( + { + "scope": _target_scope(str(row["variable"])), + "target_type": "numeric_total", + "value_column": indicator_column, + "variables": "benchmark_all", + "cell": "benchmark_all=all", + "target_value": float(row["value"]), + "target_name": row.get("target_name", f"target_{row_idx}"), + "source_variable": str(row["variable"]), + "stratum_id": int(row["stratum_id"]), + } + ) + + return unit_data, pd.DataFrame(ipf_target_rows) diff --git a/paper-l0/benchmarking/manifests/greg_demo_small.example.json b/paper-l0/benchmarking/manifests/greg_demo_small.example.json new file mode 100644 index 00000000..01f02696 --- /dev/null +++ b/paper-l0/benchmarking/manifests/greg_demo_small.example.json @@ -0,0 +1,54 @@ +{ + "name": "greg_demo_small", + "tier": "tier_a", + "description": "Example GREG benchmark manifest for a reduced package and a coherent geography 
subset.", + "package_path": "policyengine_us_data/storage/calibration/calibration_package.pkl", + "methods": [ + "l0", + "greg" + ], + "target_filters": { + "include_geo_levels": [ + "national", + "state", + "district" + ], + "include_national": true, + "state_ids": [ + "06", + "12", + "36", + "48", + "53" + ], + "district_ids": [ + "0601", + "0602", + "1201", + "1202", + "3601", + "3602", + "4801", + "4802", + "5301", + "5302" + ], + "count_like_only": false, + "max_targets": 1000 + }, + "external_inputs": {}, + "method_options": { + "l0": { + "lambda_l0": 1e-08, + "epochs": 1000, + "device": "cpu", + "beta": 0.65, + "lambda_l2": 1e-12, + "learning_rate": 0.15 + }, + "greg": { + "maxit": 200, + "epsilon": 1e-07 + } + } +} diff --git a/paper-l0/benchmarking/manifests/ipf_demo_small.example.json b/paper-l0/benchmarking/manifests/ipf_demo_small.example.json new file mode 100644 index 00000000..f979e7d9 --- /dev/null +++ b/paper-l0/benchmarking/manifests/ipf_demo_small.example.json @@ -0,0 +1,48 @@ +{ + "name": "ipf_demo_small", + "tier": "tier_b", + "description": "Example IPF benchmark manifest using automatic conversion from the saved calibration package and its source dataset.", + "package_path": "policyengine_us_data/storage/calibration/calibration_package.pkl", + "methods": [ + "l0", + "ipf" + ], + "target_filters": { + "include_geo_levels": [ + "national", + "state", + "district" + ], + "include_national": true, + "variables": [ + "person_count", + "household_count" + ], + "domain_variables": [ + "age", + "snap", + "medicaid_enrolled" + ], + "count_like_only": true, + "max_targets": 250 + }, + "external_inputs": {}, + "method_options": { + "l0": { + "lambda_l0": 1e-08, + "epochs": 1000, + "device": "cpu", + "beta": 0.65, + "lambda_l2": 1e-12, + "learning_rate": 0.15 + }, + "ipf": { + "max_iter": 200, + "bound": 4.0, + "epsP": 1e-06, + "epsH": 0.01, + "household_id_col": "household_id", + "weight_col": "base_weight" + } + } +} diff --git 
a/paper-l0/benchmarking/requirements-python.txt b/paper-l0/benchmarking/requirements-python.txt new file mode 100644 index 00000000..51517931 --- /dev/null +++ b/paper-l0/benchmarking/requirements-python.txt @@ -0,0 +1,2 @@ +-e ".[dev,l0]" +PyYAML>=6.0 diff --git a/paper-l0/benchmarking/runners/greg_runner.R b/paper-l0/benchmarking/runners/greg_runner.R new file mode 100644 index 00000000..c8234a73 --- /dev/null +++ b/paper-l0/benchmarking/runners/greg_runner.R @@ -0,0 +1,53 @@ +script_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE) +if (length(script_arg) != 1L) { + stop("Could not determine greg_runner.R path") +} +script_dir <- dirname(normalizePath(sub("^--file=", "", script_arg))) +source(file.path(script_dir, "read_npy.R")) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 6L) { + stop("Usage: greg_runner.R X.mtx target_metadata.csv initial_weights.npy output_weights.csv maxit epsilon") +} + +matrix_path <- args[[1]] +target_csv <- args[[2]] +weights_npy <- args[[3]] +output_csv <- args[[4]] +maxit <- as.integer(args[[5]]) +epsilon <- as.numeric(args[[6]]) + +library(Matrix) +library(survey) + +X_targets_by_units <- readMM(matrix_path) +target_meta <- read.csv(target_csv, stringsAsFactors = FALSE) +base_weights <- read_npy_vector(weights_npy) +population <- as.numeric(target_meta$value) + +mm <- Matrix::t(X_targets_by_units) +if (nrow(mm) != length(base_weights)) { + stop("Unit count mismatch between matrix and initial weights") +} +if (ncol(mm) != length(population)) { + stop("Target count mismatch between matrix and target metadata") +} + +cal_linear <- get("cal.linear", envir = asNamespace("survey")) +g <- survey:::grake( + mm = mm, + ww = base_weights, + calfun = cal_linear, + bounds = list(lower = -Inf, upper = Inf), + population = population, + epsilon = epsilon, + verbose = FALSE, + maxit = maxit +) + +fitted_weights <- as.numeric(base_weights * g) +write.csv( + data.frame(unit_index = seq_along(fitted_weights) - 
1L, fitted_weight = fitted_weights), + output_csv, + row.names = FALSE +) diff --git a/paper-l0/benchmarking/runners/ipf_runner.R b/paper-l0/benchmarking/runners/ipf_runner.R new file mode 100644 index 00000000..3348c271 --- /dev/null +++ b/paper-l0/benchmarking/runners/ipf_runner.R @@ -0,0 +1,148 @@ +script_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE) +if (length(script_arg) != 1L) { + stop("Could not determine ipf_runner.R path") +} +script_dir <- dirname(normalizePath(sub("^--file=", "", script_arg))) +source(file.path(script_dir, "read_npy.R")) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 10L) { + stop("Usage: ipf_runner.R unit_metadata.csv ipf_target_metadata.csv initial_weights.npy output_weights.csv max_iter bound epsP epsH household_id_col weight_col") +} + +unit_csv <- args[[1]] +target_csv <- args[[2]] +weights_npy <- args[[3]] +output_csv <- args[[4]] +max_iter <- as.integer(args[[5]]) +bound <- as.numeric(args[[6]]) +epsP <- as.numeric(args[[7]]) +epsH <- as.numeric(args[[8]]) +household_id_col <- args[[9]] +weight_col <- args[[10]] + +if (!requireNamespace("surveysd", quietly = TRUE)) { + stop("The surveysd package is required for IPF benchmarks") +} +if (!requireNamespace("data.table", quietly = TRUE)) { + stop("The data.table package is required for IPF benchmarks") +} + +build_margin_array <- function(df) { + variables <- strsplit(df$variables[[1]], "\\|")[[1]] + rows <- lapply(seq_len(nrow(df)), function(i) { + cell <- df$cell[[i]] + parts <- strsplit(cell, "\\|")[[1]] + entries <- strsplit(parts, "=") + row <- as.list(setNames(vapply(entries, `[`, "", 2L), vapply(entries, `[`, "", 1L))) + row$Freq <- as.numeric(df$target_value[[i]]) + as.data.frame(row, stringsAsFactors = FALSE) + }) + frame <- do.call(rbind, rows) + stats::xtabs( + stats::as.formula(paste("Freq ~", paste(variables, collapse = " + "))), + data = frame + ) +} + +build_single_cell_array <- function(variables_str, cell_str, target_value) 
{ + variables <- strsplit(variables_str, "\\|")[[1]] + parts <- strsplit(cell_str, "\\|")[[1]] + entries <- strsplit(parts, "=") + row <- as.list(setNames(vapply(entries, `[`, "", 2L), vapply(entries, `[`, "", 1L))) + row$Freq <- as.numeric(target_value) + frame <- as.data.frame(row, stringsAsFactors = FALSE) + stats::xtabs( + stats::as.formula(paste("Freq ~", paste(variables, collapse = " + "))), + data = frame + ) +} + +unit_data <- read.csv(unit_csv, stringsAsFactors = FALSE) +target_meta <- read.csv(target_csv, stringsAsFactors = FALSE) +base_weights <- read_npy_vector(weights_npy) +unit_data <- data.table::as.data.table(unit_data) + +if (!(weight_col %in% names(unit_data))) { + if (!("unit_index" %in% names(unit_data))) { + stop("Unit metadata must include either base weights or a unit_index column") + } + if (max(unit_data$unit_index) >= length(base_weights)) { + stop("unit_index contains values outside the initial weight vector") + } + unit_data[[weight_col]] <- base_weights[unit_data$unit_index + 1L] +} + +conP <- list() +conH <- list() +if (!("target_type" %in% names(target_meta))) { + target_meta$target_type <- "categorical_margin" +} + +numeric_rows <- target_meta[target_meta$target_type == "numeric_total", , drop = FALSE] +for (i in seq_len(nrow(numeric_rows))) { + row <- numeric_rows[i, , drop = FALSE] + value_column <- row$value_column[[1]] + if (!(value_column %in% names(unit_data))) { + stop(sprintf("Unit metadata is missing numeric target column %s", value_column)) + } + target_array <- build_single_cell_array( + if ("variables" %in% names(row)) row$variables[[1]] else "benchmark_all", + if ("cell" %in% names(row)) row$cell[[1]] else "benchmark_all=all", + row$target_value[[1]] + ) + if (row$scope[[1]] == "person") { + conP[[value_column]] <- target_array + } else if (row$scope[[1]] == "household") { + conH[[value_column]] <- target_array + } else { + stop(sprintf("Unsupported numeric target scope: %s", row$scope[[1]])) + } +} + +margin_rows_all <- 
target_meta[target_meta$target_type == "categorical_margin", , drop = FALSE] +if (nrow(margin_rows_all) > 0) { + for (margin_id in unique(margin_rows_all$margin_id)) { + margin_rows <- margin_rows_all[margin_rows_all$margin_id == margin_id, , drop = FALSE] + margin_array <- build_margin_array(margin_rows) + scope <- unique(margin_rows$scope) + if (length(scope) != 1L) { + stop(sprintf("Margin %s has inconsistent scope values", margin_id)) + } + if (scope[[1]] == "person") { + conP[[length(conP) + 1L]] <- margin_array + } else if (scope[[1]] == "household") { + conH[[length(conH) + 1L]] <- margin_array + } else { + stop(sprintf("Unsupported margin scope: %s", scope[[1]])) + } + } +} + +ipf_result <- surveysd::ipf( + dat = unit_data, + hid = if (household_id_col %in% names(unit_data)) household_id_col else NULL, + conP = if (length(conP)) conP else NULL, + conH = if (length(conH)) conH else NULL, + epsP = epsP, + epsH = epsH, + verbose = FALSE, + w = weight_col, + bound = bound, + maxIter = max_iter, + returnNA = TRUE, + nameCalibWeight = "calibWeight" +) + +if (!("calibWeight" %in% names(ipf_result))) { + stop("surveysd::ipf did not return a calibWeight column") +} + +write.csv( + data.frame( + unit_index = if ("unit_index" %in% names(ipf_result)) ipf_result$unit_index else seq_len(nrow(ipf_result)) - 1L, + fitted_weight = as.numeric(ipf_result$calibWeight) + ), + output_csv, + row.names = FALSE +) diff --git a/paper-l0/benchmarking/runners/read_npy.R b/paper-l0/benchmarking/runners/read_npy.R new file mode 100644 index 00000000..04a4ef79 --- /dev/null +++ b/paper-l0/benchmarking/runners/read_npy.R @@ -0,0 +1,66 @@ +read_npy_vector <- function(path) { + con <- file(path, "rb") + on.exit(close(con), add = TRUE) + + magic <- readBin(con, what = "raw", n = 6, endian = "little") + if (!identical(as.integer(magic), as.integer(charToRaw("\x93NUMPY")))) { + stop("Unsupported .npy file: bad magic header") + } + + major <- readBin(con, what = "integer", n = 1, size = 1, 
signed = FALSE) + minor <- readBin(con, what = "integer", n = 1, size = 1, signed = FALSE) + + header_len <- if (major == 1L) { + readBin(con, what = "integer", n = 1, size = 2, signed = FALSE, endian = "little") + } else if (major %in% c(2L, 3L)) { + readBin(con, what = "integer", n = 1, size = 4, signed = FALSE, endian = "little") + } else { + stop("Unsupported .npy version") + } + + header_raw <- readBin(con, what = "raw", n = header_len, endian = "little") + header <- rawToChar(header_raw) + + descr_match <- regmatches(header, regexpr("'descr': *'[^']+'", header)) + shape_match <- regmatches(header, regexpr("'shape': *\\([^\\)]*\\)", header)) + fortran_match <- regmatches(header, regexpr("'fortran_order': *(True|False)", header)) + + if (length(descr_match) == 0 || length(shape_match) == 0 || length(fortran_match) == 0) { + stop("Could not parse .npy header") + } + + descr <- sub("^'descr': *'([^']+)'$", "\\1", descr_match) + fortran_order <- sub("^'fortran_order': *(True|False)$", "\\1", fortran_match) + if (fortran_order != "False") { + stop("Only C-order .npy arrays are supported") + } + + shape_text <- sub("^'shape': *\\(([^\\)]*)\\)$", "\\1", shape_match) + shape_parts <- trimws(unlist(strsplit(shape_text, ","))) + shape_parts <- shape_parts[nzchar(shape_parts)] + dims <- as.integer(shape_parts) + + if (length(dims) != 1L || is.na(dims[1])) { + stop("Only 1D .npy vectors are supported") + } + + n <- dims[1] + + if (descr == " 0$ ($\times$ 436 CDs) & \$ & 9{,}156 \\ +Tax unit count for each of the 21 domains ($\times$ 436 CDs) & count & 9{,}156 \\ +\midrule +\multicolumn{3}{l}{\textit{Census ACS S2201}} \\ +SNAP household count ($\times$ 436 CDs) & count & 436 \\ +\midrule +& & \textbf{33{,}572} \\ +\bottomrule +\end{tabular} +} +\caption{Congressional district calibration targets (436 CDs). Each row is replicated across all 436 districts. 
IRS SOI provides paired dollar and count targets for each income/deduction domain.} +\label{tab:cd_targets} +\end{table} + +\subsection{State targets (4,080)} + +\begin{table}[H] +\centering +{\tablefont +\begin{tabular}{p{0.55\textwidth}lr} +\toprule +Target domain & Type & Count \\ +\midrule +\multicolumn{3}{l}{\textit{Census ACS S0101}} \\ +Person count by age band (18 bands $\times$ 51 states) & count & 918 \\ +\midrule +\multicolumn{3}{l}{\textit{IRS SOI}} \\ +Person count by AGI bracket (9 bins $\times$ 51 states) & count & 459 \\ +EITC dollars by qualifying children (4 bins $\times$ 51) & \$ & 204 \\ +Tax unit count by qualifying children (4 bins $\times$ 51) & count & 204 \\ +Aggregate AGI (unconditional, $\times$ 51 states) & \$ & 51 \\ +20 income/deduction dollar totals (domain $> 0$, $\times$ 51) & \$ & 1{,}020 \\ +Tax unit count for each of the 21 domains ($\times$ 51) & count & 1{,}071 \\ +\midrule +\multicolumn{3}{l}{\textit{USDA FNS SNAP}} \\ +SNAP spending ($\times$ 51 states) & \$ & 51 \\ +SNAP household count ($\times$ 51 states) & count & 51 \\ +\midrule +\multicolumn{3}{l}{\textit{CMS Medicaid}} \\ +Medicaid enrollment ($\times$ 51 states) & count & 51 \\ +\midrule +\multicolumn{3}{l}{\textit{Census STC}} \\ +State income tax collections ($\times$ 51 states) & \$ & 51 \\ +\midrule +& & \textbf{4{,}080} \\ +\bottomrule +\end{tabular} +} +\caption{State-level calibration targets (50 states + DC). IRS SOI variables mirror the district structure. 
USDA provides both SNAP spending and household counts; CMS provides Medicaid enrollment.}
+\label{tab:state_targets}
+\end{table}
+
+\subsection{National targets (106)}
+
+\begin{table}[H]
+\centering
+{\tablefont
+\begin{tabular}{p{0.55\textwidth}lr}
+\toprule
+Target domain & Type & Count \\
+\midrule
+\multicolumn{3}{l}{\textit{Demographics (Census ACS + curated)}} \\
+Person count by age band (18 bands) & count & 18 \\
+Person count by SSN card type (4 categories) & count & 4 \\
+Person count: Medicaid enrolled & count & 1 \\
+Person count: ACA PTC recipients & count & 1 \\
+\midrule
+\multicolumn{3}{l}{\textit{IRS SOI --- domain-constrained aggregates}} \\
+AGI (unconditional) & \$ & 1 \\
+EITC by qualifying children (0--3+) & \$ & 4 \\
+Tax unit count by qualifying children (0--3+) & count & 4 \\
+% NOTE(review): changed 21 -> 20 here so the rows sum to the stated 106 total;
+% this also mirrors the state table, where unconditional AGI is a separate row
+% and the "domain > 0" dollar totals cover the remaining 20 domains. Verify
+% against the ETL target counts.
+20 income/deduction dollar totals (domain $> 0$) & \$ & 20 \\
+Tax unit count for each of the 21 domains & count & 21 \\
+\midrule
+\multicolumn{3}{l}{\textit{CBO budget projections}} \\
+SNAP, Social Security, SSI, unemployment comp., income tax & \$ & 5 \\
+\midrule
+\multicolumn{3}{l}{\textit{SSA benefit decomposition}} \\
+Retirement, disability, survivors, dependents & \$ & 4 \\
+\midrule
+\multicolumn{3}{l}{\textit{JCT tax expenditure estimates}} \\
+SALT ded., charitable ded., mortgage interest ded., medical expense ded., QBI ded., EITC & \$ & 6 \\
+\midrule
+\multicolumn{3}{l}{\textit{Healthcare spending (MEPS/NHEA/CMS)}} \\
+Medicaid, health insurance premiums, Medicare Part B, other medical expenses, OTC health & \$ & 5 \\
+\midrule
+\multicolumn{3}{l}{\textit{Housing, transfers, and other}} \\
+Rent, real estate taxes, housing subsidy, work/childcare expenses, TANF, alimony (paid + received), child support (paid + received), tip income, net worth & \$ & 10 \\
+\midrule
+\multicolumn{3}{l}{\textit{Retirement contributions (IRS SOI)}} \\
+Traditional IRA, Roth IRA contributions & \$ & 2 \\
+\midrule
+& & \textbf{106} \\
+\bottomrule
+\end{tabular}
+}
+\caption{National-level calibration targets. CBO, JCT, SSA, CMS, and Census values are curated from the cited administrative sources and stored in the ETL pipeline. Dollar values are inflation-adjusted to the calibration year.} +\label{tab:national_targets} +\end{table} + +\section{Algorithm pseudocode} +\label{app:algorithm} + +Algorithm~\ref{alg:l0} presents the complete $L_0$-regularized calibration procedure. + +\begin{algorithm}[ht] +\caption{$L_0$-regularized calibration with Hard Concrete gates} +\label{alg:l0} +\begin{algorithmic}[1] +\Require Calibration matrix $\mathbf{M} \in \R^{m \times n}$, targets $\mathbf{t} \in \R^m$, initial weights $\mathbf{w}_0 \in \R^n$ +\Require Hyperparameters: $\lambda_{L_0}$, $\lambda_{L_2}$, $\beta$, $\gamma$, $\zeta$, learning rate $\eta$, epochs $E$ +\Ensure Calibrated sparse weight vector $\hat{\mathbf{w}} \in \R^n$ + +\State Initialize $\log w_i \gets \log w_{0,i} + \mathcal{N}(0, 0.05^2)$ for all $i$ +\State Initialize $\log \alpha_i \gets \text{logit}(0.999) + \mathcal{N}(0, 0.01^2)$ for all $i$ +\State Initialize Adam optimizer with parameters $\{\log w_i, \log \alpha_i\}$ and learning rate $\eta$ + +\For{epoch $= 1$ to $E$} + \State \textbf{Sample Hard Concrete gates (training):} + \For{$i = 1$ to $n$} + \State $u_i \sim \text{Uniform}(\epsilon, 1-\epsilon)$ + \State $\bar{s}_i \gets \sigma\!\left(\frac{\log u_i - \log(1-u_i) + \log \alpha_i}{\beta}\right)$ + \State $s_i \gets \bar{s}_i(\zeta - \gamma) + \gamma$ + \State $z_i \gets \min(1, \max(0, s_i))$ + \EndFor + \State $w_i^{\text{eff}} \gets \exp(\log w_i) \cdot z_i$ for all $i$ + \State $\hat{t}_j \gets \sum_i M_{ji} \cdot w_i^{\text{eff}}$ for all $j$ + \State $\mathcal{L}_{\text{cal}} \gets \frac{1}{m}\sum_{j=1}^{m}\left(\frac{\hat{t}_j - t_j}{|t_j|}\right)^2$ + \State $\mathcal{L}_{L_0} \gets \sum_{i=1}^{n} \sigma\!\left(\log \alpha_i - \beta \log \frac{-\gamma}{\zeta}\right)$ + \State $\mathcal{L}_{L_2} \gets \sum_{i=1}^{n} w_i^2$ + \State $\mathcal{L} 
\gets \mathcal{L}_{\text{cal}} + \lambda_{L_0} \mathcal{L}_{L_0} + \lambda_{L_2} \mathcal{L}_{L_2}$ + \State Backpropagate $\nabla_{\log w, \log \alpha} \mathcal{L}$ + \State Adam step +\EndFor + +\State \textbf{Deterministic inference:} +\For{$i = 1$ to $n$} + \State $z_i^{\text{det}} \gets \min\!\left(1, \max\!\left(0,\; \sigma(\log \alpha_i)(\zeta - \gamma) + \gamma\right)\right)$ + \State $\hat{w}_i \gets \exp(\log w_i) \cdot z_i^{\text{det}}$ +\EndFor +\State \Return $\hat{\mathbf{w}}$ +\end{algorithmic} +\end{algorithm} diff --git a/paper-l0/sections/background.tex b/paper-l0/sections/background.tex new file mode 100644 index 00000000..ab8c83e6 --- /dev/null +++ b/paper-l0/sections/background.tex @@ -0,0 +1,76 @@ +\section{Background and related work} +\label{sec:background} + +\subsection{The survey calibration problem} + +Given a probability sample of $n$ units with initial design weights $d_i$ and a vector of auxiliary variables $\mathbf{x}_i \in \R^p$ for each unit $i$, survey calibration seeks adjusted weights $w_i$ such that the weighted sample totals reproduce known population totals: +\begin{equation} + \sum_{i=1}^{n} w_i \mathbf{x}_i = \mathbf{T} + \label{eq:calibration_constraint} +\end{equation} +where $\mathbf{T} \in \R^m$ is the vector of $m$ known administrative or census totals. When $m < n$, the system is underdetermined and a distance criterion selects among feasible solutions. \citet{deville1992} formalized this as a constrained optimization problem: minimize a distance function $\sum_i G(w_i, d_i)$ subject to Equation~\ref{eq:calibration_constraint}, where $G$ is convex and satisfies $G(d_i, d_i) = 0$. The choice of $G$ determines the calibration method: a chi-squared distance yields the generalized regression (GREG) estimator, an exponential distance yields raking (multiplicative calibration), and a truncated distance yields bounded weight adjustments. 
\citet{sarndal2007} provides a comprehensive review of these methods and their statistical properties. + +In the subnational setting addressed here, the constraint vector $\mathbf{T}$ contains targets at multiple geographic levels---congressional districts, states, and the nation---requiring that district totals sum to state totals, which sum to national totals. This hierarchical structure produces $m \approx 37{,}800$ simultaneous constraints with near-collinearity across levels, placing the problem beyond the regime where classical closed-form calibration methods operate reliably. + +Subnational microsimulation also sits within the broader spatial microsimulation literature. Reviews of static spatial microsimulation often distinguish between \emph{reweighting} methods, which begin from survey microdata and adjust weights to match small-area constraints, and \emph{synthetic reconstruction} methods, which construct new small-area populations from aggregate tables \citep{tanton2014review, odonoghue2014review}. This distinction matters for the present paper. Our method remains, at core, a calibration-weighting approach over observed CPS households, so GREG and IPF are the closest classical benchmarks. At the same time, by cloning households, assigning them to new geographies, and assembling area-specific output files, the pipeline also produces derived spatial microdata for small-area policy analysis, making it relevant to some synthetic-population use cases even though the empirical comparison in this paper focuses on calibration methods. 
+
+\subsection{Generalized regression (GREG) estimation}
+
+The GREG estimator minimizes the chi-squared distance $\sum_i (w_i - d_i)^2 / d_i$ subject to Equation~\ref{eq:calibration_constraint}, yielding the closed-form solution:
+\begin{equation}
+    w_i = d_i + d_i \, \mathbf{x}_i^\top \left(\sum_{j=1}^{n} d_j \, \mathbf{x}_j \mathbf{x}_j^\top\right)^{-1} \left(\mathbf{T} - \sum_{j=1}^{n} d_j \, \mathbf{x}_j\right)
+    \label{eq:greg}
+\end{equation}
+This requires inverting a $p \times p$ matrix, where $p$ is the dimension of the auxiliary vector $\mathbf{x}_i$ --- equal to the number of calibration targets $m$ in Equation~\ref{eq:calibration_constraint}, since each auxiliary component corresponds to one benchmark total. When $p$ is moderate (tens to hundreds), GREG is computationally efficient, statistically well understood, and produces asymptotically unbiased estimators \citep{deville1992, sarndal2007}.
+
+GREG has two practical limitations in the subnational setting. First, the matrix inversion in Equation~\ref{eq:greg} becomes numerically ill-conditioned as $p$ grows into the thousands, particularly when constraints at different geographic levels are nearly collinear (e.g., the sum of district age counts within a state nearly equals the state age total). Second, GREG does not constrain weight signs: calibrated weights can become negative, which is undesirable for microsimulation since negative weights imply subtracting households from the population. Weight truncation or bounding after the fact restores non-negativity but violates the calibration constraints.
+
+The Congressional Budget Office \citep{cbo2018}, Joint Committee on Taxation \citep{jct2023}, and Tax Policy Center \citep{tpc2024} use variants of GREG calibration for their national microsimulation models, where the number of calibration constraints is typically in the hundreds---well within GREG's reliable operating range.
+
+\subsection{Iterative proportional fitting}
+
+Iterative proportional fitting \citep[IPF;][]{deming1940, ireland1968} adjusts cell counts in a contingency table to match given marginal totals.
The algorithm cycles through dimensions, scaling each dimension's cells so that their marginal matches the target, then repeating until convergence. IPF converges to the maximum entropy solution subject to the marginal constraints \citep{ireland1968}. In the spatial microsimulation literature, it appears in both synthetic reconstruction and reweighting forms; the discussion here concerns the reweighting form that starts from survey microdata and updates weights rather than building a synthetic joint distribution from scratch \citep{tanton2014review}. + +In the microsimulation context, IPF adjusts household weights to match cross-classified population counts---for example, persons by age group within each congressional district. IPF has several practical advantages: it preserves non-negativity by construction (weights are scaled multiplicatively, so positive weights remain positive), it requires no matrix inversion, and it scales well to high-dimensional contingency tables. EUROMOD, the EU-wide tax-benefit microsimulation model, uses IPF-based calibration to reweight national surveys to demographic benchmarks across member states. + +However, IPF has three limitations relevant to subnational calibration. First, IPF does not naturally enforce hierarchical consistency: district-level estimates produced by IPF do not automatically sum to the correct state totals, requiring post-hoc reconciliation that may introduce new inconsistencies. Second, IPF handles only count targets organized as contingency table margins; incorporating continuous-valued targets (e.g., aggregate income or benefit spending) requires auxiliary procedures outside the IPF framework. Third, IPF scales cell counts multiplicatively, which can produce extreme weights when initial cells are small or zero, and convergence slows or fails when marginal constraints are mutually inconsistent. 
+ +\subsection{Spatial microsimulation} + +Spatial microsimulation constructs small-area populations either by reweighting existing microdata or by synthesizing new unit-record populations from aggregate constraints \citep{tanton2014review, odonoghue2014review}. \citet{williamson1998} introduced a combinatorial optimization approach that selects a subset of survey records for each small area using simulated annealing to minimize the difference between weighted survey totals and census benchmarks. \citet{huang2001} extended this with a deterministic algorithm based on systematic record selection. \citet{tanton2011} applied generalized regression reweighting to create small-area estimates of poverty and housing stress in Australia. + +\citet{harland2012} developed methods for creating realistic synthetic populations at fine geographic scales using iterative proportional fitting combined with Monte Carlo sampling. \citet{lovelace2016} provided an accessible implementation in R with the \texttt{spatial-microsim-book} framework. + +Within this literature, combinatorial optimization and especially simulated annealing occupy an important place as methods for generating synthetic spatial microdata from observed survey records \citep{tanton2014review, odonoghue2014review}. Their main advantage is that they work directly with real microdata, can be flexible about household structure, and can accommodate settings where the unit of analysis in the constraints and the microdata do not align neatly. Their main disadvantage is computational intensity: they are usually run area by area and search a large discrete space of candidate record combinations \citep{harland2012, odonoghue2014review}. + +These methods typically operate at a single geographic level---producing estimates for each small area independently. 
Joint calibration across multiple geographic levels (district, state, national) with a single set of weights is uncommon in the spatial microsimulation literature, as it requires simultaneously satisfying tens of thousands of constraints that span different administrative geographies. Other operational models avoid the problem entirely: TAXSIM (NBER) operates at the national level without geographic calibration, while state-level models maintained by individual state revenue departments calibrate only within their own jurisdiction. For this reason, simulated annealing is an important reference point in the broader spatial microsimulation literature, but it is not the closest like-for-like empirical comparator to our setting. The benchmark design in this paper therefore focuses on GREG and IPF as the classical calibration baselines most closely aligned with a shared weighted-microdata formulation. + +\subsection{$L_0$ regularization and the Hard Concrete distribution} + +$L_0$ regularization penalizes the count of nonzero parameters in a model, encouraging sparsity. Unlike $L_1$ regularization, which penalizes the sum of absolute values and produces approximate sparsity through shrinkage, $L_0$ regularization produces exact zeros. The $L_0$ norm $\|\mathbf{w}\|_0 = \sum_i \mathbf{1}[w_i \neq 0]$ is discontinuous and combinatorial, making direct optimization intractable for large parameter spaces. + +\citet{louizos2018} proposed a differentiable relaxation of $L_0$ regularization using the Hard Concrete distribution. Each parameter is multiplied by a stochastic gate $z_i$ drawn from a stretched and rectified Bernoulli distribution. 
During training, the gate is sampled as: +\begin{align} + u &\sim \text{Uniform}(0, 1) \\ + \bar{s} &= \sigma\!\left(\frac{\log u - \log(1-u) + \log \alpha_i}{\beta}\right) \\ + s &= \bar{s}(\zeta - \gamma) + \gamma \\ + z_i &= \min(1, \max(0, s)) +\end{align} +where $\sigma$ is the sigmoid function, $\alpha_i$ is a learnable parameter controlling the gate's probability of being open, $\beta$ is a temperature parameter, and $\gamma < 0$ and $\zeta > 1$ are stretch parameters that place probability mass at exactly zero and exactly one after clipping. The expected $L_0$ penalty is: +\begin{equation} + \E[L_0] = \sum_i \sigma\!\left(\log \alpha_i - \beta \log \frac{-\gamma}{\zeta}\right) +\end{equation} +which is differentiable with respect to $\alpha_i$ and can be optimized with standard gradient methods. + +At inference time, the stochastic sample is replaced by the deterministic mean: +\begin{equation} + z_i^{\text{det}} = \min\!\left(1, \max\!\left(0,\; \sigma(\log \alpha_i)(\zeta - \gamma) + \gamma\right)\right) +\end{equation} + +The Hard Concrete distribution was originally developed for pruning neural network weights \citep{louizos2018}. In the present work, we repurpose it for survey weight selection: each household-geography combination receives a gate, and the $L_0$ penalty controls how many combinations are retained in the final dataset. + +\subsection{National-level predecessor} + +\citet{woodruff2024} developed a two-stage method for constructing enhanced national microsimulation datasets. In the first stage, quantile regression forests \citep{meinshausen2006quantile} impute 72 tax variables from the IRS Public Use File onto CPS records, preserving joint distributional characteristics through sequential conditioning. In the second stage, dropout-regularized gradient descent optimizes household weights to match approximately 7,000 national-level administrative targets from the IRS, CBO, JCT, Census Bureau, and program-specific sources. 
+ +The present paper extends this framework in four directions. First, we replace the single-geography dataset with a clone-and-assign procedure that replicates each CPS household across 436 congressional districts, 50 states (plus DC), and New York City, enabling subnational coverage. Second, we replace dropout regularization with $L_0$ Hard Concrete gates, providing a principled mechanism for exact sparsity with a configurable sparsity-accuracy trade-off. Third, we introduce hierarchical uprating to reconcile targets from different administrative sources---where, for example, district-level IRS totals may not sum exactly to state-level USDA program totals due to different data collection methodologies and reference periods. Fourth, we add take-up re-randomization so that program participation varies across geographic clones of the same household, reflecting local-area take-up rates. diff --git a/paper-l0/sections/conclusion.tex b/paper-l0/sections/conclusion.tex new file mode 100644 index 00000000..81ef1763 --- /dev/null +++ b/paper-l0/sections/conclusion.tex @@ -0,0 +1,14 @@ +\section{Conclusion} +\label{sec:conclusion} + +This paper presents an $L_0$ regularization method for calibrating microsimulation datasets to subnational administrative targets. The method jointly optimizes household weight magnitudes and sparsity using Hard Concrete gates, producing calibrated datasets that simultaneously match approximately 37,800 targets across congressional districts, states, and the nation. + +The key technical contribution is the adaptation of $L_0$ regularization---originally developed for neural network pruning---to the survey calibration setting. Each household-geography combination receives a continuous weight and a stochastic binary gate. The gate's learnable logit and the weight magnitude are jointly optimized via gradient descent, with the $L_0$ penalty controlling sparsity. 
At inference, gates collapse to deterministic zeros and ones, producing a sparse dataset in which the number of retained records is a smooth function of the penalty parameter $\lambda_{L_0}$. + +The configurable sparsity is practically useful: the same pipeline produces compact 50,000-record datasets for web-based interactive tools and detailed 3--4 million-record datasets for subnational policy analysis, without separate calibration procedures. + +The pipeline builds on the CPS-PUF imputation methodology of \citet{woodruff2024}, extending it from national to subnational coverage through clone-and-assign geography, hierarchical target reconciliation, and take-up re-randomization. The full pipeline---from raw CPS to calibrated H5 datasets---runs in approximately 8--12 hours on cloud infrastructure and is fully open source. + +The method is implemented in two open-source packages: the \texttt{l0-python} PyTorch package (\url{https://github.com/PolicyEngine/l0-python}), which provides the Hard Concrete optimizer, and the \texttt{policyengine-us-data} pipeline (\url{https://github.com/PolicyEngine/policyengine-us-data}), which implements the full four-stage calibration. Calibrated datasets are publicly available on HuggingFace (\texttt{policyengine/policyengine-us-data}). + +Future work includes extending calibration to county-level targets as additional administrative data becomes available, implementing adaptive $\lambda_{L_0}$ scheduling during training (analogous to learning rate schedules), incorporating an early stopping criterion based on held-out target validation, and applying the method to non-US microsimulation systems. 
diff --git a/paper-l0/sections/data.tex b/paper-l0/sections/data.tex new file mode 100644 index 00000000..b1b74151 --- /dev/null +++ b/paper-l0/sections/data.tex @@ -0,0 +1,89 @@ +\section{Data} +\label{sec:data} + +The calibration pipeline consumes two categories of input: base microdata from household surveys, and administrative targets from government agencies. This section describes both. + +\subsection{Base microdata} + +\subsubsection{Current Population Survey} + +The primary microdata source is the Current Population Survey Annual Social and Economic Supplement \citep[CPS ASEC;][]{census2024}, a nationally representative household survey of approximately 200,000 persons conducted jointly by the US Census Bureau and the Bureau of Labor Statistics. The CPS ASEC contains detailed demographic variables (age, sex, race, household composition, state of residence), income variables (earnings, Social Security, pension, unemployment compensation), and program participation indicators (SNAP, Medicaid, SSI, TANF). + +The CPS has well-documented limitations for tax microsimulation. Top incomes are truncated for privacy protection, causing underestimation of capital gains, partnership income, and other high-income components \citep{burkhauser2012}. Benefit receipt is underreported relative to administrative totals \citep{rothbaum2021, meyer2021}. Tax-specific variables---itemized deductions, tax credits, and filing-status-dependent calculations---are either absent or imprecisely measured. + +\subsubsection{IRS Public Use File imputation} + +To address these gaps, we impute 72 tax variables from the IRS Public Use File \citep[PUF;][]{bryant2023a} onto CPS records following the methodology of \citet{woodruff2024}. The PUF contains individual tax return data with detailed income components, deductions, and credits, but lacks household structure and demographic detail. 
+ +The imputation uses quantile regression forests \citep[QRF;][]{meinshausen2006quantile} trained on the PUF and applied to CPS records. To preserve the joint distribution across the 72 variables, QRF models are applied sequentially: each variable is imputed conditional on all previously imputed variables in the sequence. A stratified subsample of PUF training records preserves the top 0.5\% of the AGI distribution (retaining all records above the 99.5th percentile) alongside a random sample of 20,000 records from the remainder. + +The imputation overwrites a subset of 44 unreliable CPS variables with PUF predictions, while retaining original CPS values for all other variables. Each CPS record receives QRF-imputed values for the 72 tax variables, producing an enhanced dataset that combines the demographic detail of the CPS with the tax fidelity of the PUF. + +Social Security income, which appears as a single total in the PUF, is decomposed into four sub-components (retirement, disability, survivors, dependents) using a QRF model trained on CPS records where all four components are observed. An age-based heuristic serves as a fallback: records with age 62 or above receive 100\% retirement allocation; younger records receive 100\% disability allocation. + +\subsubsection{Source imputation from additional surveys} + +Nine additional variables are imputed from three external surveys to fill gaps in CPS measurement: + +\begin{itemize} + \item \textbf{American Community Survey (ACS) 2022}: rent and real estate taxes, imputed with state FIPS as a predictor. Because geography is assigned before source imputation (Section~\ref{sec:stage1}), the ACS QRF conditions on the clone's assigned state, producing state-aware housing cost predictions. + \item \textbf{Survey of Income and Program Participation (SIPP) 2023}: tip income, bank account assets, stock assets, and bond assets. SIPP public-use files lack state identifiers, so these imputations are state-blind. 
+ \item \textbf{Survey of Consumer Finances (SCF) 2022}: net worth, auto loan balance, and auto loan interest. SCF also lacks state identifiers; imputations are state-blind. +\end{itemize} + +Training sample sizes are 10,000 household heads for ACS, up to 20,000 weighted-probability records for SIPP, and 50\% of SCF records. + +\subsection{Calibration targets} + +The optimizer calibrates against approximately 37,800 targets spanning three geographic levels and multiple administrative sources. Table~\ref{tab:target_summary} summarizes the target structure. + +\input{tables/target_summary} + +\subsubsection{Congressional district targets} + +At the district level (436 congressional districts), targets come from three sources: + +\begin{itemize} + \item \textbf{IRS Statistics of Income (SOI)}: adjusted gross income, income tax, EITC by number of qualifying children, capital gains, self-employment income, pension income, and other income and deduction categories. These are available at the ZIP code level and aggregated to congressional districts. + \item \textbf{Census ACS}: age distributions in 18 bands (0--4, 5--9, \ldots, 80--84, 85+), providing demographic structure within each district. + \item \textbf{SNAP household counts}: district-level counts of SNAP-receiving households from USDA administrative data. +\end{itemize} + +\subsubsection{State targets} + +At the state level (50 states plus DC), additional targets include: + +\begin{itemize} + \item \textbf{USDA}: administrative SNAP spending by state. + \item \textbf{CMS}: Medicaid enrollment by state. + \item \textbf{Census Bureau}: state income tax collections. +\end{itemize} + +\subsubsection{National targets} + +National targets provide top-level fiscal and demographic constraints: + +\begin{itemize} + \item \textbf{CBO}: federal budget projections for program spending and revenue. + \item \textbf{JCT}: tax expenditure estimates for major deductions and credits. 
+ \item \textbf{SSA}: benefit totals by program (retirement, disability, survivors, SSI). + \item Additional administrative totals for healthcare spending and demographic benchmarks. +\end{itemize} + +\subsection{Hierarchical uprating} +\label{sec:uprating} + +Calibration targets originate from different sources with different reference periods. IRS SOI district data may reference tax year 2022, while USDA SNAP data references fiscal year 2024 and CBO projections reference the current budget year. Two adjustments reconcile these differences. + +The \textit{uprating factor} (UF) bridges the time gap between the source data period and the calibration year. For most domains, dollar-denominated targets use the Consumer Price Index and count targets use Census population growth projections. For ACA premium tax credits, state-specific uprating factors derived from CMS/KFF enrollment data capture state-level variation in marketplace enrollment growth. + +The \textit{hierarchy inconsistency factor} (HIF) corrects for district-level totals that do not sum exactly to state-level totals, which occurs when district and state data come from different administrative sources or use different collection methodologies. For each state $s$ and variable $v$, the HIF is: +\begin{equation} + \text{HIF}_{s,v} = \frac{T_{s,v}^{\text{state}}}{\sum_{d \in s} T_{d,v}^{\text{district}}} +\end{equation} +where $T_{s,v}^{\text{state}}$ is the state-level target and $T_{d,v}^{\text{district}}$ is the district-level target for district $d$ within state $s$. 
Each district target is then adjusted as $T_{d,v}^{\text{adj}} = T_{d,v}^{\text{district}} \times \text{HIF}_{s,v} \times \text{UF}_{s,v}$, ensuring that adjusted district targets sum exactly to the uprated state total: +\begin{equation} + \sum_{d \in s} T_{d,v}^{\text{adj}} = \text{UF}_{s,v} \times T_{s,v}^{\text{state}} +\end{equation} + +For IRS SOI data, where district totals are constructed from ZIP-code-level data that sums exactly to state totals, the HIF equals 1.0. For SNAP household counts, where district estimates from one source substantially undercount state administrative totals from another, HIFs range from 1.2 to 1.7 across states. diff --git a/paper-l0/sections/discussion.tex b/paper-l0/sections/discussion.tex new file mode 100644 index 00000000..9309c560 --- /dev/null +++ b/paper-l0/sections/discussion.tex @@ -0,0 +1,42 @@ +\section{Discussion} +\label{sec:discussion} + +\subsection{Configurable sparsity trade-off} + +The $\lambda_{L_0}$ parameter provides a continuous sparsity-accuracy dial. At one extreme, $\lambda_{L_0} = 10^{-4}$ produces a compact dataset of approximately 50,000 records that loads in seconds for web-based simulation tools, where users expect near-instant feedback. At the other extreme, $\lambda_{L_0} = 10^{-8}$ retains 3--4 million records with fine geographic resolution across all 436 congressional districts. Intermediate values produce datasets of intermediate size. + +This trade-off does not exist in classical calibration methods. IPF and GREG produce a single set of weights without sparsity control. To reduce dataset size, researchers must discard records post hoc or apply ad hoc thresholding---neither of which jointly optimizes accuracy and sparsity. The Hard Concrete gate provides a principled mechanism for this joint optimization, with $\lambda_{L_0}$ serving as the researcher's preference parameter over the Pareto frontier. + +This feature also broadens the method's practical role. 
The local preset yields area-specific unit-record files that resemble synthetic spatial microdata, while remaining anchored in observed survey households, model-based enhancements, and calibrated administrative totals. The paper's empirical evaluation remains focused on calibration baselines, but the resulting datasets are relevant to some of the same downstream use cases as synthetic-population workflows. + +\subsection{Computational cost} + +The pipeline runs on Modal, a cloud compute platform, using T4 GPUs for the optimization step. Stage 1 (clone creation and imputation) requires approximately 2--3 hours of CPU time. Stage 2 (matrix construction) requires approximately 2--3 hours across parallel workers, dominated by running \policyengine{} simulations for each of the 51 state-level configurations. Stage 3 (optimization) requires approximately 30--60 minutes of GPU time for the national preset (4,000 epochs) and 5--15 minutes for the local preset (1,000 epochs). Stage 4 (H5 assembly) requires approximately 4--5 hours across parallel workers for all 488 H5 builds (436 CDs, 50 states plus DC, NYC, and a national dataset). + +Total wall-clock time for a complete pipeline run is approximately 8--12 hours. This exceeds the cost of a single IPF or GREG calibration, which takes minutes. However, the pipeline amortizes the expensive matrix construction step: once built, the calibration package can be reused with different $\lambda_{L_0}$ presets or target configurations without rebuilding. + +\subsection{Limitations} + +\subsubsection{Temporal gap between PUF and CPS} + +The most recent publicly available PUF references tax year 2015 \citep{bryant2023a}. The CPS ASEC is updated annually. Imputing 2015 tax return variables onto 2024 CPS records introduces a temporal mismatch. 
The QRF imputation conditions on demographic and income variables that are available in both datasets, partially mitigating the gap, but structural changes in the tax code since 2015---such as the Tax Cuts and Jobs Act of 2017---may not be fully captured. + +\subsubsection{No early stopping} + +The current implementation runs for a fixed number of epochs without an automatic convergence criterion. Convergence is assessed post hoc from diagnostic outputs. An adaptive stopping rule based on validation loss on held-out targets (excluded from training via \texttt{target\_config.yaml}) could reduce computation time and prevent overfitting to the training target set. + +\subsubsection{Target selection sensitivity} + +The choice of which targets to include in the optimization affects calibration accuracy on excluded targets. The \texttt{target\_config.yaml} mechanism provides flexibility but also introduces a modeling decision: including too many targets may overfit the weights to specific administrative totals, while excluding targets leaves them uncalibrated. The current configuration was developed iteratively through validation against held-out targets, but a more systematic approach to target selection---analogous to feature selection in supervised learning---could improve robustness. + +\subsubsection{Take-up uncertainty} + +Program take-up draws are stochastic, introducing noise into the calibration matrix entries for take-up-dependent targets. Different random seeds produce different take-up assignments, which produce different calibrated weights. The pipeline uses deterministic seeds for reproducibility but does not currently quantify the sensitivity of calibration accuracy to the take-up random seed. + +\subsubsection{Geographic resolution} + +The current pipeline calibrates at three levels: congressional district, state, and national. 
Finer geographic levels---counties, metropolitan areas, school districts---are feasible in principle but would require additional target data and a larger calibration matrix. The census block assignment already provides the geographic foundation; the constraint is the availability of administrative targets at finer levels. + +\subsection{Generalizability} + +The method is not specific to the United States or to any particular tax-benefit system. Any setting where (a) a household survey must be calibrated to (b) administrative targets at (c) multiple geographic levels is a candidate for $L_0$-regularized calibration. The UK Family Resources Survey calibrated to HMRC tax data across local authorities, or the EU Statistics on Income and Living Conditions calibrated to Eurostat regional benchmarks, are plausible applications. The \texttt{l0-python} package implements the Hard Concrete optimizer independently of the \policyengine{} framework. diff --git a/paper-l0/sections/introduction.tex b/paper-l0/sections/introduction.tex new file mode 100644 index 00000000..ef2acca4 --- /dev/null +++ b/paper-l0/sections/introduction.tex @@ -0,0 +1,20 @@ +\section{Introduction} +\label{sec:introduction} + +Microsimulation models estimate the effects of tax and benefit policies on households by applying program rules to individual-level microdata. Most operational models---including those maintained by the Congressional Budget Office \citep{cbo2018}, the Joint Committee on Taxation \citep{jct2023}, and the Tax Policy Center \citep{tpc2024}---operate at the national level. They calibrate household survey weights to aggregate administrative totals such as total income tax revenue, program enrollment counts, and demographic benchmarks, then use the reweighted dataset to simulate policy reforms. + +Subnational policy analysis introduces a fundamentally different calibration challenge. 
Rather than matching a single set of national aggregates, the microdata must simultaneously reproduce distributional statistics at multiple geographic levels: congressional districts, states, and the nation as a whole. A dataset calibrated for the state of California must match California-specific IRS income totals, SNAP participation counts, Medicaid enrollment, and age distributions, while remaining consistent with national budget projections from the CBO and tax expenditure estimates from the JCT. Across 436 congressional districts and 50 states, this produces approximately 37,800 simultaneous calibration targets. + +Existing calibration methods scale poorly to this setting. Iterative proportional fitting \citep[IPF;][]{deming1940, ireland1968} adjusts weights along one dimension at a time, cycling through marginal constraints until convergence. IPF handles cross-classified tables but does not naturally accommodate hierarchical geographic constraints---district targets must sum to state targets, which must sum to national targets---without ad hoc post-processing. Generalized regression (GREG) estimators \citep{deville1992, sarndal2007} solve a constrained optimization problem that minimizes distance from initial weights subject to exact calibration constraints. GREG produces a closed-form solution for moderate numbers of constraints but becomes computationally intractable and numerically unstable as the constraint count approaches the tens of thousands. + +Spatial microsimulation methods take a different approach, often distinguishing between reweighting methods and synthetic reconstruction methods for constructing small-area microdata \citep{tanton2014review}. Within this broader literature, researchers have used combinatorial optimization and simulated annealing \citep{williamson1998, huang2001, harland2012} as well as deterministic reweighting \citep{tanton2011, lovelace2016}. 
These methods typically operate at a single geographic level and require separate calibration runs for each area, making joint multi-level calibration difficult. + +This paper presents a method that addresses these limitations by jointly optimizing weight magnitudes and sparsity in a single gradient-based framework. We adapt the Hard Concrete distribution \citep{louizos2018}, originally developed for neural network pruning, to the survey calibration setting. Each household-geography combination receives a continuous weight and a stochastic binary gate. The gate is parameterized by a learnable logit and trained via gradient descent to minimize a loss function that combines relative calibration error across all 37,800 targets with an $L_0$ penalty on the expected number of active records. At inference time, the stochastic gates collapse to deterministic zeros and ones, producing a sparse dataset in which most household-geography combinations are dropped while the retained records carry calibrated positive weights. + +The approach builds on \citet{woodruff2024}, who developed a two-stage methodology for constructing enhanced national microsimulation datasets from the Current Population Survey (CPS) and the IRS Public Use File (PUF). Their method uses quantile regression forests (QRF) to impute 72 tax variables from the PUF onto CPS records, then applies dropout-regularized gradient descent to reweight the combined dataset against approximately 7,000 national targets. The present paper extends this framework from a single national dataset to subnational coverage by introducing three new components: (a) a clone-and-assign procedure that replicates each CPS household across multiple geographic locations, (b) $L_0$ Hard Concrete gates that replace dropout regularization and enable exact sparsity, and (c) a hierarchical uprating scheme that reconciles targets from different administrative sources at district, state, and national levels. 
Because the method still solves a calibration-weighting problem over survey-based microdata, GREG and IPF are the closest classical empirical comparators; at the same time, the clone-and-assign pipeline produces derived spatial microdata files that are also relevant to synthetic-population use cases. + +The configurable sparsity penalty produces datasets of different sizes for different use cases. A high penalty ($\lambda_{L_0} = 10^{-4}$) retains approximately 50,000 records, suitable for national-level web-based simulation where download size and computation time matter. A low penalty ($\lambda_{L_0} = 10^{-8}$) retains approximately 3--4 million records, preserving geographic resolution for all 436 congressional districts. + +The method is implemented in open-source Python packages and deployed as part of \policyengine{}'s US microsimulation model (\url{https://policyengine.org/us/model}), which uses these calibrated datasets for both national and subnational policy analysis. + +The remainder of this paper is organized as follows. Section~\ref{sec:background} reviews survey calibration methods, spatial microsimulation, and the Hard Concrete distribution. Section~\ref{sec:data} describes the input microdata and calibration target sources. Section~\ref{sec:methodology} details the four-stage pipeline: clone creation, calibration matrix construction, $L_0$ optimization, and dataset assembly. Section~\ref{sec:results} presents calibration accuracy, sparsity analysis, and comparisons with IPF and GREG. Section~\ref{sec:discussion} discusses trade-offs, limitations, and generalizability. Section~\ref{sec:conclusion} concludes. 
diff --git a/paper-l0/sections/methodology.tex b/paper-l0/sections/methodology.tex new file mode 100644 index 00000000..82d9920f --- /dev/null +++ b/paper-l0/sections/methodology.tex @@ -0,0 +1,158 @@ +\section{Methodology} +\label{sec:methodology} + +The calibration pipeline transforms raw CPS microdata into a calibrated subnational dataset through four stages: (1) clone creation and geographic assignment, (2) calibration matrix construction, (3) $L_0$-regularized weight optimization, and (4) dataset assembly. Figure~\ref{fig:pipeline} provides an overview. + +\begin{figure}[ht] + \centering + \fbox{\parbox{0.9\textwidth}{\centering\textit{[Pipeline overview diagram to be inserted]}}} + \caption{Four-stage calibration pipeline. Stage 1 clones CPS records and assigns geography. Stage 2 builds the sparse calibration matrix by simulating each clone through the tax-benefit engine. Stage 3 runs the $L_0$ optimizer to find calibrated weights with configurable sparsity. Stage 4 expands the flat weight vector into per-area H5 dataset files.} + \label{fig:pipeline} +\end{figure} + +\subsection{Stage 1: clone creation and geographic assignment} +\label{sec:stage1} + +The base CPS contains approximately 100,000 household records, each with a single state of residence. A stratified subsampling step reduces this to approximately 12,000 households, preserving the top 0.5\% of the AGI distribution in full while uniformly sampling from the remainder. This reduction makes downstream microsimulation computationally feasible while retaining distributional diversity. + +To cover 436 congressional districts and 50 states, we replicate each of the approximately 12,000 households $N$ times ($N = 430$ in the default configuration) and assign each clone a random census block from a population-weighted distribution. 
Although the clone count is slightly below the number of congressional districts, every record is replicated 430 times across different geographic areas, providing the optimizer with sufficient degrees of freedom to place each household type wherever targets require it. + +\subsubsection{Block sampling} + +Census blocks are the finest geographic unit in the decennial census. Each block maps deterministically to a congressional district, county, tract, and state. The sampling distribution $P_{\text{pop}}(\text{block})$ is proportional to the block's share of the national population. Drawing blocks rather than congressional districts ensures fine-grained geographic variation within districts and enables derivation of county-level variables (Section~\ref{sec:stage4}). + +\subsubsection{AGI-conditional routing} + +High-income households are routed toward high-AGI congressional districts using a modified sampling distribution. Let $A_d$ denote the aggregate adjusted gross income target for district $d$ from IRS SOI data, and let $P_{\text{pop}}(b)$ denote the population-weighted block probability for block $b$ in district $d$. For households with AGI above the 90th percentile, the sampling distribution is: +\begin{equation} + P_{\text{AGI}}(b) \propto P_{\text{pop}}(b) \cdot A_{d(b)} +\end{equation} +where $d(b)$ is the congressional district containing block $b$. This makes blocks in high-AGI districts more likely targets for high-income households, improving the initial geographic allocation before optimization. Without AGI-conditional routing, the optimizer would need to zero out high-income records that landed in low-AGI districts by chance, wasting capacity for population and demographic targets in those districts. + +\subsubsection{No-collision constraint} + +The same CPS household must not appear in the same congressional district in two different clones. 
If household $h$ in clone $c_1$ lands in district $d$, then household $h$ in clone $c_2$ must land in a different district. This prevents a high-weight household from dominating a small district's targets. The constraint is enforced by resampling: clone 0 draws blocks freely; each subsequent clone checks for collisions against all previous clones and resamples colliding records for up to 50 retries. Residual collisions after 50 retries are accepted; these are rare with 430 clones and the large block distribution. + +After cloning, the total number of household-geography combinations is $N_{\text{clones}} \times n_{\text{records}}$. With 430 clones and approximately 12,000 base households (after stratified subsampling), this produces approximately 5.2 million column positions in the calibration matrix. + +\subsection{Stage 2: calibration matrix construction} +\label{sec:stage2} + +The calibration matrix $\mathbf{M} \in \R^{m \times n}$ maps $n$ household-geography combinations (columns) to $m$ calibration targets (rows). Entry $M_{ji}$ is the contribution of record $i$ to target $j$---for dollar targets, this is the household's simulated value of the target variable; for count targets, it is 1 if the household satisfies the target's domain constraint and 0 otherwise. + +\subsubsection{Per-state parallel simulation} + +The matrix is populated by running each household through \policyengine{}'s tax-benefit microsimulation engine. Because many target variables depend on state-specific tax and benefit rules, a separate simulation is required for each state. A parallel dispatcher sends one job per unique state FIPS code to a pool of worker processes. Each worker creates a fresh \texttt{Microsimulation} instance, overwrites every household's \texttt{state\_fips} with the target state, invalidates cached downstream variables, and calculates all target variables at the household and person levels, accounting for differences in state legislation. 
+ +For county-dependent variables---currently limited to ACA premium tax credits, where eligibility depends on county-level benchmark plan premiums---an additional simulation loop iterates over counties within each state, setting the county FIPS for each iteration. + +\subsubsection{Take-up re-randomization} +\label{sec:takeup} + +Several benefit programs require a stochastic take-up decision: whether an eligible household actually participates. To construct the matrix correctly under take-up uncertainty, the pipeline runs three simulations per state: (a) a baseline simulation with original take-up values; (b) an all-take-up-true simulation that computes each entity's benefit value assuming full participation; and (c) a would-file-false simulation for tax unit variables. + +The take-up rate for each program is resolved at the state level from the clone's assigned census block. Programs with take-up re-randomization include SNAP, ACA premium tax credits, Medicaid, SSI, TANF, Head Start, and Early Head Start. The pipeline draws take-up booleans per (variable, household, clone) triple using a deterministic seeded random number generator. The seed is derived from the variable name, original household ID, and clone index, ensuring that draws are reproducible and independent across clones. Different clones of the same household may have different take-up draws, reflecting the fact that the same household placed in a different geographic area would face a different local take-up rate. + +\subsubsection{Clone loop and COO assembly} + +After all per-state simulations and take-up preparation complete, a clone loop iterates over each of the $N$ clones. For each clone $c$: + +\begin{enumerate} + \item Read the geographic slice: which state and district each record maps to in clone $c$. + \item Draw per-clone take-up booleans for each benefit program using the clone's geographic assignment and the deterministic seed. 
+ \item Assemble per-record values by looking up the appropriate state simulation results for each record's assigned state. For take-up-dependent variables, entity-level values from the all-take-up-true simulation are multiplied by the per-clone take-up draw to produce the matrix entry for that clone. + \item Evaluate domain constraints---predicates that determine whether a record contributes to a particular target (e.g., ``SNAP $> 0$'' for SNAP dollar targets, ``age $\geq 18$'' for adult population counts). + \item Emit nonzero entries as coordinate (COO) triples $(\text{row}, \text{col}, \text{value})$, where $\text{col} = c \times n_{\text{records}} + i$ for record $i$ in clone $c$. +\end{enumerate} + +The COO entries from all clones are concatenated and converted to a compressed sparse row (CSR) matrix. Typical density is below 1\%, as most records contribute to targets only in their assigned geographic area. This ordering ensures that the matrix accurately represents what the calibrated weights will reproduce when applied to the final dataset. + +\subsubsection{Target configuration} + +After the full unfiltered matrix is built, a configuration file (\texttt{target\_config.yaml}) selects the subset of targets used for optimization. Each rule specifies a variable name, geographic level, and optionally a domain variable. This separation allows the same expensive matrix to be reused with different target configurations without rebuilding. + +Targets with zero row sums---meaning no record can contribute---are marked as unachievable and excluded from the loss function. 
+ +\subsection{Stage 3: $L_0$-regularized weight optimization} +\label{sec:stage3} + +\subsubsection{Loss function} + +Given the sparse calibration matrix $\mathbf{M}$, a vector of target values $\mathbf{t} \in \R^m$, and a weight vector $\bw \in \R^n$, the optimizer minimizes: +\begin{equation} + \mathcal{L}(\bw, \balpha) = \frac{1}{m}\sum_{j=1}^{m}\left(\frac{\hat{t}_j - t_j}{|t_j|}\right)^2 + \lambda_{L_0}\sum_{i=1}^{n}\bar{z}_i + \lambda_{L_2}\|\bw\|_2^2 + \label{eq:loss} +\end{equation} +where the weighted estimate for target $j$ is: +\begin{equation} + \hat{t}_j = \sum_{i=1}^{n} M_{ji} \cdot w_i \cdot z_i + \label{eq:estimate} +\end{equation} +$z_i \in [0, 1]$ is the Hard Concrete gate for record $i$ (Equations~1--4 in Section~\ref{sec:background}), $\bar{z}_i = \sigma(\log \alpha_i - \beta \log(-\gamma/\zeta))$ is the expected gate activation used as the $L_0$ penalty, $\lambda_{L_0}$ controls the sparsity-accuracy trade-off, and $\lambda_{L_2}$ provides mild weight regularization. + +The calibration loss uses \textit{relative} errors: a 1\% miss on a \$1 billion SNAP target receives the same gradient signal as a 1\% miss on a \$9 trillion employment income target. This scale-invariant formulation prevents large-magnitude targets from dominating the loss. + +\subsubsection{Hyperparameters} + +Table~\ref{tab:hyperparameters} lists the optimization hyperparameters with their values and roles. The stretch parameters $\gamma = -0.1$ and $\zeta = 1.1$ follow the original Hard Concrete paper, placing approximately 9\% of the sigmoid's mass below 0 and above 1, which is what allows clipping to produce exact zeros and ones. + +\input{tables/hyperparameters} + +\subsubsection{Initialization} + +Weights are initialized from age-bin population targets rather than uniformly. For each congressional district, the sum of age-bin target values gives the district's total population. 
The initial weight for each record assigned to that district is set to the district population divided by the number of active records in the district. This gives the optimizer a demographically grounded starting point: records in high-population districts begin with higher weights. Small Gaussian jitter in log-weight space ($\sigma = 0.05$) and log-alpha space ($\sigma = 0.01$) breaks symmetry between duplicate CPS records. + +Gate logits are initialized so that $P(z_i > 0) \approx 0.999$---nearly every record starts active. The optimizer begins from a dense, approximately calibrated state and prunes as the $L_0$ penalty accumulates over training epochs. + +\subsubsection{Presets} + +Two presets control the sparsity-accuracy trade-off via $\lambda_{L_0}$: + +\input{tables/presets} + +The presets differ only in $\lambda_{L_0}$ and epoch count; all other hyperparameters are shared. A higher $\lambda_{L_0}$ increases the gradient signal pushing gate logits below zero, pruning more records. + +\subsubsection{Optimization loop} + +The optimizer uses the Adam algorithm \citep{kingma2015} with learning rate 0.15. At each epoch: + +\begin{enumerate} + \item Sample Hard Concrete gates $z_i$ (stochastic during training). + \item Compute effective weights: $w_i^{\text{eff}} = \exp(\log w_i) \cdot z_i$. + \item Compute predictions: $\hat{t}_j = \sum_i M_{ji} \cdot w_i^{\text{eff}}$. + \item Evaluate the loss (Equation~\ref{eq:loss}). + \item Backpropagate and update $\{\log w_i, \log \alpha_i\}$ via Adam. +\end{enumerate} + +At inference, stochastic gates are replaced by their deterministic counterparts. Records where $z_i^{\text{det}} = 0$ are excluded from the output dataset. + +The current implementation runs for a fixed number of epochs (100 for regional fits, 4,000 for national fits) without early stopping. Convergence is assessed post hoc from diagnostic outputs.
+ +\subsection{Stage 4: weight expansion and dataset assembly} +\label{sec:stage4} + +The optimizer returns a flat weight vector of length $N_{\text{clones}} \times n_{\text{records}}$. Stage 4 expands this into complete H5 dataset files suitable for microsimulation. + +\subsubsection{Weight reshaping and geographic filtering} + +The flat vector is reshaped to a matrix $\mathbf{W} \in \R^{N_{\text{clones}} \times n_{\text{records}}}$. Most entries are zero. For per-area datasets (e.g., a state dataset for California), clones whose congressional district falls outside the target area are zeroed out. For city datasets (e.g., New York City), clones whose county FIPS is not in the city's set of counties are zeroed out. The nonzero entries---typically 3--4 million for the local preset or approximately 50,000 for the national preset---identify the active (clone, household) pairs. + +\subsubsection{Entity membership preservation} + +Each CPS household contains persons who belong to sub-entities: tax units, SPM units, families, and marital units. The assembly step looks up the membership mapping for each active household and concatenates the corresponding person and sub-entity indices across all active clones. Entity IDs are reassigned to globally unique values using a compound key $(\text{clone\_id} \times \text{offset} + \text{old\_id})$ with binary search remapping, achieving $O(n \log n)$ performance. + +\subsubsection{Geographic variable derivation} + +Each active clone carries a census block GEOID from the geography assignment. The assembly step derives state FIPS, county, tract, CBSA, SLDU, SLDL, place, VTD, PUMA, and ZCTA from the block GEOID. Because many clones share the same block (especially in dense urban areas), block GEOIDs are deduplicated before derivation and results are broadcast back via an inverse index. 
+ +\subsubsection{SPM threshold recalculation} + +The Supplemental Poverty Measure threshold depends on local housing costs via a geographic adjustment factor. Because each clone's geography differs from the base dataset, SPM thresholds are recomputed using: +\begin{equation} + \text{threshold}_k = \text{base\_threshold}(\text{tenure}_k) \times \text{equiv}(n_{\text{adults}}, n_{\text{children}}) \times \text{geoadj}(\text{CD}_k) +\end{equation} +where the geographic adjustment factor for each congressional district is derived from median two-bedroom rent ratios from the Census Bureau. + +\subsubsection{Take-up consistency} + +The H5 assembly must produce identical take-up draws as the matrix builder for every (variable, household, clone) triple. Both stages call the same function (\texttt{compute\_block\_takeup\_for\_entities}) with the same seed derived from (variable name, original household ID, clone index). Any divergence would mean the matrix targeted a different subpopulation than what appears in the final dataset. diff --git a/paper-l0/sections/results.tex b/paper-l0/sections/results.tex new file mode 100644 index 00000000..c56f7606 --- /dev/null +++ b/paper-l0/sections/results.tex @@ -0,0 +1,61 @@ +\section{Results} +\label{sec:results} + +This section presents calibration accuracy, sparsity analysis, convergence behavior, and comparisons with alternative methods. All empirical values are marked as placeholders pending completed pipeline runs. + +\subsection{Calibration accuracy by geographic level} + +Table~\ref{tab:calibration_accuracy} reports mean, median, and maximum absolute relative error across achievable targets, disaggregated by geographic level. + +\input{tables/calibration_accuracy} + +The mean absolute relative error across all \tbc[total count] achievable targets is \tbc[mean error]\%. At the national level, where the optimizer has the most flexibility (every record contributes), error is lowest at \tbc[national mean]\%. 
State-level error averages \tbc[state mean]\%, and congressional district error averages \tbc[district mean]\%. + +\subsubsection{Error by target source} + +Error varies systematically across target sources. IRS SOI income targets, which have large magnitudes and strong signal in the calibration matrix, achieve mean error of \tbc[IRS mean]\%. ACS age-band targets, which are count-based and distributed across 18 narrow bins per district, achieve mean error of \tbc[ACS mean]\%. SNAP household count targets, which depend on the take-up re-randomization mechanism, achieve mean error of \tbc[SNAP mean]\%. + +\subsubsection{Unachievable targets} + +Of the approximately 37,800 targets, \tbc[count] are marked unachievable (row sum zero in the calibration matrix). These correspond to congressional districts where no clones carry nonzero values for the target variable. Increasing the clone count from 430 reduces the number of unachievable targets, at the cost of a larger calibration matrix. + +\subsection{Sparsity analysis} + +The $L_0$ penalty produces exact zeros in the weight vector. Table~\ref{tab:presets} reports the number of retained records under each preset. + +Under the national preset ($\lambda_{L_0} = 10^{-4}$), the optimizer retains approximately \tbc[national count] records from the initial \tbc[total] million, a reduction of \tbc[national pct]\%. Under the local preset ($\lambda_{L_0} = 10^{-8}$), approximately \tbc[local count] million records are retained (\tbc[local pct]\% reduction). The national preset provides a compact dataset suitable for in-browser simulation; the local preset preserves geographic resolution across all 436 congressional districts. + +\subsubsection{Weight distribution} + +Under the national preset, the weight distribution spans from \tbc[min weight] to \tbc[max weight], with a median of \tbc[median weight]. 
The effective sample size (ESS), computed as $(\sum w_i)^2 / \sum w_i^2$, is \tbc[ESS] under the national preset, representing \tbc[ESS pct]\% of the nominal sample size. + +\subsection{Convergence behavior} + +Figure~\ref{fig:convergence} plots the mean absolute relative error across all achievable targets as a function of training epoch for both presets. + +\begin{figure}[ht] + \centering + \fbox{\parbox{0.9\textwidth}{\centering\textit{[Convergence curves to be generated from calibration\_log.csv]}}} + \caption{Mean absolute relative error over training epochs for the national preset (4,000 epochs, $\lambda_{L_0} = 10^{-4}$) and the local preset (100 epochs, $\lambda_{L_0} = 10^{-8}$).} + \label{fig:convergence} +\end{figure} + +Under the national preset, the mean error decreases from \tbc[initial]\% at epoch 0 to \tbc[final]\% at epoch 4,000. The error curve flattens by approximately epoch \tbc[plateau epoch], suggesting that the remaining error reflects structural limitations of the calibration matrix rather than insufficient training. Under the local preset, the mean error decreases from \tbc[local initial]\% to \tbc[local final]\% over 100 epochs. + +\subsubsection{Residual error analysis} + +Targets with persistent high error after convergence fall into two categories. First, targets in sparsely populated congressional districts where few clones have nonzero matrix entries---the optimizer has insufficient degrees of freedom. Second, targets where the domain constraint is restrictive (e.g., EITC by number of qualifying children in a specific district), limiting the pool of contributing records. + +\subsection{Comparison with alternative methods} + +Table~\ref{tab:comparison} compares the $L_0$ method with iterative proportional fitting (IPF) and generalized regression (GREG) estimators applied to the same calibration matrix and targets. 
+ +\input{tables/comparison} + +\tbc[Comparison results to be populated after running IPF and GREG baselines on the same target set. Expected advantages of L0: joint multi-level calibration in a single pass, configurable sparsity, scalability to 37,800 targets. Expected disadvantages: longer computation time (GPU-based gradient descent vs closed-form or iterative solutions), sensitivity to hyperparameter selection.] + +\subsection{Application: subnational policy impact} + +To demonstrate the practical utility of the calibrated subnational datasets, we analyze the distributional impact of a policy reform at the state and congressional district level using \policyengine{}'s microsimulation engine. + +\tbc[Policy application to be selected and simulated. Candidate: analysing EITC expansion impacts across congressional districts, showing how district-level datasets enable geographic disaggregation of winners and losers that national datasets cannot provide.] diff --git a/paper-l0/tables/calibration_accuracy.tex b/paper-l0/tables/calibration_accuracy.tex new file mode 100644 index 00000000..14f7d16c --- /dev/null +++ b/paper-l0/tables/calibration_accuracy.tex @@ -0,0 +1,19 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{lrrrr} +\toprule +Geographic level & Targets & Mean ARE (\%) & Median ARE (\%) & Max ARE (\%) \\ +\midrule +National & \tbc & \tbc & \tbc & \tbc \\ +State & \tbc & \tbc & \tbc & \tbc \\ +Congressional district & \tbc & \tbc & \tbc & \tbc \\ +\midrule +\textbf{All achievable} & \tbc & \tbc & \tbc & \tbc \\ +\bottomrule +\end{tabular} +} +\caption{Calibration accuracy by geographic level, measured as absolute relative error (ARE) across achievable targets under the local preset ($\lambda_{L_0} = 10^{-8}$).} +\label{tab:calibration_accuracy} +\tablenote{Note: \tbc[Values to be populated from \texttt{unified\_diagnostics.csv} after a completed pipeline run.]} +\end{table} diff --git a/paper-l0/tables/comparison.tex 
b/paper-l0/tables/comparison.tex new file mode 100644 index 00000000..d2dc5efa --- /dev/null +++ b/paper-l0/tables/comparison.tex @@ -0,0 +1,17 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{lrrrr} +\toprule +Method & Mean ARE (\%) & Median ARE (\%) & Runtime & Sparsity control \\ +\midrule +$L_0$ (this paper) & \tbc & \tbc & \tbc & Yes (configurable $\lambda_{L_0}$) \\ +IPF & \tbc & \tbc & \tbc & No \\ +GREG & \tbc & \tbc & \tbc & No \\ +\bottomrule +\end{tabular} +} +\caption{Comparison of $L_0$ regularization with iterative proportional fitting (IPF) and generalized regression (GREG) estimators on the same calibration matrix and target set.} +\label{tab:comparison} +\tablenote{Note: \tbc[Results to be populated after running IPF and GREG baselines. ARE = absolute relative error across all achievable targets.]} +\end{table} diff --git a/paper-l0/tables/hyperparameters.tex b/paper-l0/tables/hyperparameters.tex new file mode 100644 index 00000000..468299b2 --- /dev/null +++ b/paper-l0/tables/hyperparameters.tex @@ -0,0 +1,21 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{lll} +\toprule +Parameter & Value & Role \\ +\midrule +$\beta$ (gate temperature) & 0.35 & Sharpness of 0/1 gate transition \\ +$\gamma$ (left stretch) & $-0.1$ & Enables exact-zero gates after clipping \\ +$\zeta$ (right stretch) & 1.1 & Enables exact-one gates after clipping \\ +Initial keep probability & 0.999 & All records start nearly fully active \\ +$\lambda_{L_2}$ & $10^{-12}$ & Mild weight regularization \\ +Learning rate & 0.15 & Adam optimizer step size \\ +Clone count $N$ & 430 & Geographic replicas per CPS household \\ +\bottomrule +\end{tabular} +} +\caption{Hard Concrete gate and optimization hyperparameters. 
The stretch parameters $\gamma$ and $\zeta$ follow \citet{louizos2018}; other values are tuned for the calibration setting.} +\label{tab:hyperparameters} +\tablenote{Source: \texttt{unified\_calibration.py} in the \texttt{policyengine-us-data} repository.} +\end{table} diff --git a/paper-l0/tables/presets.tex b/paper-l0/tables/presets.tex new file mode 100644 index 00000000..a574a1b4 --- /dev/null +++ b/paper-l0/tables/presets.tex @@ -0,0 +1,16 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{llrrl} +\toprule +Preset & $\lambda_{L_0}$ & Epochs & Retained records & Use case \\ +\midrule +National & $10^{-4}$ & 4,000 & $\sim$50,000 & Web-based interactive simulation \\ +Local & $10^{-8}$ & 100 & $\sim$3--4 million & Subnational policy analysis \\ +\bottomrule +\end{tabular} +} +\caption{Sparsity presets. The only difference between presets is $\lambda_{L_0}$ and epoch count; all other hyperparameters (Table~\ref{tab:hyperparameters}) are shared.} +\label{tab:presets} +\tablenote{Note: Retained record counts are approximate and vary with the base CPS size and clone count.} +\end{table} diff --git a/paper-l0/tables/target_summary.tex b/paper-l0/tables/target_summary.tex new file mode 100644 index 00000000..100e952c --- /dev/null +++ b/paper-l0/tables/target_summary.tex @@ -0,0 +1,32 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{p{0.55\textwidth}rr} +\toprule +Source & Targets & Geographies \\ +\midrule +\multicolumn{3}{l}{\textit{Congressional district level}} \\ +IRS SOI (AGI, income tax, EITC, 21 income/ded.\ domains) & 25{,}288 & 436 CDs \\ +Census ACS S0101 (person count by 18 age bands) & 7{,}848 & 436 CDs \\ +Census ACS S2201 (SNAP household count) & 436 & 436 CDs \\ +\midrule +\multicolumn{3}{l}{\textit{State level}} \\ +IRS SOI (same domains as district) & 2{,}958 & 51 states \\ +Census ACS S0101 (person count by 18 age bands) & 918 & 51 states \\ +USDA FNS SNAP (spending + household count) & 102 & 51 states \\ +CMS Medicaid 
(enrollment) & 51 & 51 states \\ +Census STC (state income tax collections) & 51 & 51 states \\ +\midrule +\multicolumn{3}{l}{\textit{National level}} \\ +IRS SOI (domain-constrained aggregates, EITC) & 51 & 1 \\ +Curated totals (CBO, JCT, SSA, CMS, Census) & 37 & 1 \\ +Census ACS S0101 (person count by 18 age bands) & 18 & 1 \\ +\midrule +\textbf{Total} & \textbf{37{,}758} & \\ +\bottomrule +\end{tabular} +} +\caption{Calibration targets by geographic level and source. District targets dominate by count. Full variable listing in Appendix~\ref{app:targets}.} +\label{tab:target_summary} +\tablenote{Source: \texttt{policy\_data.db} active target count.} +\end{table} diff --git a/tests/unit/test_benchmarking_runners.py b/tests/unit/test_benchmarking_runners.py new file mode 100644 index 00000000..1ff587f9 --- /dev/null +++ b/tests/unit/test_benchmarking_runners.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import importlib.util +import json +import shutil +import subprocess +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +from scipy.io import mmwrite +from scipy.sparse import csr_matrix + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BENCHMARK_DIR = REPO_ROOT / "paper-l0" / "benchmarking" +BENCHMARK_CLI_PATH = BENCHMARK_DIR / "benchmark_cli.py" + + +def _r_package_available(package: str) -> bool: + proc = subprocess.run( + [ + "Rscript", + "-e", + f"quit(status = if (requireNamespace('{package}', quietly = TRUE)) 0 else 1)", + ], + check=False, + capture_output=True, + text=True, + ) + return proc.returncode == 0 + + +def _load_benchmark_cli_module(): + benchmark_dir_str = str(BENCHMARK_DIR) + if benchmark_dir_str not in sys.path: + sys.path.insert(0, benchmark_dir_str) + spec = importlib.util.spec_from_file_location( + "benchmark_cli_for_tests", BENCHMARK_CLI_PATH + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + 
def _write_common_inputs(
    run_dir: Path,
    matrix,
    target_values: list[float],
    variables: list[str],
    geo_levels: list[str] | None = None,
    target_names: list[str] | None = None,
    initial_weights: np.ndarray | None = None,
    method_options: dict | None = None,
) -> Path:
    """Lay out the shared exported bundle for a benchmark run.

    Writes the sparse target-by-unit matrix (MatrixMarket), the initial
    .npy weights, the target metadata CSV, and a minimal manifest under
    ``run_dir/inputs``, creating ``run_dir/outputs`` as well. Returns the
    ``inputs`` directory so callers can add method-specific files.
    """
    inputs_dir = run_dir / "inputs"
    outputs_dir = run_dir / "outputs"
    for directory in (inputs_dir, outputs_dir):
        directory.mkdir(parents=True, exist_ok=True)

    mmwrite(str(inputs_dir / "X_targets_by_units.mtx"), matrix)

    # Default to unit starting weights, one per matrix column.
    weights = (
        np.ones(matrix.shape[1], dtype=np.float64)
        if initial_weights is None
        else initial_weights
    )
    np.save(
        inputs_dir / "initial_weights.npy", np.asarray(weights, dtype=np.float64)
    )

    n_targets = len(target_values)
    levels = ["national"] * n_targets if geo_levels is None else geo_levels
    names = (
        [f"target_{idx}" for idx in range(n_targets)]
        if target_names is None
        else target_names
    )

    pd.DataFrame(
        {
            "value": np.asarray(target_values, dtype=np.float64),
            "variable": variables,
            "geo_level": levels,
            "target_name": names,
        }
    ).to_csv(inputs_dir / "target_metadata.csv", index=False)

    manifest = {
        "method_options": method_options or {},
    }
    (inputs_dir / "benchmark_manifest.json").write_text(json.dumps(manifest))

    return inputs_dir


@pytest.fixture
def benchmark_cli_module(monkeypatch, tmp_path_factory):
    """Load benchmark_cli.py with cache directories redirected to temp space."""
    cache_root = tmp_path_factory.mktemp("benchmarking-cache")
    for env_var, subdir in (("MPLCONFIGDIR", "mpl"), ("XDG_CACHE_HOME", "xdg")):
        monkeypatch.setenv(env_var, str(cache_root / subdir))
    return _load_benchmark_cli_module()


@pytest.fixture(autouse=True)
def _require_rscript():
    # Every runner shells out to R, so the whole module needs Rscript on PATH.
    if shutil.which("Rscript") is None:
        pytest.skip("Rscript is required for benchmarking runner tests")


def test_greg_runner_end_to_end_exact_fit(benchmark_cli_module, tmp_path):
    """GREG on a 2x2 identity design should hit both targets exactly."""
    if not _r_package_available("survey"):
        pytest.skip("R package 'survey' is required for this test")

    run_dir = tmp_path / "greg-run"
    design = csr_matrix(np.eye(2, dtype=np.float64))
    _write_common_inputs(
        run_dir=run_dir,
        matrix=design,
        target_values=[2.0, 3.0],
        variables=["household_count", "person_count"],
        method_options={"greg": {"maxit": 50, "epsilon": 1e-10}},
    )

    weights_path, _ = benchmark_cli_module._run_greg(run_dir)
    fitted = np.load(weights_path)

    expected = np.array([2.0, 3.0])
    np.testing.assert_allclose(fitted, expected, atol=1e-8, rtol=1e-8)
    np.testing.assert_allclose(design.dot(fitted), expected, atol=1e-8)


def test_ipf_runner_end_to_end_numeric_total_person_scope(
    benchmark_cli_module, tmp_path
):
    """IPF with one person-scope numeric-total target yields unit weights."""
    if not _r_package_available("surveysd"):
        pytest.skip("R package 'surveysd' is required for this test")

    run_dir = tmp_path / "ipf-numeric-run"
    design = csr_matrix(np.array([[1.0, 0.0]], dtype=np.float64))
    inputs = _write_common_inputs(
        run_dir=run_dir,
        matrix=design,
        target_values=[1.0],
        variables=["person_count"],
        method_options={
            "ipf": {"max_iter": 500, "bound": 20.0, "epsP": 1e-4, "epsH": 1e-4}
        },
    )

    # Two person rows for household 0, one for household 1; only the first
    # person carries the indicator feeding the numeric target.
    pd.DataFrame(
        {
            "unit_index": [0, 0, 1],
            "household_id": [0, 0, 1],
            "benchmark_all": ["all", "all", "all"],
            "ipf_indicator_00000": [1, 0, 0],
        }
    ).to_csv(inputs / "unit_metadata.csv", index=False)

    pd.DataFrame(
        {
            "scope": ["person"],
            "target_type": ["numeric_total"],
            "value_column": ["ipf_indicator_00000"],
            "variables": ["benchmark_all"],
            "cell": ["benchmark_all=all"],
            "target_value": [1.0],
            "target_name": ["under_5_people"],
            "source_variable": ["person_count"],
            "stratum_id": [1],
        }
    ).to_csv(inputs / "ipf_target_metadata.csv", index=False)

    weights_path, _ = benchmark_cli_module._run_ipf(run_dir)
    fitted = np.load(weights_path)

    np.testing.assert_allclose(
        fitted, np.array([1.0, 1.0]), atol=1e-8, rtol=1e-8
    )
    np.testing.assert_allclose(
        design.dot(fitted), np.array([1.0]), atol=1e-8, rtol=1e-8
    )


def test_ipf_runner_end_to_end_categorical_margin_household_scope(
    benchmark_cli_module, tmp_path
):
    """IPF with a household-scope categorical margin calibrates each cell."""
    if not _r_package_available("surveysd"):
        pytest.skip("R package 'surveysd' is required for this test")

    run_dir = tmp_path / "ipf-margin-run"
    design = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float64))
    inputs = _write_common_inputs(
        run_dir=run_dir,
        matrix=design,
        target_values=[2.0, 2.0],
        variables=["household_count", "household_count"],
        method_options={
            "ipf": {"max_iter": 50, "bound": 10.0, "epsP": 1e-9, "epsH": 1e-9}
        },
    )

    # One household per margin cell of the "snap" variable.
    pd.DataFrame(
        {
            "unit_index": [0, 1],
            "household_id": [0, 1],
            "snap": ["yes", "no"],
        }
    ).to_csv(inputs / "unit_metadata.csv", index=False)

    pd.DataFrame(
        {
            "scope": ["household", "household"],
            "target_type": ["categorical_margin", "categorical_margin"],
            "margin_id": ["snap_margin", "snap_margin"],
            "variables": ["snap", "snap"],
            "cell": ["snap=yes", "snap=no"],
            "target_value": [2.0, 2.0],
        }
    ).to_csv(inputs / "ipf_target_metadata.csv", index=False)

    weights_path, _ = benchmark_cli_module._run_ipf(run_dir)
    fitted = np.load(weights_path)

    expected = np.array([2.0, 2.0])
    np.testing.assert_allclose(fitted, expected, atol=1e-8, rtol=1e-8)
    np.testing.assert_allclose(design.dot(fitted), expected, atol=1e-8)