From e3e0c7ed7a285e4a4872cd4b59e432fe5bae5bda Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 26 Apr 2026 13:19:43 -0400
Subject: [PATCH 1/5] Add public transfer artifact manifest

---
 .github/CONTRIBUTING.md                       |   3 +-
 .../upload-public-transfer-dataset.yaml       |  26 ++
 Makefile                                      |   6 +
 README.md                                     |  10 +
 docs/myst.yml                                 |   1 +
 docs/public_transfer_dataset.md               |  48 ++++
 policyengine_uk_data/datasets/__init__.py     |   2 +
 policyengine_uk_data/datasets/enhanced_cps.py |   1 +
 .../datasets/policybench_transfer.py          |   8 +-
 .../storage/enhanced_cps_manifest_2025.json   |  90 +++++++
 .../storage/upload_public_transfer_dataset.py |  29 ++
 .../storage/write_enhanced_cps_manifest.py    |   7 +
 .../test_enhanced_cps_artifact_manifest.py    | 137 ++++++++++
 .../utils/enhanced_cps_manifest.py            | 248 ++++++++++++++++++
 uv.lock                                       |   2 +-
 15 files changed, 615 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/upload-public-transfer-dataset.yaml
 create mode 100644 docs/public_transfer_dataset.md
 create mode 100644 policyengine_uk_data/storage/enhanced_cps_manifest_2025.json
 create mode 100644 policyengine_uk_data/storage/upload_public_transfer_dataset.py
 create mode 100644 policyengine_uk_data/storage/write_enhanced_cps_manifest.py
 create mode 100644 policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
 create mode 100644 policyengine_uk_data/utils/enhanced_cps_manifest.py

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index ffc952586..0705841ec 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -30,7 +30,8 @@ This repo builds the `.h5` files that feed `policyengine-uk`:
 
 The enhanced FRS dataset is licensed under strict UK Data Service terms. Violating them risks losing access, which would end PolicyEngine UK.
 
-- **Never upload data to any public location.** The HuggingFace repo `policyengine/policyengine-uk-data-private` is private and authenticated.
+- **Never upload FRS-derived or UKDS-licensed data to any public location.** The HuggingFace repo `policyengine/policyengine-uk-data-private` is private and authenticated.
+- The public transfer artifacts documented in `docs/public_transfer_dataset.md` are the narrow exception. Upload them only through `make upload-public-transfer`, which targets the public repo intentionally.
 - **Never modify `upload_completed_datasets.py` or `utils/data_upload.py`** to change upload destinations without explicit confirmation from the data controller (currently Nikhil Woodruff).
 - **Never print, log, or output individual-level records.** Aggregates (sums, means, counts, weighted totals) are fine; individual rows are not.
 - **If you see a private/public repo split, assume it is intentional** — ask why before changing it.
diff --git a/.github/workflows/upload-public-transfer-dataset.yaml b/.github/workflows/upload-public-transfer-dataset.yaml
new file mode 100644
index 000000000..79d9827f5
--- /dev/null
+++ b/.github/workflows/upload-public-transfer-dataset.yaml
@@ -0,0 +1,26 @@
+name: Upload public transfer dataset
+
+on:
+  workflow_dispatch:
+
+jobs:
+  upload-public-transfer:
+    runs-on: ubuntu-latest
+    env:
+      HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"  # quoted: bare YAML floats drop trailing zeros
+      - name: Install package
+        run: uv pip install -e ".[dev]" --system
+      - name: Verify public transfer artifact contract
+        run: |
+          pytest policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
+          pytest policyengine_uk_data/tests/test_policybench_transfer.py
+      - name: Upload public transfer artifacts
+        run: make upload-public-transfer
diff --git a/Makefile b/Makefile
index 30955d252..2e2bb699d 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,12 @@ download:
 upload:
 	python policyengine_uk_data/storage/upload_completed_datasets.py
 
+enhanced-cps-manifest:
+	python policyengine_uk_data/storage/write_enhanced_cps_manifest.py
+
+upload-public-transfer:
+	python policyengine_uk_data/storage/upload_public_transfer_dataset.py
+
 documentation:
 	pip install --pre "jupyter-book>=2"
 	jb clean docs && jb build docs
diff --git a/README.md b/README.md
index 4f6de4a32..d0f8b2e07 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ This repo now also includes a public calibrated microdata file:
 
 - `policyengine_uk_data/storage/enhanced_cps_2025.h5`
 - source manifest: `policyengine_uk_data/storage/enhanced_cps_source_2025.csv`
+- artifact manifest: `policyengine_uk_data/storage/enhanced_cps_manifest_2025.json`
 
 The public UK calibrated transfer dataset starts from a public export of eligible households from
 PolicyEngine-US Enhanced CPS. In the current build that source manifest contains
@@ -47,11 +48,20 @@ This is a public calibrated dataset, not a replacement for the FRS or enhanced
 FRS. It is intended as the first step in a broader cross-country public-microdata
 strategy.
 
+The legacy `policybench_transfer_2025.h5` and
+`policybench_transfer_source_2025.csv` files remain the original
+1,000-household proof-of-method artifacts. The Python
+`create_policybench_transfer` and `save_policybench_transfer` entry points are
+backward-compatible aliases for the current `enhanced_cps` builder; calling
+them does not regenerate the legacy 1,000-household files.
+
 Programmatic entrypoints:
 
 - `policyengine_uk_data.datasets.create_enhanced_cps`
 - `policyengine_uk_data.datasets.export_enhanced_cps_source`
 - `policyengine_uk_data.datasets.save_enhanced_cps`
+- `make enhanced-cps-manifest` (Make target: rewrites the artifact manifest)
+- `make upload-public-transfer` (Make target: uploads the public artifacts)
 
 Backward-compatible aliases remain available:
 
diff --git a/docs/myst.yml b/docs/myst.yml
index 29f21f0a3..1fd8ceb55 100644
--- a/docs/myst.yml
+++ b/docs/myst.yml
@@ -9,6 +9,7 @@ project:
   github: policyengine/policyengine-uk-data
   toc:
     - file: intro.md
+    - file: public_transfer_dataset.md
     - file: methodology.ipynb
     - file: imputations.md
     - file: validation/index.md
diff --git a/docs/public_transfer_dataset.md b/docs/public_transfer_dataset.md
new file mode 100644
index 000000000..9612ae6ca
--- /dev/null
+++ b/docs/public_transfer_dataset.md
@@ -0,0 +1,48 @@
+# Public UK transfer dataset
+
+The public UK transfer dataset is an openly distributable benchmark artifact.
+It is not native UK survey microdata.
+
+The 2025 artifact starts from a public export of benchmark-compatible
+PolicyEngine US Enhanced CPS households. The builder maps those households into
+UK-facing PolicyEngine inputs, assigns synthetic UK geography, populates input
+leaves such as council tax bands, vehicle ownership, pensions, disability/PIP,
+consumption, and capital gains, and recalibrates household weights to selected
+UK national, regional, and country targets.
+
+The current public artifact consists of three files:
+
+- `policyengine_uk_data/storage/enhanced_cps_2025.h5`
+- `policyengine_uk_data/storage/enhanced_cps_source_2025.csv`
+- `policyengine_uk_data/storage/enhanced_cps_manifest_2025.json`
+
+The artifact manifest is the source of record for row counts, checksums, build
+assumptions, weight diagnostics, and loss diagnostics. The checked-in 2025
+manifest reports 28,532 households in both the source CSV and H5 file, 58,848
+people in the H5 file, an effective sample size of about 11,197 households, and
+a top-10 household-weight share of about 0.52%.
+
+## Intended use
+
+Use this dataset for public demos, reproducible examples, and public benchmark
+analysis where restricted UK microdata cannot be redistributed.
+
+Do not use this dataset as a substitute for FRS or enhanced FRS, as evidence of
+the UK joint household distribution, or as administrative ground truth. Aggregate
+calibration can improve target fit without recovering the native UK joint
+distribution.
+
+## Versioning
+
+The public artifact should be cited by path, manifest, package version, and
+checksum. The 2025 artifact uses a pinned USD-to-GBP exchange rate of 0.759 from
+the IRS 2025 yearly average exchange-rate table. The builder deliberately does
+not call a live foreign-exchange API.
+
+## Legacy files
+
+The `policybench_transfer_2025.h5` and `policybench_transfer_source_2025.csv`
+files are retained as the original 1,000-household proof-of-method artifacts.
+The Python entry points named `create_policybench_transfer` and
+`save_policybench_transfer` are aliases for the current 28,532-household
+`enhanced_cps` builder.
diff --git a/policyengine_uk_data/datasets/__init__.py b/policyengine_uk_data/datasets/__init__.py
index c7db400b3..84d8add24 100644
--- a/policyengine_uk_data/datasets/__init__.py
+++ b/policyengine_uk_data/datasets/__init__.py
@@ -1,5 +1,6 @@
 from .enhanced_cps import (
     ENHANCED_CPS_FILE,
+    ENHANCED_CPS_MANIFEST_FILE,
     ENHANCED_CPS_SOURCE_FILE,
     create_enhanced_cps,
     export_enhanced_cps_source,
@@ -15,6 +16,7 @@
 
 __all__ = [
     "ENHANCED_CPS_FILE",
+    "ENHANCED_CPS_MANIFEST_FILE",
     "ENHANCED_CPS_SOURCE_FILE",
     "create_enhanced_cps",
     "export_enhanced_cps_source",
diff --git a/policyengine_uk_data/datasets/enhanced_cps.py b/policyengine_uk_data/datasets/enhanced_cps.py
index 5819573d4..8ba026511 100644
--- a/policyengine_uk_data/datasets/enhanced_cps.py
+++ b/policyengine_uk_data/datasets/enhanced_cps.py
@@ -23,6 +23,7 @@
 
 ENHANCED_CPS_SOURCE_FILE = STORAGE_FOLDER / "enhanced_cps_source_2025.csv"
 ENHANCED_CPS_FILE = STORAGE_FOLDER / "enhanced_cps_2025.h5"
+ENHANCED_CPS_MANIFEST_FILE = STORAGE_FOLDER / "enhanced_cps_manifest_2025.json"
 COUNCIL_TAX_BANDS_FILE = STORAGE_FOLDER / "council_tax_bands_2024.csv"
 
 # Build assumptions are pinned so the checked-in H5 is reproducible. Do not
diff --git a/policyengine_uk_data/datasets/policybench_transfer.py b/policyengine_uk_data/datasets/policybench_transfer.py
index 360202de6..baa144c28 100644
--- a/policyengine_uk_data/datasets/policybench_transfer.py
+++ b/policyengine_uk_data/datasets/policybench_transfer.py
@@ -1,4 +1,10 @@
-"""Backward-compatible aliases for the public UK enhanced CPS dataset."""
+"""Backward-compatible aliases for the current public UK transfer builder.
+
+The checked-in ``policybench_transfer_2025`` artifacts remain the original
+1,000-household proof-of-method files for historical comparison. The Python
+entry points below intentionally alias the current 28,532-household
+``enhanced_cps_2025`` builder instead of recreating those legacy artifacts.
+"""
 
 from policyengine_uk_data.datasets.enhanced_cps import (
     ENHANCED_CPS_SOURCE_FILE as POLICYBENCH_TRANSFER_SOURCE_FILE,
diff --git a/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json b/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json
new file mode 100644
index 000000000..89a6d0878
--- /dev/null
+++ b/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json
@@ -0,0 +1,90 @@
+{
+  "artifact": "enhanced_cps_2025",
+  "build": {
+    "build_command": "uv run --python 3.13 python -m policyengine_uk_data.storage.write_enhanced_cps_manifest",
+    "builder": "policyengine_uk_data.datasets.save_enhanced_cps",
+    "calibrated": true,
+    "calibration_target_year": 2025,
+    "exchange_rate": {
+      "live_api_called": false,
+      "source_url": "https://www.irs.gov/individuals/international-taxpayers/yearly-average-currency-exchange-rates",
+      "usd_to_gbp": 0.759
+    },
+    "fiscal_year": 2025,
+    "source_dataset": "PolicyEngine US Enhanced CPS public export",
+    "source_scope": "benchmark-compatible households"
+  },
+  "description": "Public UK calibrated transfer dataset derived from a public export of benchmark-compatible PolicyEngine US Enhanced CPS households.",
+  "files": {
+    "artifact": {
+      "git_blob_sha": "6184d619b15497b00fea2d349a4af041f85d17a0",
+      "last_modified_commit": "9514dfb7ec607897c9f7122a2e073b922c9fd8b6",
+      "path": "policyengine_uk_data/storage/enhanced_cps_2025.h5",
+      "sha256": "199ebc61d29231b4799ad337a95393765b5fb5aede1834b93ff2acecceded866",
+      "size_bytes": 23379906
+    },
+    "source_manifest": {
+      "git_blob_sha": "b94400dd8d729bcf7807ddc59a920c6f48bee2dd",
+      "last_modified_commit": "9514dfb7ec607897c9f7122a2e073b922c9fd8b6",
+      "path": "policyengine_uk_data/storage/enhanced_cps_source_2025.csv",
+      "sha256": "ab69ba5cec8c079a7cacd1494bc6766139e3c6e07e779e3719a57be60e039892",
+      "size_bytes": 58751828
+    }
+  },
+  "generated_at": "2026-04-26T17:15:02.755838Z",
+  "intended_uses": [
+    "public demos",
+    "reproducible examples",
+    "public benchmark analysis"
+  ],
+  "loss_diagnostics": {
+    "calibrated": {
+      "excludes_zero_target_relative_errors": true,
+      "mean_abs_relative_error": 0.330212,
+      "median_abs_relative_error": 0.191874,
+      "nonfinite_relative_error_count": 1,
+      "nonzero_finite_target_count": 590,
+      "p90_abs_relative_error": 1.0,
+      "share_within_10pct": 0.284746,
+      "share_within_25pct": 0.608475,
+      "target_count": 591,
+      "zero_target_count": 1
+    },
+    "raw_transfer_weights": {
+      "excludes_zero_target_relative_errors": true,
+      "mean_abs_relative_error": 3.418089,
+      "median_abs_relative_error": 1.575202,
+      "nonfinite_relative_error_count": 1,
+      "nonzero_finite_target_count": 590,
+      "p90_abs_relative_error": 5.829189,
+      "share_within_10pct": 0.022034,
+      "share_within_25pct": 0.052542,
+      "target_count": 591,
+      "zero_target_count": 1
+    },
+    "target_year": 2025
+  },
+  "not_intended_uses": [
+    "substitution for FRS or enhanced FRS",
+    "precise UK distributional analysis",
+    "administrative truth validation"
+  ],
+  "row_counts": {
+    "h5_benunits": 28532,
+    "h5_households": 28532,
+    "h5_people": 58848,
+    "source_households": 28532
+  },
+  "schema_version": 1,
+  "weight_diagnostics": {
+    "effective_sample_size": 11196.873445,
+    "max_household_weight": 20155.812507,
+    "min_household_weight": 1.0000000000000003e-09,
+    "near_zero_weight_count": 1408,
+    "near_zero_weight_threshold": 1e-06,
+    "top_100_share": 0.033223773,
+    "top_10_share": 0.005239509,
+    "top_1_share": 0.000714643,
+    "total_household_weight": 28204012.49725
+  }
+}
diff --git a/policyengine_uk_data/storage/upload_public_transfer_dataset.py b/policyengine_uk_data/storage/upload_public_transfer_dataset.py
new file mode 100644
index 000000000..ab5f507da
--- /dev/null
+++ b/policyengine_uk_data/storage/upload_public_transfer_dataset.py
@@ -0,0 +1,29 @@
+"""Upload the explicitly public UK calibrated transfer dataset artifacts."""
+
+from policyengine_uk_data.datasets import (
+    ENHANCED_CPS_FILE,
+    ENHANCED_CPS_MANIFEST_FILE,
+    ENHANCED_CPS_SOURCE_FILE,
+)
+from policyengine_uk_data.utils.data_upload import upload_files_to_hf
+from policyengine_uk_data.utils.hf_destinations import PUBLIC_REPO
+
+
+def upload_public_transfer_dataset() -> None:
+    """Upload the three public artifacts to ``PUBLIC_REPO``, failing fast."""
+    files = [
+        ENHANCED_CPS_FILE,
+        ENHANCED_CPS_SOURCE_FILE,
+        ENHANCED_CPS_MANIFEST_FILE,
+    ]
+
+    # Check the files before uploading anything; the first missing one aborts.
+    missing = next((path for path in files if not path.exists()), None)
+    if missing is not None:
+        raise ValueError(f"File {missing} does not exist.")
+
+    upload_files_to_hf(files=files, hf_repo_name=PUBLIC_REPO)
+
+
+if __name__ == "__main__":
+    upload_public_transfer_dataset()
diff --git a/policyengine_uk_data/storage/write_enhanced_cps_manifest.py b/policyengine_uk_data/storage/write_enhanced_cps_manifest.py
new file mode 100644
index 000000000..cfdb1c45a
--- /dev/null
+++ b/policyengine_uk_data/storage/write_enhanced_cps_manifest.py
@@ -0,0 +1,7 @@
+"""Write the public enhanced CPS artifact manifest."""
+
+from policyengine_uk_data.utils.enhanced_cps_manifest import main
+
+
+if __name__ == "__main__":
+    main()
diff --git a/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
new file mode 100644
index 000000000..a9922311c
--- /dev/null
+++ b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
@@ -0,0 +1,137 @@
+import hashlib
+import json
+import subprocess
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+from policyengine_uk.data import UKSingleYearDataset
+
+from policyengine_uk_data.datasets import (
+    ENHANCED_CPS_FILE,
+    ENHANCED_CPS_MANIFEST_FILE,
+    ENHANCED_CPS_SOURCE_FILE,
+)
+from policyengine_uk_data.datasets.policybench_transfer import (
+    POLICYBENCH_TRANSFER_SOURCE_FILE,
+)
+from policyengine_uk_data.storage import STORAGE_FOLDER
+from policyengine_uk_data.storage.upload_public_transfer_dataset import (
+    upload_public_transfer_dataset,
+)
+from policyengine_uk_data.utils.hf_destinations import PUBLIC_REPO
+
+
+def _sha256(path: Path) -> str:
+    hasher = hashlib.sha256()  # fed in 1 MiB reads so memory stays bounded
+    with open(path, "rb") as stream:
+        while block := stream.read(1024 * 1024):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _git_blob_sha(path: Path) -> str:
+    """Return the blob SHA that git records for ``path`` at ``HEAD``."""
+    toplevel = subprocess.check_output(
+        ["git", "rev-parse", "--show-toplevel"],
+        text=True,
+    ).strip()
+    # ``HEAD:<path>`` must use forward slashes, so take the POSIX form.
+    relative_path = path.resolve().relative_to(Path(toplevel)).as_posix()
+    return subprocess.check_output(
+        ["git", "rev-parse", f"HEAD:{relative_path}"],
+        text=True,
+    ).strip()
+
+
+def _csv_rows(path: Path) -> int:
+    return pd.read_csv(path, usecols=["scenario_id"]).shape[0]
+
+
+def _dataset_counts(path: Path) -> dict[str, int]:
+    loaded = UKSingleYearDataset(file_path=str(path)).load()
+    # Each count is the length of the matching id column of the loaded dataset.
+    households = len(loaded["household_id"])
+    people = len(loaded["person_id"])
+    benunits = len(loaded["benunit_id"])
+    return {"h5_households": households, "h5_people": people, "h5_benunits": benunits}
+
+
+def _household_weights(path: Path) -> np.ndarray:
+    weights = UKSingleYearDataset(file_path=str(path)).load()["household_weight"]
+    return np.asarray(weights, dtype=float)  # plain float array for numpy math
+
+
+def test_enhanced_cps_manifest_matches_committed_artifacts():
+    """The manifest's row counts and hashes agree with the checked-in files.
+
+    Counts are recomputed from the source CSV and the H5 artifact; hashes are
+    recomputed from file bytes and from the blobs git stores at ``HEAD``.
+    """
+    manifest = json.loads(ENHANCED_CPS_MANIFEST_FILE.read_text())
+    row_counts = manifest["row_counts"]
+    artifact_entry = manifest["files"]["artifact"]
+    source_entry = manifest["files"]["source_manifest"]
+
+    assert row_counts["source_households"] == _csv_rows(ENHANCED_CPS_SOURCE_FILE)
+    # Merging the recomputed H5 counts must leave the recorded counts unchanged.
+    assert row_counts | _dataset_counts(ENHANCED_CPS_FILE) == row_counts
+
+    # Content hashes first, then the git blob ids.
+    assert artifact_entry["sha256"] == _sha256(ENHANCED_CPS_FILE)
+    assert source_entry["sha256"] == _sha256(ENHANCED_CPS_SOURCE_FILE)
+    assert artifact_entry["git_blob_sha"] == _git_blob_sha(ENHANCED_CPS_FILE)
+    assert source_entry["git_blob_sha"] == _git_blob_sha(ENHANCED_CPS_SOURCE_FILE)
+
+
+def test_enhanced_cps_manifest_matches_docs_and_weight_diagnostics():
+    """README row count and manifest weight diagnostics match the H5 weights."""
+    manifest = json.loads(ENHANCED_CPS_MANIFEST_FILE.read_text())
+    # Resolve from this file, not the cwd, so pytest can run from any directory.
+    repo_root = Path(__file__).resolve().parents[2]
+    readme = (repo_root / "README.md").read_text(encoding="utf-8")
+    source_rows = manifest["row_counts"]["source_households"]
+
+    assert f"{source_rows:,}" in readme
+
+    weights = _household_weights(ENHANCED_CPS_FILE)
+    total_weight = weights.sum()
+    top_10_weight = np.sort(weights)[::-1][:10].sum()
+    diagnostics = manifest["weight_diagnostics"]
+
+    assert np.isclose(diagnostics["total_household_weight"], total_weight)
+    assert np.isclose(
+        diagnostics["effective_sample_size"],
+        total_weight**2 / np.square(weights).sum(),
+    )
+    assert np.isclose(diagnostics["max_household_weight"], weights.max())
+    assert np.isclose(diagnostics["top_10_share"], top_10_weight / total_weight)
+    assert diagnostics["near_zero_weight_count"] == int(
+        (weights <= diagnostics["near_zero_weight_threshold"]).sum()
+    )
+
+
+def test_legacy_policybench_transfer_artifacts_are_explicitly_legacy():
+    """Legacy files keep 1,000 households; the alias targets the new source."""
+    legacy_csv = STORAGE_FOLDER / "policybench_transfer_source_2025.csv"
+    legacy_h5 = STORAGE_FOLDER / "policybench_transfer_2025.h5"
+    assert _csv_rows(legacy_csv) == 1_000
+    assert _dataset_counts(legacy_h5)["h5_households"] == 1_000
+    assert ENHANCED_CPS_SOURCE_FILE == POLICYBENCH_TRANSFER_SOURCE_FILE
+
+
+def test_public_transfer_upload_targets_public_hf_repo():
+    target = (
+        "policyengine_uk_data.storage.upload_public_transfer_dataset.upload_files_to_hf"
+    )
+    with patch(target) as mock_upload:  # the real upload is never invoked
+        upload_public_transfer_dataset()
+    mock_upload.assert_called_once()
+    call_kwargs = mock_upload.call_args.kwargs
+    assert call_kwargs["hf_repo_name"] == PUBLIC_REPO
+    assert call_kwargs["files"] == [
+        ENHANCED_CPS_FILE,
+        ENHANCED_CPS_SOURCE_FILE,
+        ENHANCED_CPS_MANIFEST_FILE,
+    ]
diff --git a/policyengine_uk_data/utils/enhanced_cps_manifest.py b/policyengine_uk_data/utils/enhanced_cps_manifest.py
new file mode 100644
index 000000000..207bc27f4
--- /dev/null
+++ b/policyengine_uk_data/utils/enhanced_cps_manifest.py
@@ -0,0 +1,248 @@
+"""Artifact manifest generation for the public UK enhanced CPS transfer data."""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import subprocess
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from policyengine_uk.data import UKSingleYearDataset
+
+from policyengine_uk_data.datasets.enhanced_cps import (
+    ENHANCED_CPS_FILE,
+    ENHANCED_CPS_MANIFEST_FILE,
+    ENHANCED_CPS_SOURCE_FILE,
+    USD_TO_GBP,
+    USD_TO_GBP_SOURCE_URL,
+    create_enhanced_cps,
+)
+from policyengine_uk_data.utils.loss import get_loss_results
+
+ENHANCED_CPS_MANIFEST_SCHEMA_VERSION = 1
+
+
+def _sha256(path: Path) -> str:
+    checksum = hashlib.sha256()  # updated per 1 MiB read, never the whole file
+    with open(path, "rb") as handle:
+        while piece := handle.read(1024 * 1024):
+            checksum.update(piece)
+    return checksum.hexdigest()
+
+
+def _repo_root() -> Path | None:
+    """Return the git work-tree root for the current directory, if any.
+
+    Reuses ``_git_value`` so a missing git binary or a non-repository cwd
+    yields ``None`` here exactly as it does for the other git lookups.
+    """
+    toplevel = _git_value("rev-parse", "--show-toplevel")
+    # ``Path(None)`` would raise, so pass the failure through untouched.
+    return None if toplevel is None else Path(toplevel)
+
+
+def _git_value(*args: str) -> str | None:
+    try:  # best effort: a missing git binary or a failing command gives None
+        return subprocess.check_output(["git", *args], text=True).strip()
+    except Exception:
+        return None
+
+
+def _relative_to_repo(path: Path) -> str | None:
+    """Return ``path`` relative to the git root with ``/`` separators, or None."""
+    root = _repo_root()
+    try:
+        # ``as_posix`` keeps the manifest and ``HEAD:<path>`` lookups portable.
+        return None if root is None else path.resolve().relative_to(root).as_posix()
+    except ValueError:
+        return None
+
+
+def _git_blob_sha(path: Path) -> str | None:
+    """Return the blob SHA at ``HEAD`` for ``path``, or None if unavailable."""
+    repo_path = _relative_to_repo(path)
+    revision = f"HEAD:{repo_path}"  # only used when ``repo_path`` resolved
+    return None if repo_path is None else _git_value("rev-parse", revision)
+
+
+def _last_commit(path: Path) -> str | None:
+    """Return ``git log -1 --format=%H`` for ``path``; None if unresolvable."""
+    repo_path = _relative_to_repo(path)
+    log_args = ("log", "-1", "--format=%H", "--", repo_path)
+    return None if repo_path is None else _git_value(*log_args)
+
+
+def _loss_summary(dataset: UKSingleYearDataset, fiscal_year: int) -> dict[str, Any]:
+    """Summarise target errors; the statistics skip zero or non-finite rows."""
+    loss = get_loss_results(dataset, str(fiscal_year))
+    errors = loss.abs_rel_error.to_numpy()
+    finite = np.isfinite(errors)
+    zero_target = loss.target.to_numpy() == 0
+    usable = finite & ~zero_target
+    kept = loss.loc[usable, "abs_rel_error"]
+    return {
+        "target_count": int(len(loss)),
+        "nonzero_finite_target_count": int(usable.sum()),
+        "zero_target_count": int(zero_target.sum()),
+        "nonfinite_relative_error_count": int((~finite).sum()),
+        "mean_abs_relative_error": round(float(kept.mean()), 6),
+        "median_abs_relative_error": round(float(kept.median()), 6),
+        "p90_abs_relative_error": round(float(kept.quantile(0.9)), 6),
+        "share_within_10pct": round(float((kept <= 0.10).mean()), 6),
+        "share_within_25pct": round(float((kept <= 0.25).mean()), 6),
+        "excludes_zero_target_relative_errors": True,
+    }
+
+
+def _weight_diagnostics(weights: np.ndarray) -> dict[str, Any]:
+    """Return size and concentration diagnostics for household weights."""
+    descending = np.sort(weights)[::-1]
+    total = weights.sum()
+    threshold = 1e-6
+    effective_size = total**2 / np.square(weights).sum()  # (sum w)^2 / sum(w^2)
+    return {
+        "total_household_weight": round(float(total), 6),
+        "effective_sample_size": round(float(effective_size), 6),
+        # NOTE(review): only the minimum is unrounded; presumably so tiny weights survive.
+        "min_household_weight": float(weights.min()),
+        "max_household_weight": round(float(weights.max()), 6),
+        "top_1_share": round(float(descending[:1].sum() / total), 9),
+        "top_10_share": round(float(descending[:10].sum() / total), 9),
+        "top_100_share": round(float(descending[:100].sum() / total), 9),
+        "near_zero_weight_threshold": threshold,
+        "near_zero_weight_count": int((weights <= threshold).sum()),
+    }
+
+
+def build_enhanced_cps_manifest(
+    *,
+    source_file_path: str | Path = ENHANCED_CPS_SOURCE_FILE,
+    artifact_file_path: str | Path = ENHANCED_CPS_FILE,
+    fiscal_year: int = 2025,
+    include_loss: bool = True,
+    include_raw_loss: bool = True,
+) -> dict[str, Any]:
+    """Build the JSON-serializable manifest (hashes, counts, weights, loss)."""
+    source_file_path = Path(source_file_path)
+    artifact_file_path = Path(artifact_file_path)
+
+    source = pd.read_csv(source_file_path, usecols=["scenario_id"])  # rows are counted
+    dataset = UKSingleYearDataset(file_path=str(artifact_file_path))
+    values = dataset.load()
+    household_weights = np.asarray(values["household_weight"], dtype=float)
+
+    manifest: dict[str, Any] = {
+        "schema_version": ENHANCED_CPS_MANIFEST_SCHEMA_VERSION,
+        "artifact": "enhanced_cps_2025",  # NOTE(review): fixed, not tied to fiscal_year
+        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+        "description": (
+            "Public UK calibrated transfer dataset derived from a public export "
+            "of benchmark-compatible PolicyEngine US Enhanced CPS households."
+        ),
+        "intended_uses": [
+            "public demos",
+            "reproducible examples",
+            "public benchmark analysis",
+        ],
+        "not_intended_uses": [
+            "substitution for FRS or enhanced FRS",
+            "precise UK distributional analysis",
+            "administrative truth validation",
+        ],
+        "files": {
+            "artifact": {
+                "path": _relative_to_repo(artifact_file_path)
+                or str(artifact_file_path),  # fallback when git cannot place the file
+                "sha256": _sha256(artifact_file_path),
+                "git_blob_sha": _git_blob_sha(artifact_file_path),
+                "last_modified_commit": _last_commit(artifact_file_path),
+                "size_bytes": artifact_file_path.stat().st_size,
+            },
+            "source_manifest": {
+                "path": _relative_to_repo(source_file_path) or str(source_file_path),
+                "sha256": _sha256(source_file_path),
+                "git_blob_sha": _git_blob_sha(source_file_path),
+                "last_modified_commit": _last_commit(source_file_path),
+                "size_bytes": source_file_path.stat().st_size,
+            },
+        },
+        "build": {
+            "builder": "policyengine_uk_data.datasets.save_enhanced_cps",
+            "build_command": (
+                "uv run --python 3.13 python -m "
+                "policyengine_uk_data.storage.write_enhanced_cps_manifest"
+            ),
+            "fiscal_year": fiscal_year,
+            "source_dataset": "PolicyEngine US Enhanced CPS public export",
+            "source_scope": "benchmark-compatible households",
+            "calibrated": True,
+            "calibration_target_year": fiscal_year,
+            "exchange_rate": {
+                "usd_to_gbp": USD_TO_GBP,
+                "source_url": USD_TO_GBP_SOURCE_URL,
+                "live_api_called": False,
+            },
+        },
+        "row_counts": {
+            "source_households": int(len(source)),
+            "h5_households": int(len(values["household_id"])),
+            "h5_people": int(len(values["person_id"])),
+            "h5_benunits": int(len(values["benunit_id"])),
+        },
+        "weight_diagnostics": _weight_diagnostics(household_weights),
+    }
+
+    if include_loss:
+        manifest["loss_diagnostics"] = {
+            "target_year": fiscal_year,
+            "calibrated": _loss_summary(dataset, fiscal_year),
+        }
+        if include_raw_loss:  # nested: has no effect when include_loss is False
+            raw_dataset = create_enhanced_cps(
+                source_file_path=source_file_path,
+                fiscal_year=fiscal_year,
+                calibrate=False,  # presumably the uncalibrated weights; confirm
+            )
+            manifest["loss_diagnostics"]["raw_transfer_weights"] = _loss_summary(
+                raw_dataset,
+                fiscal_year,
+            )
+
+    return manifest
+
+
+def write_enhanced_cps_manifest(
+    output_file_path: str | Path = ENHANCED_CPS_MANIFEST_FILE,
+    **kwargs,
+) -> dict[str, Any]:
+    """Build the manifest, write it as sorted, indented JSON, and return it.
+
+    Extra keyword arguments are forwarded to ``build_enhanced_cps_manifest``.
+    """
+    manifest = build_enhanced_cps_manifest(**kwargs)
+    serialized = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
+    Path(output_file_path).write_text(serialized, encoding="utf-8")
+    return manifest
+
+
+def main() -> None:
+    """Parse the command-line flags and write the manifest file."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--output", default=str(ENHANCED_CPS_MANIFEST_FILE))
+    for flag in ("--skip-loss", "--skip-raw-loss"):
+        cli.add_argument(flag, action="store_true")
+    options = cli.parse_args()
+    write_enhanced_cps_manifest(
+        output_file_path=options.output,
+        include_loss=not options.skip_loss,
+        include_raw_loss=not options.skip_raw_loss,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/uv.lock b/uv.lock
index adc63bb55..d2b35a8c3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1366,7 +1366,7 @@ wheels = [
 
 [[package]]
 name = "policyengine-uk-data"
-version = "1.53.1"
+version = "1.54.0"
 source = { editable = "." }
 dependencies = [
     { name = "google-auth" },

From de08a89192780abbeaa269d22e09ff250c0f162f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 26 Apr 2026 22:29:37 -0400
Subject: [PATCH 2/5] Tighten transfer leaf input guard

---
 .../tests/test_policybench_transfer.py        | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py
index 2d5c10f6d..970c16395 100644
--- a/policyengine_uk_data/tests/test_policybench_transfer.py
+++ b/policyengine_uk_data/tests/test_policybench_transfer.py
@@ -21,6 +21,21 @@
 }
 
 
+def _is_valid_leaf_input(column: str, entity: str, system: CountryTaxBenefitSystem):
+    """Return True when ``column`` may be written as a leaf input for ``entity``.
+
+    The variable must exist on ``entity``; beyond that it is accepted when it is
+    allow-listed, or when it is an input variable with no ``defined_for``.
+    """
+    variable = system.variables.get(column)
+    if variable is None or variable.entity.key != entity:
+        return False
+    if column in ALLOWED_REPORTED_DATA_INPUTS:
+        return True
+    has_defined_for = getattr(variable, "defined_for", None) is not None
+    return bool(variable.is_input_variable()) and not has_defined_for
+
+
 def _subset_source(tmp_path: Path, rows: int) -> Path:
     source = pd.read_csv(ENHANCED_CPS_SOURCE_FILE).head(rows).copy()
     subset_path = tmp_path / f"enhanced_cps_source_{rows}.csv"
@@ -58,12 +73,7 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path):
         invalid_columns = [
             column
             for column in frame.columns
-            if column not in system.variables
-            or system.variables[column].entity.key != entity
-            or (
-                not system.variables[column].is_input_variable()
-                and column not in ALLOWED_REPORTED_DATA_INPUTS
-            )
+            if not _is_valid_leaf_input(column, entity, system)
         ]
         assert invalid_columns == []
 

From 7c0692830d5c9f6ee5243d5f846623cd11c445f5 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Thu, 30 Apr 2026 10:23:57 -0400
Subject: [PATCH 3/5] Move PIP amount mapping into UK data

---
 policyengine_uk_data/datasets/enhanced_cps.py | 34 +------
 policyengine_uk_data/datasets/frs.py          | 93 ++++++++++++-------
 .../datasets/imputations/frs_only.py          | 48 ++++++++--
 .../storage/uprating_factors.csv              |  2 -
 .../storage/uprating_growth_factors.csv       |  2 -
 .../tests/test_frs_only_imputation.py         | 15 ++-
 .../tests/test_policybench_transfer.py        |  6 ++
 7 files changed, 128 insertions(+), 72 deletions(-)

diff --git a/policyengine_uk_data/datasets/enhanced_cps.py b/policyengine_uk_data/datasets/enhanced_cps.py
index 8ba026511..61db8e62d 100644
--- a/policyengine_uk_data/datasets/enhanced_cps.py
+++ b/policyengine_uk_data/datasets/enhanced_cps.py
@@ -36,26 +36,13 @@
     "yearly-average-currency-exchange-rates"
 )
 
-# 2025/26 reported-benefit mapping assumptions used only to populate UK input
-# leaves from U.S. source records. PolicyEngine UK applies its own parameters
-# when calculating derived tax and benefit outputs.
+# 2025/26 transfer assumptions used only to populate UK input leaves from U.S.
+# source records. PolicyEngine UK applies its own parameters when calculating
+# derived tax and benefit outputs.
 NEW_STATE_PENSION_2025 = 224.96 * 52
 DIVIDEND_YIELD_FOR_WEALTH_IMPUTATION = 0.03
 RENTAL_YIELD_FOR_WEALTH_IMPUTATION = 0.04
 
-PIP_2025_WEEKLY_RATES = {
-    "daily_living": {
-        "NONE": 0.0,
-        "STANDARD": 73.89,
-        "ENHANCED": 110.40,
-    },
-    "mobility": {
-        "NONE": 0.0,
-        "STANDARD": 29.19,
-        "ENHANCED": 77.04,
-    },
-}
-
 REGION_SHARES = (
     ("NORTH_EAST", 0.04),
     ("NORTH_WEST", 0.11),
@@ -249,11 +236,6 @@ def _pip_category(person: dict) -> str:
     return "ENHANCED" if severe_signal or low_earnings else "STANDARD"
 
 
-def _pip_reported_amount(category: str, component: str) -> float:
-    weekly = PIP_2025_WEEKLY_RATES[component][category]
-    return round(weekly * 52, 2)
-
-
 def _household_cash_income(people: list[dict], exchange_rate: float) -> float:
     total = 0.0
     for person in people:
@@ -689,14 +671,8 @@ def _build_base_dataset(
                     if bool(inputs.get("is_blind", False))
                     else 0.0,
                     "is_disabled_for_benefits": bool(inputs.get("is_disabled", False)),
-                    "pip_dl_reported": _pip_reported_amount(
-                        pip_category,
-                        "daily_living",
-                    ),
-                    "pip_m_reported": _pip_reported_amount(
-                        pip_category,
-                        "mobility",
-                    ),
+                    "pip_dl_category": pip_category,
+                    "pip_m_category": pip_category,
                     "hours_worked": float(
                         inputs.get(
                             "weekly_hours_worked",
diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 170f11b51..37deb0a14 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -56,6 +56,25 @@
     "disabled_students_allowance_course_eligible",
     "disabled_students_allowance_has_qualifying_condition",
 )
+PIP_CATEGORY_SAFETY_MARGIN = 0.1
+
+
+def _pip_category_from_reported(
+    reported,
+    standard_rate: float,
+    enhanced_rate: float,
+) -> np.ndarray:
+    """Convert annual reported PIP amounts to PE-UK PIP category inputs."""
+
+    reported_weekly = pd.Series(reported).fillna(0).astype(float) / WEEKS_IN_YEAR
+    return np.select(
+        [
+            reported_weekly >= enhanced_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN),
+            reported_weekly >= standard_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN),
+        ],
+        ["ENHANCED", "STANDARD"],
+        default="NONE",
+    )
 
 
 @lru_cache(maxsize=None)
@@ -1099,6 +1118,49 @@ def determine_education_level(fted_val, typeed2_val, age_val):
         household.index,
     )
 
+    benefit = CountryTaxBenefitSystem().parameters(year).gov.dwp
+
+    pe_person["pip_dl_category"] = _pip_category_from_reported(
+        pe_person["pip_dl_reported"],
+        benefit.pip.daily_living.standard,
+        benefit.pip.daily_living.enhanced,
+    )
+    pe_person["pip_m_category"] = _pip_category_from_reported(
+        pe_person["pip_m_reported"],
+        benefit.pip.mobility.standard,
+        benefit.pip.mobility.enhanced,
+    )
+
+    has_pip = (pe_person["pip_dl_category"] != "NONE") | (
+        pe_person["pip_m_category"] != "NONE"
+    )
+    pe_person["is_disabled_for_benefits"] = (
+        pe_person.dla_sc_reported + pe_person.dla_m_reported > 0
+    ) | has_pip
+
+    THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR
+
+    pe_person["is_enhanced_disabled_for_benefits"] = (
+        pe_person.dla_sc_reported
+        > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
+    )
+
+    # Child Tax Credit Regulations 2002 s. 8
+    paragraph_3 = (
+        pe_person.dla_sc_reported
+        >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
+    )
+    paragraph_4 = pe_person["pip_dl_category"] == "ENHANCED"
+    paragraph_5 = pe_person.afcs_reported > 0
+    pe_person["is_severely_disabled_for_benefits"] = (
+        paragraph_3 | paragraph_4 | paragraph_5
+    )
+
+    pe_person = pe_person.drop(
+        columns=["pip_dl_reported", "pip_m_reported"],
+        errors="ignore",
+    )
+
     dataset = UKSingleYearDataset(
         person=pe_person,
         benunit=pe_benunit,
@@ -1144,37 +1206,6 @@ def determine_education_level(fted_val, typeed2_val, age_val):
 
     pe_household["brma"] = brmas
 
-    parameters = sim.tax_benefit_system.parameters
-    benefit = parameters(year).gov.dwp
-
-    pe_person["is_disabled_for_benefits"] = (
-        pe_person.dla_sc_reported
-        + pe_person.dla_m_reported
-        + pe_person.pip_m_reported
-        + pe_person.pip_dl_reported
-    ) > 0
-
-    THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR
-
-    pe_person["is_enhanced_disabled_for_benefits"] = (
-        pe_person.dla_sc_reported
-        > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
-    )
-
-    # Child Tax Credit Regulations 2002 s. 8
-    paragraph_3 = (
-        pe_person.dla_sc_reported
-        >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
-    )
-    paragraph_4 = (
-        pe_person.pip_dl_reported
-        >= benefit.pip.daily_living.enhanced * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
-    )
-    paragraph_5 = pe_person.afcs_reported > 0
-    pe_person["is_severely_disabled_for_benefits"] = (
-        paragraph_3 | paragraph_4 | paragraph_5
-    )
-
     # Dataset-side claimant-state approximations for future legacy ESA/JSA
     # modelling. These are explicit proxies based on observed survey
     # conditions, not legislative determinations.
diff --git a/policyengine_uk_data/datasets/imputations/frs_only.py b/policyengine_uk_data/datasets/imputations/frs_only.py
index 1730bc257..0ec4541e2 100644
--- a/policyengine_uk_data/datasets/imputations/frs_only.py
+++ b/policyengine_uk_data/datasets/imputations/frs_only.py
@@ -81,8 +81,6 @@
     "state_pension_reported",
     "dla_sc_reported",
     "dla_m_reported",
-    "pip_m_reported",
-    "pip_dl_reported",
     "sda_reported",
     "carers_allowance_reported",
     "iidb_reported",
@@ -98,6 +96,22 @@
     "esa_income_reported",
 ]
 
+FRS_ONLY_PERSON_CATEGORY_VARIABLES = [
+    "pip_m_category",
+    "pip_dl_category",
+]
+
+PIP_CATEGORY_TO_CODE = {
+    "NONE": 0.0,
+    "STANDARD": 1.0,
+    "ENHANCED": 2.0,
+}
+PIP_CODE_TO_CATEGORY = {
+    0: "NONE",
+    1: "STANDARD",
+    2: "ENHANCED",
+}
+
 
 def _one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
     """Return ``df`` with object-typed ``columns`` one-hot encoded.
@@ -157,6 +171,11 @@ def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame:
     return frame
 
 
+def _category_codes(series: pd.Series) -> pd.Series:
+    categories = series.fillna("NONE").astype(str).str.upper()
+    return categories.map(PIP_CATEGORY_TO_CODE).fillna(0.0)
+
+
 def impute_frs_only_variables(
     train_dataset: UKSingleYearDataset,
     target_dataset: UKSingleYearDataset,
@@ -183,12 +202,20 @@ def impute_frs_only_variables(
     target_person = target_dataset.person
 
     # Use only variables present in both frames.
-    outputs = [
+    numeric_outputs = [
         v
         for v in FRS_ONLY_PERSON_VARIABLES
         if v in train_person.columns and v in target_person.columns
     ]
-    missing = set(FRS_ONLY_PERSON_VARIABLES) - set(outputs)
+    category_outputs = [
+        v
+        for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES
+        if v in train_person.columns and v in target_person.columns
+    ]
+    outputs = numeric_outputs + category_outputs
+    missing = (
+        set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES)
+    ) - set(outputs)
     if missing:
         logger.warning(
             "Stage-2 FRS-only imputation: %d variables absent from "
@@ -213,7 +240,9 @@ def impute_frs_only_variables(
     # Replace NaNs in outputs with 0 so the QRF trains on clean targets;
     # FRS-only variables are almost all zero-heavy "amount if eligible"
     # columns that default to zero when unreported.
-    train_outputs = train_person[outputs].fillna(0).astype(float)
+    train_outputs = train_person[numeric_outputs].fillna(0).astype(float)
+    for column in category_outputs:
+        train_outputs[column] = _category_codes(train_person[column])
 
     logger.info(
         "Stage-2 FRS-only imputation: %d outputs, training on %d FRS "
@@ -231,10 +260,17 @@ def impute_frs_only_variables(
     # clamp to zero (the population-typical value for these variables).
     predictions = predictions.fillna(0.0)
 
-    for column in outputs:
+    for column in numeric_outputs:
         # Clamp negative predictions — these columns represent receipted
         # amounts or contributions and are non-negative by construction.
         values = np.maximum(predictions[column].values, 0.0)
         target_dataset.person[column] = values
 
+    for column in category_outputs:
+        values = np.rint(predictions[column].fillna(0.0).values).astype(int)
+        values = np.clip(values, 0, 2)
+        target_dataset.person[column] = [
+            PIP_CODE_TO_CATEGORY[value] for value in values
+        ]
+
     return target_dataset
diff --git a/policyengine_uk_data/storage/uprating_factors.csv b/policyengine_uk_data/storage/uprating_factors.csv
index 5a7e54df4..73f18973b 100644
--- a/policyengine_uk_data/storage/uprating_factors.csv
+++ b/policyengine_uk_data/storage/uprating_factors.csv
@@ -60,8 +60,6 @@ pension_credit_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,
 pension_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
 personal_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431
 petrol_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
-pip_dl_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
-pip_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 private_pension_income,1.0,1.003,1.053,1.106,1.161,1.216,1.261,1.288,1.315,1.346,1.346,1.346,1.346,1.346,1.346
 private_transfer_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
 property_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
diff --git a/policyengine_uk_data/storage/uprating_growth_factors.csv b/policyengine_uk_data/storage/uprating_growth_factors.csv
index eb8b7fb6d..8a4a924fa 100644
--- a/policyengine_uk_data/storage/uprating_growth_factors.csv
+++ b/policyengine_uk_data/storage/uprating_growth_factors.csv
@@ -60,8 +60,6 @@ pension_credit_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.
 pension_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
 personal_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0
 petrol_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
-pip_dl_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
-pip_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 private_pension_income,0,0.003,0.05,0.05,0.05,0.047,0.037,0.021,0.021,0.024,0.0,0.0,0.0,0.0,0.0
 private_transfer_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
 property_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
diff --git a/policyengine_uk_data/tests/test_frs_only_imputation.py b/policyengine_uk_data/tests/test_frs_only_imputation.py
index 9393c7436..265105ab7 100644
--- a/policyengine_uk_data/tests/test_frs_only_imputation.py
+++ b/policyengine_uk_data/tests/test_frs_only_imputation.py
@@ -72,8 +72,16 @@ def _fake_dataset(person_rows: int, seed: int = 0):
             "state_pension_reported": 0.0,
             "dla_sc_reported": 0.0,
             "dla_m_reported": 0.0,
-            "pip_m_reported": 0.0,
-            "pip_dl_reported": 0.0,
+            "pip_m_category": np.where(
+                rng.random(person_rows) < 0.05,
+                "STANDARD",
+                "NONE",
+            ),
+            "pip_dl_category": np.where(
+                rng.random(person_rows) < 0.05,
+                "ENHANCED",
+                "NONE",
+            ),
             "sda_reported": 0.0,
             "carers_allowance_reported": 0.0,
             "iidb_reported": 0.0,
@@ -126,6 +134,9 @@ def test_frs_only_outputs_are_non_negative():
         assert np.all(values >= 0), f"{column} has negative predictions"
         assert np.isfinite(values).all(), f"{column} has NaN / inf predictions"
 
+    for column in ("pip_m_category", "pip_dl_category"):
+        assert result.person[column].isin(["NONE", "STANDARD", "ENHANCED"]).all()
+
 
 def test_frs_only_does_not_touch_non_output_columns():
     """Stage-2 must only rewrite the curated output list, nothing else."""
diff --git a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py
index 970c16395..ac75231c3 100644
--- a/policyengine_uk_data/tests/test_policybench_transfer.py
+++ b/policyengine_uk_data/tests/test_policybench_transfer.py
@@ -79,6 +79,12 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path):
 
     assert "household_wealth" not in dataset.household.columns
     assert "total_wealth" not in dataset.household.columns
+    assert "pip_dl_reported" not in dataset.person.columns
+    assert "pip_m_reported" not in dataset.person.columns
+    assert (
+        dataset.person["pip_dl_category"].isin(["NONE", "STANDARD", "ENHANCED"]).all()
+    )
+    assert dataset.person["pip_m_category"].isin(["NONE", "STANDARD", "ENHANCED"]).all()
     for column in (
         "savings",
         "main_residence_value",

From 8be858ff588449705728f1fc48e91675ed6c7722 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Thu, 30 Apr 2026 10:33:14 -0400
Subject: [PATCH 4/5] Move disability benefit category mapping into UK data

---
 policyengine_uk_data/datasets/frs.py          | 76 +++++++++++++------
 .../datasets/imputations/frs_only.py          | 46 +++++------
 .../storage/uprating_factors.csv              |  3 -
 .../storage/uprating_growth_factors.csv       |  3 -
 .../tests/test_frs_only_imputation.py         | 20 +++--
 .../tests/test_policybench_transfer.py        |  3 +
 6 files changed, 89 insertions(+), 62 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 18e9535b2..9aa11f8e2 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -59,20 +59,19 @@
 PIP_CATEGORY_SAFETY_MARGIN = 0.1
 
 
-def _pip_category_from_reported(
+def _category_from_reported(
     reported,
-    standard_rate: float,
-    enhanced_rate: float,
+    thresholds: tuple[tuple[str, float], ...],
 ) -> np.ndarray:
-    """Convert annual reported PIP amounts to PE-UK PIP category inputs."""
+    """Label annual amounts by the first (category, weekly rate) threshold met."""
 
     reported_weekly = pd.Series(reported).fillna(0).astype(float) / WEEKS_IN_YEAR
     return np.select(
         [
-            reported_weekly >= enhanced_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN),
-            reported_weekly >= standard_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN),
+            reported_weekly >= rate * (1 - PIP_CATEGORY_SAFETY_MARGIN)
+            for _, rate in thresholds
         ],
-        ["ENHANCED", "STANDARD"],
+        [category for category, _ in thresholds],
         default="NONE",
     )
 
@@ -1244,36 +1243,57 @@ def determine_education_level(fted_val, typeed2_val, age_val):
 
     benefit = CountryTaxBenefitSystem().parameters(year).gov.dwp
 
-    pe_person["pip_dl_category"] = _pip_category_from_reported(
+    pe_person["aa_category"] = _category_from_reported(
+        pe_person["attendance_allowance_reported"],
+        (
+            ("HIGHER", benefit.attendance_allowance.higher),
+            ("LOWER", benefit.attendance_allowance.lower),
+        ),
+    )
+    pe_person["dla_sc_category"] = _category_from_reported(
+        pe_person["dla_sc_reported"],
+        (
+            ("HIGHER", benefit.dla.self_care.higher),
+            ("MIDDLE", benefit.dla.self_care.middle),
+            ("LOWER", benefit.dla.self_care.lower),
+        ),
+    )
+    pe_person["dla_m_category"] = _category_from_reported(
+        pe_person["dla_m_reported"],
+        (
+            ("HIGHER", benefit.dla.mobility.higher),
+            ("LOWER", benefit.dla.mobility.lower),
+        ),
+    )
+    pe_person["pip_dl_category"] = _category_from_reported(
         pe_person["pip_dl_reported"],
-        benefit.pip.daily_living.standard,
-        benefit.pip.daily_living.enhanced,
+        (
+            ("ENHANCED", benefit.pip.daily_living.enhanced),
+            ("STANDARD", benefit.pip.daily_living.standard),
+        ),
     )
-    pe_person["pip_m_category"] = _pip_category_from_reported(
+    pe_person["pip_m_category"] = _category_from_reported(
         pe_person["pip_m_reported"],
-        benefit.pip.mobility.standard,
-        benefit.pip.mobility.enhanced,
+        (
+            ("ENHANCED", benefit.pip.mobility.enhanced),
+            ("STANDARD", benefit.pip.mobility.standard),
+        ),
     )
 
     has_pip = (pe_person["pip_dl_category"] != "NONE") | (
         pe_person["pip_m_category"] != "NONE"
     )
-    pe_person["is_disabled_for_benefits"] = (
-        pe_person.dla_sc_reported + pe_person.dla_m_reported > 0
-    ) | has_pip
-
-    THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR
+    has_dla = (pe_person["dla_sc_category"] != "NONE") | (
+        pe_person["dla_m_category"] != "NONE"
+    )
+    pe_person["is_disabled_for_benefits"] = has_dla | has_pip
 
     pe_person["is_enhanced_disabled_for_benefits"] = (
-        pe_person.dla_sc_reported
-        > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
+        pe_person["dla_sc_category"] == "HIGHER"
     )
 
     # Child Tax Credit Regulations 2002 s. 8
-    paragraph_3 = (
-        pe_person.dla_sc_reported
-        >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP
-    )
+    paragraph_3 = pe_person["dla_sc_category"] == "HIGHER"
     paragraph_4 = pe_person["pip_dl_category"] == "ENHANCED"
     paragraph_5 = pe_person.afcs_reported > 0
     pe_person["is_severely_disabled_for_benefits"] = (
@@ -1281,7 +1301,13 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     )
 
     pe_person = pe_person.drop(
-        columns=["pip_dl_reported", "pip_m_reported"],
+        columns=[
+            "attendance_allowance_reported",
+            "dla_sc_reported",
+            "dla_m_reported",
+            "pip_dl_reported",
+            "pip_m_reported",
+        ],
         errors="ignore",
     )
 
diff --git a/policyengine_uk_data/datasets/imputations/frs_only.py b/policyengine_uk_data/datasets/imputations/frs_only.py
index 0ec4541e2..55c46d095 100644
--- a/policyengine_uk_data/datasets/imputations/frs_only.py
+++ b/policyengine_uk_data/datasets/imputations/frs_only.py
@@ -77,10 +77,7 @@
     "income_support_reported",
     "working_tax_credit_reported",
     "child_tax_credit_reported",
-    "attendance_allowance_reported",
     "state_pension_reported",
-    "dla_sc_reported",
-    "dla_m_reported",
     "sda_reported",
     "carers_allowance_reported",
     "iidb_reported",
@@ -96,20 +93,12 @@
     "esa_income_reported",
 ]
 
-FRS_ONLY_PERSON_CATEGORY_VARIABLES = [
-    "pip_m_category",
-    "pip_dl_category",
-]
-
-PIP_CATEGORY_TO_CODE = {
-    "NONE": 0.0,
-    "STANDARD": 1.0,
-    "ENHANCED": 2.0,
-}
-PIP_CODE_TO_CATEGORY = {
-    0: "NONE",
-    1: "STANDARD",
-    2: "ENHANCED",
+FRS_ONLY_PERSON_CATEGORY_VARIABLES = {  # list position = numeric code
+    "aa_category": ["NONE", "LOWER", "HIGHER"],
+    "dla_m_category": ["NONE", "LOWER", "HIGHER"],
+    "dla_sc_category": ["NONE", "LOWER", "MIDDLE", "HIGHER"],
+    "pip_m_category": ["NONE", "STANDARD", "ENHANCED"],
+    "pip_dl_category": ["NONE", "STANDARD", "ENHANCED"],
 }
 
 
@@ -171,9 +160,12 @@ def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame:
     return frame
 
 
-def _category_codes(series: pd.Series) -> pd.Series:
+def _category_codes(series: pd.Series, allowed_categories: list[str]) -> pd.Series:
+    """Encode category labels as float positions in ``allowed_categories``."""
+    codes = {label: float(rank) for rank, label in enumerate(allowed_categories)}
+    # Missing labels become "NONE"; labels outside the list fall back to 0.0.
     categories = series.fillna("NONE").astype(str).str.upper()
-    return categories.map(PIP_CATEGORY_TO_CODE).fillna(0.0)
+    return categories.map(codes).fillna(0.0)
 
 
 def impute_frs_only_variables(
@@ -209,12 +201,12 @@ def impute_frs_only_variables(
     ]
     category_outputs = [
         v
-        for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES
+        for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES.keys()
         if v in train_person.columns and v in target_person.columns
     ]
     outputs = numeric_outputs + category_outputs
     missing = (
-        set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES)
+        set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES.keys())
     ) - set(outputs)
     if missing:
         logger.warning(
@@ -242,7 +234,10 @@ def impute_frs_only_variables(
     # columns that default to zero when unreported.
     train_outputs = train_person[numeric_outputs].fillna(0).astype(float)
     for column in category_outputs:
-        train_outputs[column] = _category_codes(train_person[column])
+        train_outputs[column] = _category_codes(
+            train_person[column],
+            FRS_ONLY_PERSON_CATEGORY_VARIABLES[column],
+        )
 
     logger.info(
         "Stage-2 FRS-only imputation: %d outputs, training on %d FRS "
@@ -267,10 +262,9 @@ def impute_frs_only_variables(
         target_dataset.person[column] = values
 
     for column in category_outputs:
+        allowed_categories = FRS_ONLY_PERSON_CATEGORY_VARIABLES[column]
         values = np.rint(predictions[column].fillna(0.0).values).astype(int)
-        values = np.clip(values, 0, 2)
-        target_dataset.person[column] = [
-            PIP_CODE_TO_CATEGORY[value] for value in values
-        ]
+        values = np.clip(values, 0, len(allowed_categories) - 1)
+        target_dataset.person[column] = [allowed_categories[value] for value in values]
 
     return target_dataset
diff --git a/policyengine_uk_data/storage/uprating_factors.csv b/policyengine_uk_data/storage/uprating_factors.csv
index 73f18973b..c5f6f638e 100644
--- a/policyengine_uk_data/storage/uprating_factors.csv
+++ b/policyengine_uk_data/storage/uprating_factors.csv
@@ -1,7 +1,6 @@
 Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034
 afcs_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 alcohol_and_tobacco_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
-attendance_allowance_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 benunit_rent,1.0,1.0,1.0,1.11,1.184,1.223,1.275,1.312,1.351,1.392,1.392,1.392,1.392,1.392,1.392
 bsp_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 capital_gains,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
@@ -15,8 +14,6 @@ communication_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.3
 corporate_wealth,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
 diesel_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 dividend_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384
-dla_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
-dla_sc_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 domestic_energy_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 education_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38
 employee_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431
diff --git a/policyengine_uk_data/storage/uprating_growth_factors.csv b/policyengine_uk_data/storage/uprating_growth_factors.csv
index 8a4a924fa..7d330d17a 100644
--- a/policyengine_uk_data/storage/uprating_growth_factors.csv
+++ b/policyengine_uk_data/storage/uprating_growth_factors.csv
@@ -1,7 +1,6 @@
 Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034
 afcs_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 alcohol_and_tobacco_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
-attendance_allowance_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 benunit_rent,0,0.0,0.0,0.11,0.067,0.033,0.043,0.029,0.03,0.03,0.0,0.0,0.0,0.0,0.0
 bsp_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 capital_gains,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
@@ -15,8 +14,6 @@ communication_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,
 corporate_wealth,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
 diesel_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 dividend_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0
-dla_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
-dla_sc_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 domestic_energy_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 education_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0
 employee_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0
diff --git a/policyengine_uk_data/tests/test_frs_only_imputation.py b/policyengine_uk_data/tests/test_frs_only_imputation.py
index 265105ab7..cb2554950 100644
--- a/policyengine_uk_data/tests/test_frs_only_imputation.py
+++ b/policyengine_uk_data/tests/test_frs_only_imputation.py
@@ -68,10 +68,10 @@ def _fake_dataset(person_rows: int, seed: int = 0):
             "income_support_reported": 0.0,
             "working_tax_credit_reported": 0.0,
             "child_tax_credit_reported": 0.0,
-            "attendance_allowance_reported": 0.0,
+            "aa_category": "NONE",
             "state_pension_reported": 0.0,
-            "dla_sc_reported": 0.0,
-            "dla_m_reported": 0.0,
+            "dla_sc_category": "NONE",
+            "dla_m_category": "NONE",
             "pip_m_category": np.where(
                 rng.random(person_rows) < 0.05,
                 "STANDARD",
@@ -134,8 +134,18 @@ def test_frs_only_outputs_are_non_negative():
         assert np.all(values >= 0), f"{column} has negative predictions"
         assert np.isfinite(values).all(), f"{column} has NaN / inf predictions"
 
-    for column in ("pip_m_category", "pip_dl_category"):
-        assert result.person[column].isin(["NONE", "STANDARD", "ENHANCED"]).all()
+    expected_categories = {
+        "aa_category": {"NONE", "LOWER", "HIGHER"},
+        "dla_sc_category": {"NONE", "LOWER", "MIDDLE", "HIGHER"},
+        "dla_m_category": {"NONE", "LOWER", "HIGHER"},
+        "pip_m_category": {"NONE", "STANDARD", "ENHANCED"},
+        "pip_dl_category": {"NONE", "STANDARD", "ENHANCED"},
+    }
+    # Check each column against its own vocabulary: a shared union would let
+    # a cross-column leak such as "MIDDLE" in pip_m_category pass unnoticed.
+    for column, allowed in expected_categories.items():
+        unexpected = set(result.person[column]) - allowed
+        assert not unexpected, f"{column} has unexpected categories {unexpected}"
 
 
 def test_frs_only_does_not_touch_non_output_columns():
diff --git a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py
index ac75231c3..7dec319e8 100644
--- a/policyengine_uk_data/tests/test_policybench_transfer.py
+++ b/policyengine_uk_data/tests/test_policybench_transfer.py
@@ -79,6 +79,9 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path):
 
     assert "household_wealth" not in dataset.household.columns
     assert "total_wealth" not in dataset.household.columns
+    assert "attendance_allowance_reported" not in dataset.person.columns
+    assert "dla_sc_reported" not in dataset.person.columns
+    assert "dla_m_reported" not in dataset.person.columns
     assert "pip_dl_reported" not in dataset.person.columns
     assert "pip_m_reported" not in dataset.person.columns
     assert (

From 3621fcbc6613d79e18f11a09ecccb2d2e4ec57de Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 2 May 2026 08:59:59 -0400
Subject: [PATCH 5/5] Fix public transfer upload version

---
 .../storage/upload_public_transfer_dataset.py             | 8 +++++++-
 .../tests/test_enhanced_cps_artifact_manifest.py          | 6 ++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/policyengine_uk_data/storage/upload_public_transfer_dataset.py b/policyengine_uk_data/storage/upload_public_transfer_dataset.py
index ab5f507da..89e3f7ad5 100644
--- a/policyengine_uk_data/storage/upload_public_transfer_dataset.py
+++ b/policyengine_uk_data/storage/upload_public_transfer_dataset.py
@@ -1,5 +1,7 @@
 """Upload the explicitly public UK calibrated transfer dataset artifacts."""
 
+from importlib import metadata
+
 from policyengine_uk_data.datasets import (
     ENHANCED_CPS_FILE,
     ENHANCED_CPS_MANIFEST_FILE,
@@ -9,7 +11,7 @@
 from policyengine_uk_data.utils.hf_destinations import PUBLIC_REPO
 
 
-def upload_public_transfer_dataset() -> None:
+def upload_public_transfer_dataset(version: str | None = None) -> None:
     files = [
         ENHANCED_CPS_FILE,
         ENHANCED_CPS_SOURCE_FILE,
@@ -19,8 +21,12 @@ def upload_public_transfer_dataset() -> None:
         if not file_path.exists():
             raise ValueError(f"File {file_path} does not exist.")
 
+    if version is None:
+        version = metadata.version("policyengine-uk-data")
+
     upload_files_to_hf(
         files=files,
+        version=version,
         hf_repo_name=PUBLIC_REPO,
     )
 
diff --git a/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
index a9922311c..d1d2c0622 100644
--- a/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
+++ b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py
@@ -123,12 +123,14 @@ def test_legacy_policybench_transfer_artifacts_are_explicitly_legacy():
 
 def test_public_transfer_upload_targets_public_hf_repo():
     with patch(
-        "policyengine_uk_data.storage.upload_public_transfer_dataset.upload_files_to_hf"
+        "policyengine_uk_data.storage.upload_public_transfer_dataset.upload_files_to_hf",
+        autospec=True,
     ) as upload_files_to_hf:
-        upload_public_transfer_dataset()
+        upload_public_transfer_dataset(version="1.55.3")
 
     upload_files_to_hf.assert_called_once()
     kwargs = upload_files_to_hf.call_args.kwargs
+    assert kwargs["version"] == "1.55.3"
     assert kwargs["hf_repo_name"] == PUBLIC_REPO
     assert kwargs["files"] == [
         ENHANCED_CPS_FILE,